feat(factors): 新增筹码集中度相关因子并优化训练框架

- 添加 19 个筹码分布和胜率相关因子(包括chip_dispersion、winner_rate等系列)
- LightGBM模型添加早停和训练指标记录功能
- 统一Label配置到common.py模块
- 新增list_factors.py因子列表脚本
This commit is contained in:
2026-03-29 01:34:58 +08:00
parent d4e0e2a0b6
commit c3d1b157e9
9 changed files with 373 additions and 246 deletions

View File

@@ -15,13 +15,15 @@ from src.training import (
NullFiller,
Winsorizer,
StandardScaler,
CrossSectionalStandardScaler,
)
from src.training.core.trainer_v2 import Trainer
from src.training.components.filters import STFilter
from src.experiment.common import (
SELECTED_FACTORS,
FACTOR_DEFINITIONS,
get_label_factor,
LABEL_NAME,
LABEL_FACTOR,
TRAIN_START,
TRAIN_END,
VAL_START,
@@ -44,58 +46,93 @@ TRAINING_TYPE = "regression"
# %% md
# ## 2. 训练特定配置
# %%
# Label configuration
LABEL_NAME = "future_return_5"
LABEL_FACTOR = get_label_factor(LABEL_NAME)
# Label configuration (imported centrally from common.py)
# LABEL_NAME and LABEL_FACTOR are already bound in common.py; they only need to be imported from common.
# NOTE(review): the two assignments above rebind the names that are also imported
# from src.experiment.common — confirm whether they are leftovers from before the
# label configuration was moved to common.py.
# Factor names to exclude. How this list is consumed is outside this view.
# Each name appears exactly once, in first-occurrence order; the earlier listing
# repeated twelve entries, which added nothing to membership checks.
EXCLUDED_FACTORS = [
    "GTJA_alpha062",
    "GTJA_alpha060",
    "GTJA_alpha058",
    "GTJA_alpha056",
    "GTJA_alpha053",
    "GTJA_alpha040",
    "GTJA_alpha043",
    "GTJA_alpha027",
    "CP",
    "max_ret_20",
    "debt_to_equity",
    "close_vwap_deviation",
    "EP",
    "BP",
    "EP_rank",
    "GTJA_alpha044",
    "GTJA_alpha036",
    "GTJA_alpha010",
    "GTJA_alpha005",
    "GTJA_alpha001",
    "GTJA_alpha002",
    "GTJA_alpha007",
    "GTJA_alpha016",
    "GTJA_alpha073",
    "GTJA_alpha104",
    "GTJA_alpha103",
    "GTJA_alpha105",
    "GTJA_alpha092",
    "GTJA_alpha087",
    "GTJA_alpha085",
    "GTJA_alpha124",
    "GTJA_alpha133",
    "GTJA_alpha131",
    "GTJA_alpha117",
    "GTJA_alpha120",
    "GTJA_alpha119",
    "GTJA_alpha099",
    "GTJA_alpha090",
    "GTJA_alpha083",
    "GTJA_alpha084",
    "GTJA_alpha074",
    "GTJA_alpha089",
    "GTJA_alpha173",
    "GTJA_alpha157",
    "GTJA_alpha139",
    "GTJA_alpha162",
    "GTJA_alpha163",
    "GTJA_alpha177",
    "GTJA_alpha180",
    "price_to_avg_cost",
    "cost_skewness",
    "GTJA_alpha191",
    "history_position",
    "bottom_profit",
    "smart_money_accumulation",
]
# Model hyper-parameters (LightGBM-style parameter names).
# Each key appears exactly once. The earlier listing repeated several keys, and
# in a Python dict literal only the last occurrence takes effect, so the values
# below are the ones that were actually in force.
MODEL_PARAMS = {
    # ==================== Basic settings ====================
    # Author's rationale: compared with pure L1 (MAE), Huber loss stays robust
    # to outliers while being smooth for very small errors, which suits return
    # prediction better.
    "objective": "huber",
    "metric": "mae",
    # ==================== Tree structure ====================
    "max_depth": 5,
    # 31 = 2**5 - 1, i.e. one leaf fewer than a full depth-5 binary tree.
    "num_leaves": 31,
    # Large leaf-size floor to resist noise. The value in effect is 512; an
    # earlier note mentioning 1000 did not match the code.
    "min_data_in_leaf": 512,
    # ==================== Learning ====================
    # Author's rationale: slightly higher rate so training does not early-stop
    # after only a handful of rounds.
    "learning_rate": 0.02,
    "n_estimators": 2000,
    # ==================== Row / column sampling ====================
    "subsample": 0.85,
    "subsample_freq": 1,
    # Author's rationale: aggressive column subsampling so that a single
    # dominant factor cannot monopolise the splits (the original note names
    # GTJA_alpha127 — not verifiable from this view).
    "colsample_bytree": 0.4,
    # Extremely-randomized-trees mode: adds randomness to split-point selection.
    "extra_trees": True,
    # ==================== Regularization ====================
    "reg_alpha": 1.0,  # L1 penalty
    "reg_lambda": 5.0,  # L2 penalty
    # Coarser feature binning (LightGBM's documented default is 255).
    "max_bin": 127,
    # ==================== Misc ====================
    "verbose": -1,
    "random_state": 42,
    "n_jobs": -1,
}
# 日期范围配置
@@ -143,6 +180,7 @@ def main():
(NullFiller, {"strategy": "mean"}),
(Winsorizer, {"lower": 0.01, "upper": 0.99}),
(StandardScaler, {}),
# (CrossSectionalStandardScaler, {}),
],
filters=[STFilter(data_router=engine.router)],
stock_pool_filter_func=stock_pool_filter,