- 新增 TabMRankModel、TabMRankTask 及配套损失函数与配置 - 将 DataQualityAnalyzer 从 experiment 迁移至 training 模块 - 调整数据处理器移除过度的 NaN/null 硬填充逻辑 - 优化 RankTask 评估指标使用分位数标签替代原始收益率 - 更新实验脚本处理器顺序与模型超参数配置
213 lines
6.5 KiB
Python
213 lines
6.5 KiB
Python
# %% md
|
||
# # LightGBM 回归训练流程(模块化版本)
|
||
#
|
||
# 使用新的模块化 Trainer 架构,代码更简洁、可维护性更高。
|
||
# %% md
|
||
# ## 1. 导入依赖
|
||
# %%
|
||
import os
|
||
|
||
from src.factors import FactorEngine
|
||
from src.training import (
|
||
FactorManager,
|
||
DataPipeline,
|
||
RegressionTask,
|
||
NullFiller,
|
||
Winsorizer,
|
||
StandardScaler,
|
||
CrossSectionalStandardScaler,
|
||
)
|
||
from src.training.core.trainer_v2 import Trainer
|
||
from src.training.components.filters import STFilter
|
||
from src.experiment.common import (
|
||
SELECTED_FACTORS,
|
||
FACTOR_DEFINITIONS,
|
||
LABEL_NAME,
|
||
LABEL_FACTOR,
|
||
TRAIN_START,
|
||
TRAIN_END,
|
||
VAL_START,
|
||
VAL_END,
|
||
TEST_START,
|
||
TEST_END,
|
||
stock_pool_filter,
|
||
STOCK_FILTER_REQUIRED_COLUMNS,
|
||
OUTPUT_DIR,
|
||
SAVE_PREDICTIONS,
|
||
SAVE_MODEL,
|
||
get_model_save_path,
|
||
save_model_with_factors,
|
||
TOP_N,
|
||
TRAIN_SKIP_DAYS,
|
||
)
|
||
|
||
# Training-type identifier; used by get_model_save_path() to build the
# model output path for this (regression) run.
TRAINING_TYPE = "regression"
|
||
|
||
# %% md
|
||
# ## 2. 训练特定配置
|
||
# %%
|
||
# Label 配置(从 common.py 统一导入)
|
||
# LABEL_NAME 和 LABEL_FACTOR 已在 common.py 中绑定,只需从 common 导入
|
||
|
||
# Factors to exclude from training on top of SELECTED_FACTORS.
# All entries are currently commented out (nothing is excluded);
# uncomment a name to drop that factor from the feature set.
EXCLUDED_FACTORS = [
    # 'GTJA_alpha016',
    # 'volatility_20',
    # 'current_ratio',
    # 'GTJA_alpha001',
    # 'GTJA_alpha141',
    # 'GTJA_alpha129',
    # 'GTJA_alpha164',
    # 'amivest_liq_20',
    # 'GTJA_alpha012',
    # 'debt_to_equity',
    # 'turnover_deviation',
    # 'GTJA_alpha073',
    # 'GTJA_alpha043',
    # 'GTJA_alpha032',
    # 'GTJA_alpha028',
    # 'GTJA_alpha090',
    # 'GTJA_alpha108',
    # 'GTJA_alpha105',
    # 'GTJA_alpha091',
    # 'GTJA_alpha119',
    # 'GTJA_alpha104',
    # 'GTJA_alpha163',
    # 'GTJA_alpha157',
    # 'cost_skewness',
    # 'GTJA_alpha176',
    # 'chip_transition',
    # 'amount_skewness_20',
    # 'GTJA_alpha148',
    # 'mean_median_dev',
    # 'downside_illiq_20',
]
|
||
|
||
# LightGBM hyper-parameters for the regression model.
MODEL_PARAMS = {
    # ==================== Basic settings ====================
    "objective": "huber",  # Huber loss: robust to outliers like pure L1 (MAE) yet smooth near zero error — better suited to return prediction
    "metric": "mae",
    # ==================== Tree-structure constraints ====================
    "max_depth": 5,  # Slightly deeper trees, allowing some higher-order feature interactions
    "num_leaves": 31,  # Capped at 31 (2^5 - 1) so trees grow asymmetrically, limiting overfitting
    "min_data_in_leaf": 512,  # Large minimum leaf size (raised from 256; the old comment claiming 1000 was stale). With ~970k training rows this strongly dampens market noise
    # ==================== Learning parameters ====================
    "learning_rate": 0.01,  # Slightly larger step to help escape early local optima (avoids early-stopping after a dozen rounds)
    "n_estimators": 2000,
    # ==================== Random sampling / dimensionality reduction ====================
    "subsample": 0.85,
    "subsample_freq": 1,
    "colsample_bytree": 0.4,  # Aggressively lowered from 0.8 to curb GTJA_alpha127 dominance and force the model to learn from other factors
    "extra_trees": True,  # Extremely-randomized split selection — strong anti-overfitting measure for noisy quant data
    # ==================== Regularization ====================
    "reg_alpha": 1.0,  # Stronger L1 regularization to push useless feature weights to zero
    "reg_lambda": 5.0,  # Stronger L2 regularization (1 -> 5) to penalize large leaf-output weights
    "max_bin": 127,  # Down from the default 255: coarser feature binning acts as additional regularization
    # ==================== Misc ====================
    "verbose": -1,
    "random_state": 42,
    "n_jobs": -1,
}
|
||
|
||
# (start, end) date ranges for each dataset split, passed to Trainer.run().
date_range = {
    "train": (TRAIN_START, TRAIN_END),
    "val": (VAL_START, VAL_END),
    "test": (TEST_START, TEST_END),
}
|
||
|
||
# Output configuration consumed by the Trainer (prediction CSV, model
# persistence, and top-N selection size).
output_config = {
    "output_dir": OUTPUT_DIR,
    "output_filename": "regression_output.csv",
    "save_predictions": SAVE_PREDICTIONS,
    "save_model": SAVE_MODEL,
    "model_save_path": get_model_save_path(TRAINING_TYPE),  # path derived from TRAINING_TYPE
    "top_n": TOP_N,
}
|
||
|
||
|
||
def main():
    """Run the modular LightGBM regression training workflow.

    Builds the FactorEngine and FactorManager, assembles the
    DataPipeline (feature/label preprocessing, ST filtering, stock-pool
    selection), trains a RegressionTask through Trainer, and optionally
    persists the fitted model together with its factor metadata.

    Returns:
        The results object returned by ``Trainer.run``.
    """
    print("\n" + "=" * 80)
    print("LightGBM 回归模型训练(模块化版本)")
    print("=" * 80)

    # 1. Create the FactorEngine (provides raw data routing for factors).
    print("\n[1] 创建 FactorEngine")
    engine = FactorEngine()

    # 2. Create the FactorManager: resolves the selected/excluded factor
    #    sets and binds the label factor.
    print("\n[2] 创建 FactorManager")
    factor_manager = FactorManager(
        selected_factors=SELECTED_FACTORS,
        factor_definitions=FACTOR_DEFINITIONS,
        label_factor=LABEL_FACTOR,
        excluded_factors=EXCLUDED_FACTORS,
    )

    # 3. Create the DataPipeline: feature preprocessing, label
    #    preprocessing, ST filtering and stock-pool selection.
    print("\n[3] 创建 DataPipeline")
    pipeline = DataPipeline(
        factor_manager=factor_manager,
        processor_configs=[
            (Winsorizer, {"lower": 0.01, "upper": 0.99}),
            (NullFiller, {"strategy": "mean"}),
            (StandardScaler, {}),
            # (CrossSectionalStandardScaler, {}),
        ],
        label_processor_configs=[
            # Winsorize the label to clip extreme returns before scaling.
            (Winsorizer, {"lower": 0.05, "upper": 0.95}),
            (StandardScaler, {}),
        ],
        filters=[STFilter(data_router=engine.router)],
        stock_pool_filter_func=stock_pool_filter,
        stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
        train_skip_days=TRAIN_SKIP_DAYS,
    )

    # 4. Create the regression task wrapping the LightGBM model.
    print("\n[4] 创建 RegressionTask")
    task = RegressionTask(
        model_params=MODEL_PARAMS,
        label_name=LABEL_NAME,
    )

    # 5. Create the Trainer that orchestrates pipeline + task + outputs.
    print("\n[5] 创建 Trainer")
    trainer = Trainer(
        data_pipeline=pipeline,
        task=task,
        output_config=output_config,
        verbose=True,
    )

    # 6. Execute training/validation/test over the configured date ranges.
    print("\n[6] 执行训练")
    results = trainer.run(engine=engine, date_range=date_range)

    # 7. Optionally persist the model plus factor/processor metadata.
    if SAVE_MODEL:
        print("\n[7] 保存模型和因子信息")
        save_model_with_factors(
            model=task.get_model(),
            model_path=output_config["model_save_path"],
            selected_factors=SELECTED_FACTORS,
            factor_definitions=FACTOR_DEFINITIONS,
            fitted_processors=pipeline.get_fitted_processors(),
        )

    print("\n" + "=" * 80)
    print("训练流程完成!")
    # Read the path from output_config instead of re-hardcoding the
    # filename, so the reported location cannot drift from the file the
    # Trainer actually writes.
    result_path = os.path.join(
        output_config["output_dir"], output_config["output_filename"]
    )
    print(f"结果保存路径: {result_path}")
    print("=" * 80)

    return results


if __name__ == "__main__":
    main()
|