# %% md
# # LightGBM regression training workflow (modular version)
#
# Uses the new modular Trainer architecture, which keeps this script shorter
# and easier to maintain.
# %% md
# ## 1. Imports
# %%
import os

from src.factors import FactorEngine
from src.training import (
    FactorManager,
    DataPipeline,
    RegressionTask,
    NullFiller,
    Winsorizer,
    StandardScaler,
    CrossSectionalStandardScaler,
)
from src.training.core.trainer_v2 import Trainer
from src.training.components.filters import STFilter
from src.experiment.common import (
    SELECTED_FACTORS,
    FACTOR_DEFINITIONS,
    LABEL_NAME,
    LABEL_FACTOR,
    TRAIN_START,
    TRAIN_END,
    VAL_START,
    VAL_END,
    TEST_START,
    TEST_END,
    stock_pool_filter,
    STOCK_FILTER_REQUIRED_COLUMNS,
    OUTPUT_DIR,
    SAVE_PREDICTIONS,
    SAVE_MODEL,
    get_model_save_path,
    save_model_with_factors,
    TOP_N,
)

# Identifier for this training flavour; passed to get_model_save_path() below.
TRAINING_TYPE = "regression"

# %% md
# ## 2. Training-specific configuration
# %%
# Label configuration is imported from common.py: LABEL_NAME and LABEL_FACTOR
# are defined there and only need to be imported here.

# Factors handed to FactorManager as `excluded_factors`.
EXCLUDED_FACTORS = [
    "GTJA_alpha062",
    "GTJA_alpha060",
    "GTJA_alpha058",
    "GTJA_alpha056",
    "GTJA_alpha053",
    "GTJA_alpha040",
    "GTJA_alpha043",
    "GTJA_alpha027",
    "CP",
    "max_ret_20",
    "debt_to_equity",
    "close_vwap_deviation",
    "EP",
    "BP",
    "EP_rank",
    "GTJA_alpha044",
    "GTJA_alpha036",
    "GTJA_alpha010",
    "GTJA_alpha005",
    "GTJA_alpha001",
    "GTJA_alpha002",
    "GTJA_alpha007",
    "GTJA_alpha016",
    "GTJA_alpha073",
    "GTJA_alpha133",
    "GTJA_alpha131",
    "GTJA_alpha117",
    "GTJA_alpha124",
    "GTJA_alpha120",
    "GTJA_alpha119",
    "GTJA_alpha103",
    "GTJA_alpha099",
    "GTJA_alpha105",
    "GTJA_alpha104",
    "GTJA_alpha090",
    "GTJA_alpha085",
    "GTJA_alpha083",
    "GTJA_alpha084",
    "GTJA_alpha087",
    "GTJA_alpha092",
    "GTJA_alpha074",
    "GTJA_alpha089",
    "GTJA_alpha173",
    "GTJA_alpha157",
    "GTJA_alpha139",
    "GTJA_alpha162",
    "GTJA_alpha163",
    "GTJA_alpha177",
    "price_to_avg_cost",
    "cost_skewness",
    "GTJA_alpha191",
    "GTJA_alpha180",
    "history_position",
    "bottom_profit",
    "smart_money_accumulation",
]

# Model hyper-parameters passed to RegressionTask. The per-key remarks are the
# original author's tuning notes, translated; they record intent, not measured
# results.
MODEL_PARAMS = {
    # ==================== Basic settings ====================
    # [changed] Compared with pure L1 (MAE), huber stays robust to outliers
    # while being smooth for very small errors, which the author considers a
    # better fit for return prediction.
    "objective": "huber",
    "metric": "mae",
    # ==================== Tree structure constraints ====================
    # [changed] Slightly deeper, to allow some higher-order interactions.
    "max_depth": 5,
    # [changed] Capped at 31 (2**5 - 1) as an anti-overfitting measure.
    "num_leaves": 31,
    # [raised] NOTE(review): the original note said "raised from 256 to 1000",
    # but the configured value is 512 - confirm which one is intended. Stated
    # rationale: with a training set of roughly 970k rows, a large per-leaf
    # minimum helps resist market noise.
    "min_data_in_leaf": 512,
    # ==================== Learning parameters ====================
    # [changed] Slightly larger, to help the model leave the initial local
    # optimum instead of early-stopping after a dozen or so rounds.
    "learning_rate": 0.02,
    "n_estimators": 2000,
    # ==================== Row / column sampling ====================
    "subsample": 0.85,
    "subsample_freq": 1,
    # [sharply lowered] From 0.8 to 0.4, to curb the dominance of
    # GTJA_alpha127 and push the model to use the other factors.
    "colsample_bytree": 0.4,
    # [new] Extremely-randomized-trees mode: adds randomness when choosing
    # split points, intended as protection against overfitting.
    "extra_trees": True,
    # ==================== Regularisation ====================
    # [changed] Stronger L1 regularisation.
    "reg_alpha": 1.0,
    # [changed] Much stronger L2 regularisation (from 1 to 5) to penalise
    # overly large leaf outputs.
    "reg_lambda": 5.0,
    # [new] Library default is 255; 127 bins continuous features more
    # coarsely, which the author also uses as regularisation.
    "max_bin": 127,
    # ==================== Miscellaneous ====================
    "verbose": -1,
    "random_state": 42,
    "n_jobs": -1,
}

# (start, end) boundaries for each data split, taken from common.py.
date_range = {
    "train": (TRAIN_START, TRAIN_END),
    "val": (VAL_START, VAL_END),
    "test": (TEST_START, TEST_END),
}

# Output settings handed to Trainer and reused by main() for model saving and
# for the final status message.
output_config = {
    "output_dir": OUTPUT_DIR,
    "output_filename": "regression_output.csv",
    "save_predictions": SAVE_PREDICTIONS,
    "save_model": SAVE_MODEL,
    "model_save_path": get_model_save_path(TRAINING_TYPE),
    "top_n": TOP_N,
}


def main():
    """Run the LightGBM regression training workflow end to end.

    Builds a FactorEngine, FactorManager, DataPipeline, RegressionTask and
    Trainer from the module-level configuration, runs the trainer over
    ``date_range`` and, when ``output_config["save_model"]`` is truthy, stores
    the model together with its factor information and fitted processors.

    Returns:
        Whatever ``Trainer.run`` returns for this configuration.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("LightGBM 回归模型训练(模块化版本)")
    print(banner)

    # 1. Create the FactorEngine.
    print("\n[1] 创建 FactorEngine")
    engine = FactorEngine()

    # 2. Create the FactorManager.
    print("\n[2] 创建 FactorManager")
    factor_manager = FactorManager(
        selected_factors=SELECTED_FACTORS,
        factor_definitions=FACTOR_DEFINITIONS,
        label_factor=LABEL_FACTOR,
        excluded_factors=EXCLUDED_FACTORS,
    )

    # 3. Create the DataPipeline.
    print("\n[3] 创建 DataPipeline")
    pipeline = DataPipeline(
        factor_manager=factor_manager,
        processor_configs=[
            (NullFiller, {"strategy": "mean"}),
            (Winsorizer, {"lower": 0.01, "upper": 0.99}),
            (StandardScaler, {}),
            # Currently disabled processor:
            # (CrossSectionalStandardScaler, {}),
        ],
        filters=[STFilter(data_router=engine.router)],
        stock_pool_filter_func=stock_pool_filter,
        stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
    )

    # 4. Create the RegressionTask.
    print("\n[4] 创建 RegressionTask")
    task = RegressionTask(
        model_params=MODEL_PARAMS,
        label_name=LABEL_NAME,
    )

    # 5. Create the Trainer.
    print("\n[5] 创建 Trainer")
    trainer = Trainer(
        data_pipeline=pipeline,
        task=task,
        output_config=output_config,
        verbose=True,
    )

    # 6. Run training.
    print("\n[6] 执行训练")
    results = trainer.run(engine=engine, date_range=date_range)

    # 7. Save the model and factor information (if enabled). The flag and the
    #    path are both read from output_config so they cannot disagree.
    if output_config["save_model"]:
        print("\n[7] 保存模型和因子信息")
        save_model_with_factors(
            model=task.get_model(),
            model_path=output_config["model_save_path"],
            selected_factors=SELECTED_FACTORS,
            factor_definitions=FACTOR_DEFINITIONS,
            fitted_processors=pipeline.get_fitted_processors(),
        )

    # Report the location from the same config the Trainer received instead of
    # repeating the directory and filename literals here.
    result_path = os.path.join(
        output_config["output_dir"], output_config["output_filename"]
    )
    print("\n" + banner)
    print("训练流程完成!")
    print(f"结果保存路径: {result_path}")
    print(banner)

    return results


if __name__ == "__main__":
    main()