# ProStock/src/experiment/regression.py

# %% md
# # LightGBM 回归训练流程(模块化版本)
#
# 使用新的模块化 Trainer 架构,代码更简洁、可维护性更高。
# %% md
# ## 1. 导入依赖
# %%
import os
from src.factors import FactorEngine
from src.training import (
FactorManager,
DataPipeline,
RegressionTask,
NullFiller,
Winsorizer,
StandardScaler,
CrossSectionalStandardScaler,
)
from src.training.core.trainer_v2 import Trainer
from src.training.components.filters import STFilter
from src.experiment.common import (
SELECTED_FACTORS,
FACTOR_DEFINITIONS,
LABEL_NAME,
LABEL_FACTOR,
TRAIN_START,
TRAIN_END,
VAL_START,
VAL_END,
TEST_START,
TEST_END,
stock_pool_filter,
STOCK_FILTER_REQUIRED_COLUMNS,
OUTPUT_DIR,
SAVE_PREDICTIONS,
SAVE_MODEL,
get_model_save_path,
save_model_with_factors,
TOP_N,
)
# Training-type identifier; used to derive the model save path below.
TRAINING_TYPE = "regression"
# %% md
# ## 2. Training-specific configuration
# %%
# Label configuration (imported centrally from common.py)
# LABEL_NAME and LABEL_FACTOR are already bound in common.py; only import them from common.
# Factors excluded from the feature set (removed by FactorManager in main()).
# NOTE(review): this list appears hand-curated — presumably factors dropped for
# weak or redundant signal; confirm against the factor-selection experiments.
EXCLUDED_FACTORS = [
    "GTJA_alpha062",
    "GTJA_alpha060",
    "GTJA_alpha058",
    "GTJA_alpha056",
    "GTJA_alpha053",
    "GTJA_alpha040",
    "GTJA_alpha043",
    "GTJA_alpha027",
    "CP",
    "max_ret_20",
    "debt_to_equity",
    "close_vwap_deviation",
    "EP",
    "BP",
    "EP_rank",
    "GTJA_alpha044",
    "GTJA_alpha036",
    "GTJA_alpha010",
    "GTJA_alpha005",
    "GTJA_alpha001",
    "GTJA_alpha002",
    "GTJA_alpha007",
    "GTJA_alpha016",
    "GTJA_alpha073",
    "GTJA_alpha133",
    "GTJA_alpha131",
    "GTJA_alpha117",
    "GTJA_alpha124",
    "GTJA_alpha120",
    "GTJA_alpha119",
    "GTJA_alpha103",
    "GTJA_alpha099",
    "GTJA_alpha105",
    "GTJA_alpha104",
    "GTJA_alpha090",
    "GTJA_alpha085",
    "GTJA_alpha083",
    "GTJA_alpha084",
    "GTJA_alpha087",
    "GTJA_alpha092",
    "GTJA_alpha074",
    "GTJA_alpha089",
    "GTJA_alpha173",
    "GTJA_alpha157",
    "GTJA_alpha139",
    "GTJA_alpha162",
    "GTJA_alpha163",
    "GTJA_alpha177",
    "price_to_avg_cost",
    "cost_skewness",
    "GTJA_alpha191",
    "GTJA_alpha180",
    "history_position",
    "bottom_profit",
    "smart_money_accumulation",
]
# LightGBM model parameters for the regression task.
MODEL_PARAMS = {
    # ==================== Basic settings ====================
    "objective": "huber",  # Huber loss: robust to outliers (vs pure L1/MAE) yet smooth near zero error — better suited to return prediction
    "metric": "mae",
    # ==================== Tree-structure constraints ====================
    "max_depth": 5,  # moderately deep, allowing some higher-order factor interactions
    "num_leaves": 31,  # 31 = 2^5 - 1: caps leaf count so tree growth stays constrained, limiting overfitting
    "min_data_in_leaf": 512,  # large minimum leaf size (training set ~970k rows) to resist market noise
    # ==================== Learning parameters ====================
    "learning_rate": 0.02,  # slightly larger step to escape early local optima (avoids early-stopping within the first dozen rounds)
    "n_estimators": 2000,
    # ==================== Sampling / feature subsetting ====================
    "subsample": 0.85,
    "subsample_freq": 1,
    "colsample_bytree": 0.4,  # aggressive column sampling (0.8 -> 0.4) to stop one dominant factor (GTJA_alpha127) from monopolizing splits
    "extra_trees": True,  # extremely-randomized mode: adds randomness at split-point selection — a strong anti-overfitting lever
    # ==================== Regularization ====================
    "reg_alpha": 1.0,  # L1: pushes weights of useless features toward zero
    "reg_lambda": 5.0,  # L2 raised from 1 to 5: penalizes overly large leaf outputs
    "max_bin": 127,  # down from the default 255: coarser feature binning acts as extra regularization
    # ==================== Misc ====================
    "verbose": -1,
    "random_state": 42,
    "n_jobs": -1,
}
# Split boundaries for train / validation / test, taken from the shared
# experiment configuration in common.py.
date_range = dict(
    train=(TRAIN_START, TRAIN_END),
    val=(VAL_START, VAL_END),
    test=(TEST_START, TEST_END),
)
# Output / persistence settings consumed by the Trainer.
output_config = dict(
    output_dir=OUTPUT_DIR,
    output_filename="regression_output.csv",
    save_predictions=SAVE_PREDICTIONS,
    save_model=SAVE_MODEL,
    model_save_path=get_model_save_path(TRAINING_TYPE),
    top_n=TOP_N,
)
def main():
    """Run the end-to-end LightGBM regression training workflow.

    Wires together the factor engine, factor manager, preprocessing
    pipeline, regression task and trainer; runs training over the
    configured date ranges; and, when ``SAVE_MODEL`` is enabled,
    persists the fitted model together with its factor metadata and
    fitted processors.

    Returns:
        The results object produced by ``Trainer.run`` (see ``Trainer``
        for its exact structure).
    """
    print("\n" + "=" * 80)
    print("LightGBM 回归模型训练(模块化版本)")
    print("=" * 80)
    # 1. Data-access / factor-computation backend.
    print("\n[1] 创建 FactorEngine")
    engine = FactorEngine()
    # 2. FactorManager: resolves the selected factor set minus the
    #    exclusion list, and binds the label factor.
    print("\n[2] 创建 FactorManager")
    factor_manager = FactorManager(
        selected_factors=SELECTED_FACTORS,
        factor_definitions=FACTOR_DEFINITIONS,
        label_factor=LABEL_FACTOR,
        excluded_factors=EXCLUDED_FACTORS,
    )
    # 3. Preprocessing pipeline: mean null-fill -> 1%/99% winsorize ->
    #    standardize, with ST-stock filtering and the shared stock-pool filter.
    print("\n[3] 创建 DataPipeline")
    pipeline = DataPipeline(
        factor_manager=factor_manager,
        processor_configs=[
            (NullFiller, {"strategy": "mean"}),
            (Winsorizer, {"lower": 0.01, "upper": 0.99}),
            (StandardScaler, {}),
            # (CrossSectionalStandardScaler, {}),
        ],
        filters=[STFilter(data_router=engine.router)],
        stock_pool_filter_func=stock_pool_filter,
        stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
    )
    # 4. Regression task wrapping the LightGBM parameters and label.
    print("\n[4] 创建 RegressionTask")
    task = RegressionTask(
        model_params=MODEL_PARAMS,
        label_name=LABEL_NAME,
    )
    # 5. Trainer orchestrating pipeline + task + output handling.
    print("\n[5] 创建 Trainer")
    trainer = Trainer(
        data_pipeline=pipeline,
        task=task,
        output_config=output_config,
        verbose=True,
    )
    # 6. Train / validate / test over the configured date ranges.
    print("\n[6] 执行训练")
    results = trainer.run(engine=engine, date_range=date_range)
    # 7. Optionally persist the model plus factor and processor metadata.
    if SAVE_MODEL:
        print("\n[7] 保存模型和因子信息")
        save_model_with_factors(
            model=task.get_model(),
            model_path=output_config["model_save_path"],
            selected_factors=SELECTED_FACTORS,
            factor_definitions=FACTOR_DEFINITIONS,
            fitted_processors=pipeline.get_fitted_processors(),
        )
    print("\n" + "=" * 80)
    print("训练流程完成!")
    # Fix: derive the reported path from output_config instead of a
    # hard-coded duplicate of the filename, so this message cannot drift
    # out of sync with the actual output location.
    result_path = os.path.join(output_config["output_dir"], output_config["output_filename"])
    print(f"结果保存路径: {result_path}")
    print("=" * 80)
    return results
# Script entry point: run the training workflow only when executed directly.
if __name__ == "__main__":
    main()