Files
ProStock/src/experiment/learn_to_rank.py
liaozhaorun 1fa4ff9544 feat(training): TabM 排序模型架构优化与 Rank-Gauss 标签工程
- TabMSetRank: 将 TabM 输出改为隐藏层特征,经 SetRankHead 交互后通过 final_mlp 输出 Ensemble 排序分
- SetRankHead 引入可学习残差缩放因子(Zero-init)与 Pre-Norm 结构,提升训练稳定性
- TabMRankTask 新增 Rank-Gauss 连续标签变换,支持标准分位数/指数增益/Rank-Gauss 三种标签模式
- 修复 NDCG 评估在负值标签下的计算问题
- 调整实验脚本超参数(dropout、hidden dim、weight decay)及排除因子列表
- 迁移废弃的 torch.cuda.amp 到 torch.amp,并将数据预加载至 GPU 减少循环拷贝
2026-04-05 19:01:08 +08:00

196 lines
4.8 KiB
Python

# %% md
# # LightGBM LambdaRank 排序学习训练流程(模块化版本)
#
# 使用新的模块化 Trainer 架构,代码更简洁、可维护性更高。
# %% md
# ## 1. 导入依赖
# %%
import os
from src.factors import FactorEngine
from src.training import (
FactorManager,
DataPipeline,
RankTask,
NullFiller,
Winsorizer,
CrossSectionalStandardScaler,
)
from src.training.core.trainer_v2 import Trainer
from src.training.components.filters import STFilter
from src.experiment.common import (
SELECTED_FACTORS,
FACTOR_DEFINITIONS,
LABEL_NAME,
LABEL_FACTOR,
TRAIN_START,
TRAIN_END,
VAL_START,
VAL_END,
TEST_START,
TEST_END,
stock_pool_filter,
STOCK_FILTER_REQUIRED_COLUMNS,
OUTPUT_DIR,
SAVE_PREDICTIONS,
SAVE_MODEL,
get_model_save_path,
save_model_with_factors,
TOP_N,
TRAIN_SKIP_DAYS,
)
# Training-type identifier; also used to derive the model save path.
TRAINING_TYPE = "rank"
# %% md
# ## 2. Training-specific configuration
# %%
# Label configuration: LABEL_NAME and LABEL_FACTOR are bound together in
# common.py, so this script only imports them.

# Number of quantile buckets; passed to RankTask and used to size label_gain.
N_QUANTILES = 20

# Factors dropped from the selected set (handed to FactorManager as
# ``excluded_factors``).
EXCLUDED_FACTORS = [
    "amivest_liq_20",
    "atr_price_impact",
    "hui_heubel_ratio",
    "corwin_schultz_spread_20",
    "roll_spread_20",
    "gibbs_effective_spread",
    "overnight_illiq_20",
    "illiq_volatility_20",
    "amount_cv_20",
    "amount_skewness_20",
    "low_vol_days_20",
    "liquidity_shock_momentum",
    "downside_illiq_20",
    "upside_illiq_20",
    "illiq_asymmetry_20",
    "pastor_stambaugh_proxy",
]

# LambdaRank model hyper-parameters (forwarded to RankTask as ``model_params``).
MODEL_PARAMS = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_at=25,
    learning_rate=0.1,
    n_estimators=1000,
    early_stopping_round=50,
    # Core constraints against overfitting.
    # NOTE(review): a depth-4 tree has at most 2 ** 4 = 16 leaves, so
    # num_leaves=32 presumably never binds — confirm this is intended.
    max_depth=4,
    num_leaves=32,
    min_data_in_leaf=256,
    # Random row/column sampling (adds robustness).
    subsample=0.4,
    subsample_freq=1,
    colsample_bytree=0.4,
    # Regularisation penalties.
    reg_alpha=10.0,
    reg_lambda=50.0,
    # LambdaRank-specific settings.
    lambdarank_truncation_level=50,
    # One gain per label value: 1, 4, 9, ..., N_QUANTILES ** 2.
    # NOTE(review): assumes labels are integers in 0..N_QUANTILES-1 — confirm
    # against RankTask.
    label_gain=[rank * rank for rank in range(1, N_QUANTILES + 1)],
    verbose=-1,
    random_state=42,
)
# Date windows per data split, each a (start, end) pair imported from
# common.py; passed to Trainer.run as ``date_range``.
date_range = dict(
    train=(TRAIN_START, TRAIN_END),
    val=(VAL_START, VAL_END),
    test=(TEST_START, TEST_END),
)

# Output settings handed to Trainer as ``output_config``.
output_config = dict(
    output_dir=OUTPUT_DIR,
    output_filename="rank_output.csv",
    save_predictions=SAVE_PREDICTIONS,
    save_model=SAVE_MODEL,
    # Save path is derived from the training type ("rank").
    model_save_path=get_model_save_path(TRAINING_TYPE),
    top_n=TOP_N,
)
def main():
    """Run the LightGBM LambdaRank training workflow and return its results.

    The steps are: create a FactorEngine, a FactorManager, a DataPipeline
    (Winsorizer -> NullFiller -> CrossSectionalStandardScaler, plus an
    STFilter and the stock-pool filter), a RankTask and a Trainer; run the
    trainer over ``date_range``; then, when ``SAVE_MODEL`` is truthy, persist
    the model together with the factor metadata and fitted processors.

    Returns:
        The value returned by ``trainer.run(engine=..., date_range=...)``.
    """
    print("\n" + "=" * 80)
    print("LightGBM LambdaRank 排序学习训练(模块化版本)")
    print("=" * 80)

    # 1. Create the FactorEngine.
    print("\n[1] 创建 FactorEngine")
    engine = FactorEngine()

    # 2. Create the FactorManager.
    print("\n[2] 创建 FactorManager")
    factor_manager = FactorManager(
        selected_factors=SELECTED_FACTORS,
        factor_definitions=FACTOR_DEFINITIONS,
        label_factor=LABEL_FACTOR,
        excluded_factors=EXCLUDED_FACTORS,
    )

    # 3. Create the DataPipeline; processors are listed in the order given.
    print("\n[3] 创建 DataPipeline")
    pipeline = DataPipeline(
        factor_manager=factor_manager,
        processor_configs=[
            (Winsorizer, {"lower": 0.01, "upper": 0.99}),
            (NullFiller, {"strategy": "mean"}),
            (CrossSectionalStandardScaler, {}),
        ],
        filters=[STFilter(data_router=engine.router)],
        stock_pool_filter_func=stock_pool_filter,
        stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
        train_skip_days=TRAIN_SKIP_DAYS,
    )

    # 4. Create the RankTask.
    print("\n[4] 创建 RankTask")
    task = RankTask(
        model_params=MODEL_PARAMS,
        label_name=LABEL_NAME,
        n_quantiles=N_QUANTILES,
    )

    # 5. Create the Trainer.
    print("\n[5] 创建 Trainer")
    trainer = Trainer(
        data_pipeline=pipeline,
        task=task,
        output_config=output_config,
        verbose=True,
    )

    # 6. Run training.
    print("\n[6] 执行训练")
    results = trainer.run(engine=engine, date_range=date_range)

    # 7. Save the model and factor information (if enabled).
    if SAVE_MODEL:
        print("\n[7] 保存模型和因子信息")
        # NOTE(review): SELECTED_FACTORS is passed here unfiltered, while the
        # FactorManager above also received EXCLUDED_FACTORS. If
        # save_model_with_factors expects the exact feature list used for
        # training, confirm that the excluded factors are handled downstream.
        save_model_with_factors(
            model=task.get_model(),
            model_path=output_config["model_save_path"],
            selected_factors=SELECTED_FACTORS,
            factor_definitions=FACTOR_DEFINITIONS,
            fitted_processors=pipeline.get_fitted_processors(),
        )

    # Build the reported path from output_config so the message cannot drift
    # from the directory/filename actually handed to the Trainer.
    # NOTE(review): the path is printed regardless of SAVE_PREDICTIONS —
    # confirm the file is always written.
    result_path = os.path.join(
        output_config["output_dir"], output_config["output_filename"]
    )
    print("\n" + "=" * 80)
    print("训练流程完成!")
    print(f"结果保存路径: {result_path}")
    print("=" * 80)
    return results


if __name__ == "__main__":
    main()