diff --git a/docs/plans/2026-03-23-trainer-refactor-plan.md b/docs/plans/2026-03-23-trainer-refactor-plan.md
new file mode 100644
index 0000000..cdb8906
--- /dev/null
+++ b/docs/plans/2026-03-23-trainer-refactor-plan.md
@@ -0,0 +1,2379 @@
+# Trainer 模块化重构实现计划
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+
+**Goal:** 重构 Trainer 实现模块化训练流程,完整保留所有现有功能,消除 regression.py 和 learn_to_rank.py 的代码重复
+
+**Architecture:** 采用组合模式(Composition over Inheritance),将训练流程解耦为 FactorManager(因子管理)、DataPipeline(数据流程)、Task(任务策略)、ResultAnalyzer(结果分析)四个独立组件,Trainer 作为纯调度引擎协调各组件
+
+**Tech Stack:** Python 3.10+, Polars, LightGBM, Pydantic
+
+---
+
+## 前置检查
+
+**读取参考文件以了解当前实现:**
+- @src/experiment/common.py - 当前配置和共用函数
+- @src/experiment/regression.py - 回归训练流程(640行)
+- @src/experiment/learn_to_rank.py - 排序学习流程(876行)
+- @src/training/core/trainer.py - 当前 Trainer 实现
+- @src/training/components/models/lightgbm.py - LightGBM 回归模型
+- @src/training/components/models/lightgbm_lambdarank.py - LambdaRank 排序模型
+- @src/training/components/base.py - 基础抽象类
+
+---
+
+## Task 1: 创建 docs/plans 目录并保存计划
+
+**Files:**
+- Create: `docs/plans/2026-03-23-trainer-refactor-plan.md`
+
+**Step 1: 创建目录并复制计划文件**
+
+```bash
+mkdir -p docs/plans
+cp .plannotator/plans/trainer-v3-2026-03-23-approved.md docs/plans/2026-03-23-trainer-refactor-plan.md
+```
+
+**Step 2: Commit**
+
+```bash
+git add docs/plans/
+git commit -m "docs: add trainer refactoring implementation plan"
+```
+
+---
+
+## Task 2: 重构 common.py - 添加统一配置结构
+
+**Files:**
+- Modify: `src/experiment/common.py` - 在文件末尾添加新的配置结构
+
+**Step 1: 在 common.py 末尾添加 TRAINING_CONFIG 和辅助函数**
+
+```python
+# ============================================================
+# 新增:统一配置结构(用于模块化 Trainer)
+# ============================================================
+
+from typing import Dict, List, Tuple, Any, Callable, Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class TrainingConfig:
+    """训练配置数据结构"""
+
+    # 因子配置
+    selected_factors: List[str]
+    factor_definitions: Dict[str, str]
+    label_factor: Dict[str, str]
+    excluded_factors: List[str]
+
+    # 数据配置
+    stock_pool_filter: Callable
+    stock_pool_required_columns: List[str]
+
+    # 日期范围
+    train_start: str
+    train_end: str
+    val_start: str
+    val_end: str
+    test_start: str
+    test_end: str
+
+    # 输出配置
+    output_dir: str
+    save_predictions: bool
+    save_model: bool
+    top_n: int
+
+    # 开关配置(带默认值的字段必须放在所有必填字段之后,否则 dataclass 定义会报错)
+    st_filter_enabled: bool = True
+
+    @property
+    def date_range(self) -> Dict[str, Tuple[str, str]]:
+        """获取日期范围字典"""
+        return {
+            "train": (self.train_start, self.train_end),
+            "val": (self.val_start, self.val_end),
+            "test": (self.test_start, self.test_end),
+        }
+
+
+@dataclass
+class ModelConfig:
+    """模型配置基类"""
+    model_params: Dict[str, Any]
+    label_name: str
+
+
+@dataclass
+class RegressionModelConfig(ModelConfig):
+    """回归模型配置"""
+    pass
+
+
+@dataclass
+class RankModelConfig(ModelConfig):
+    """排序学习模型配置"""
+    n_quantiles: int = 20
+
+
+# 创建统一配置实例
+def create_training_config() -> TrainingConfig:
+    """创建训练配置"""
+    return TrainingConfig(
+        selected_factors=SELECTED_FACTORS,
+        factor_definitions=FACTOR_DEFINITIONS,
+        label_factor=LABEL_FACTOR,
+        excluded_factors=EXCLUDED_FACTORS,
+        stock_pool_filter=stock_pool_filter,
+        stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
+        st_filter_enabled=True,
+        train_start=TRAIN_START,
+        train_end=TRAIN_END,
+        val_start=VAL_START,
+        val_end=VAL_END,
+        test_start=TEST_START,
+        test_end=TEST_END,
+        output_dir=OUTPUT_DIR,
+        save_predictions=SAVE_PREDICTIONS,
+        
save_model=SAVE_MODEL, + top_n=TOP_N, + ) + + +def create_regression_config() -> RegressionModelConfig: + """创建回归模型配置""" + return RegressionModelConfig( + model_params=MODEL_PARAMS_REGRESSION, + label_name="future_return_5", + ) + + +def create_rank_config() -> RankModelConfig: + """创建排序学习模型配置""" + return RankModelConfig( + model_params=MODEL_PARAMS_RANK, + label_name="future_return_5", + n_quantiles=20, + ) + + +# 保持向后兼容的导入 +__all__ = [ + # 原有导出 + "SELECTED_FACTORS", + "FACTOR_DEFINITIONS", + "LABEL_FACTOR", + "EXCLUDED_FACTORS", + "register_factors", + "prepare_data", + "stock_pool_filter", + "STOCK_FILTER_REQUIRED_COLUMNS", + "TRAIN_START", + "TRAIN_END", + "VAL_START", + "VAL_END", + "TEST_START", + "TEST_END", + "OUTPUT_DIR", + "SAVE_PREDICTIONS", + "SAVE_MODEL", + "TOP_N", + "get_model_save_path", + "save_model_with_factors", + "get_label_factor", + # 新增导出 + "TrainingConfig", + "ModelConfig", + "RegressionModelConfig", + "RankModelConfig", + "create_training_config", + "create_regression_config", + "create_rank_config", +] +``` + +**Step 2: 在 common.py 的 MODEL_PARAMS 定义后添加回归和排序的参数分开定义** + +找到 MODEL_PARAMS 定义的位置(约第400行左右),将其重命名为 MODEL_PARAMS_REGRESSION,然后添加排序学习的参数: + +```python +# 回归模型参数 +MODEL_PARAMS_REGRESSION = { + # ... 原有的 MODEL_PARAMS 内容 ... +} + +# 排序学习模型参数 +MODEL_PARAMS_RANK = { + "objective": "lambdarank", + "metric": "ndcg", + "ndcg_at": 25, + "learning_rate": 0.1, + "n_estimators": 1000, + "early_stopping_round": 50, + "max_depth": 4, + "num_leaves": 32, + "min_data_in_leaf": 256, + "subsample": 0.4, + "subsample_freq": 1, + "colsample_bytree": 0.4, + "reg_alpha": 10.0, + "reg_lambda": 50.0, + "lambdarank_truncation_level": 50, + "label_gain": [i * i for i in range(1, 21)], + "verbose": -1, + "random_state": 42, +} + +# 保持向后兼容 +MODEL_PARAMS = MODEL_PARAMS_REGRESSION +``` + +**Step 3: Run tests to verify changes don't break existing code** + +```bash +uv run pytest tests/test_sync.py -v -x +``` + +Expected: Tests pass (or at least not broken by our changes) + +**Step 4: Commit** + +```bash +git add src/experiment/common.py +git commit -m "refactor(common): add unified config structure for modular trainer + +- Add TrainingConfig dataclass for unified configuration +- Add ModelConfig, RegressionModelConfig, RankModelConfig +- Separate MODEL_PARAMS into MODEL_PARAMS_REGRESSION and MODEL_PARAMS_RANK +- Add factory functions: create_training_config, create_regression_config, create_rank_config +- Maintain backward compatibility" +``` + +--- + +## Task 3: 创建 FactorManager 组件 + +**Files:** +- Create: `src/training/factor_manager.py` +- Test: `tests/test_factor_manager.py` + +**Step 1: Create the FactorManager implementation** + +```python +"""因子管理器 + +管理多种来源的因子: +- metadata 中注册的因子 +- DSL 表达式定义的因子 +- Label 因子 +- 排除的因子列表 +""" + +from typing import Dict, List, Optional, Any +import polars as pl + +from src.factors import FactorEngine + + +class FactorManager: + """因子管理器 + + 统一管理多种来源的因子注册和准备: + 1. metadata 中已注册的因子(通过名称引用) + 2. DSL 表达式定义的因子(动态注册) + 3. Label 因子(通过表达式定义) + 4. 
排除的因子列表(从最终列表中移除) + + Attributes: + selected_factors: 从 metadata 中选择的因子名称列表 + factor_definitions: DSL 表达式定义的因子字典 {name: dsl_expression} + label_factor: Label 因子定义 {name: dsl_expression} + excluded_factors: 需要排除的因子名称列表 + registered_factors: 已注册到 FactorEngine 的因子列表 + """ + + def __init__( + self, + selected_factors: List[str], + factor_definitions: Dict[str, str], + label_factor: Dict[str, str], + excluded_factors: Optional[List[str]] = None, + ): + """初始化因子管理器 + + Args: + selected_factors: 从 metadata 中选择的因子名称列表 + factor_definitions: DSL 表达式定义的因子字典 + label_factor: Label 因子定义字典 + excluded_factors: 需要排除的因子名称列表 + """ + self.selected_factors = selected_factors or [] + self.factor_definitions = factor_definitions or {} + self.label_factor = label_factor or {} + self.excluded_factors = excluded_factors or [] + self.registered_factors: List[str] = [] + + def register_to_engine( + self, + engine: FactorEngine, + verbose: bool = True, + ) -> List[str]: + """注册所有因子到 FactorEngine + + 按以下顺序注册: + 1. metadata 中的因子(通过名称从 metadata 加载) + 2. DSL 表达式定义的因子(使用 add_factor 注册) + 3. Label 因子(使用 add_factor 注册) + 4. 排除指定的因子 + + Args: + engine: FactorEngine 实例 + verbose: 是否打印注册信息 + + Returns: + 最终的特征列名列表(已排除指定因子) + """ + if verbose: + print("\n" + "=" * 80) + print("因子注册") + print("=" * 80) + + # Step 1: 从 metadata 注册选中的因子 + if verbose: + print(f"\n[1/4] 从 metadata 注册 {len(self.selected_factors)} 个因子...") + + feature_cols = [] + for factor_name in self.selected_factors: + try: + engine.add_factor(factor_name) + feature_cols.append(factor_name) + if verbose: + print(f" ✓ {factor_name}") + except Exception as e: + if verbose: + print(f" ✗ {factor_name}: {e}") + + # Step 2: 注册 DSL 定义的因子 + if self.factor_definitions: + if verbose: + print(f"\n[2/4] 注册 {len(self.factor_definitions)} 个 DSL 定义因子...") + + for factor_name, dsl_expr in self.factor_definitions.items(): + if factor_name not in self.excluded_factors: + try: + engine.add_factor(factor_name, dsl_expr) + feature_cols.append(factor_name) + if verbose: + print(f" ✓ {factor_name}: {dsl_expr[:50]}...") + except Exception as e: + if verbose: + print(f" ✗ {factor_name}: {e}") + + # Step 3: 注册 Label 因子 + if self.label_factor: + if verbose: + print(f"\n[3/4] 注册 Label 因子...") + + for factor_name, dsl_expr in self.label_factor.items(): + try: + engine.add_factor(factor_name, dsl_expr) + if verbose: + print(f" ✓ Label: {factor_name}") + except Exception as e: + if verbose: + print(f" ✗ Label {factor_name}: {e}") + + # Step 4: 排除指定因子 + if self.excluded_factors: + if verbose: + print(f"\n[4/4] 排除 {len(self.excluded_factors)} 个因子...") + + original_count = len(feature_cols) + feature_cols = [f for f in feature_cols if f not in self.excluded_factors] + excluded_count = original_count - len(feature_cols) + + if verbose: + print(f" 排除 {excluded_count} 个因子") + for f in self.excluded_factors: + if f in self.selected_factors or f in self.factor_definitions: + print(f" - {f}") + + self.registered_factors = feature_cols + + if verbose: + print(f"\n[结果] 最终特征数: {len(feature_cols)}") + print("=" * 80) + + return feature_cols + + def get_feature_cols(self) -> List[str]: + """获取已注册的特征列名列表 + + Returns: + 特征列名列表 + """ + return self.registered_factors + + def get_label_col(self) -> Optional[str]: + """获取 Label 列名 + + Returns: + Label 列名,如果没有则返回 None + """ + if self.label_factor: + return list(self.label_factor.keys())[0] + return None +``` + +**Step 2: Create test file** + +```python +"""FactorManager 测试""" + +import pytest +from unittest.mock import Mock, MagicMock + +from 
src.training.factor_manager import FactorManager + + +class TestFactorManager: + """测试 FactorManager""" + + def test_init(self): + """测试初始化""" + fm = FactorManager( + selected_factors=["factor1", "factor2"], + factor_definitions={"factor3": "close + open"}, + label_factor={"label": "future_return_5"}, + excluded_factors=["factor2"], + ) + + assert fm.selected_factors == ["factor1", "factor2"] + assert fm.factor_definitions == {"factor3": "close + open"} + assert fm.label_factor == {"label": "future_return_5"} + assert fm.excluded_factors == ["factor2"] + assert fm.registered_factors == [] + + def test_register_to_engine(self): + """测试注册到引擎""" + # 创建 mock engine + engine = Mock() + engine.add_factor = Mock() + + fm = FactorManager( + selected_factors=["factor1", "factor2"], + factor_definitions={"factor3": "close + open"}, + label_factor={"label": "future_return"}, + excluded_factors=["factor2"], + ) + + feature_cols = fm.register_to_engine(engine, verbose=False) + + # 验证调用 + assert engine.add_factor.call_count == 4 # 2 selected + 1 dsl + 1 label + + # 验证结果(factor2 被排除) + assert "factor1" in feature_cols + assert "factor2" not in feature_cols + assert "factor3" in feature_cols + assert fm.registered_factors == feature_cols + + def test_get_feature_cols(self): + """测试获取特征列""" + fm = FactorManager( + selected_factors=["factor1"], + factor_definitions={}, + label_factor={}, + ) + + # 注册前为空 + assert fm.get_feature_cols() == [] + + # 注册后 + engine = Mock() + engine.add_factor = Mock() + fm.register_to_engine(engine, verbose=False) + + assert fm.get_feature_cols() == ["factor1"] + + def test_get_label_col(self): + """测试获取 Label 列""" + fm = FactorManager( + selected_factors=[], + factor_definitions={}, + label_factor={"label": "future_return"}, + ) + + assert fm.get_label_col() == "label" + + # 没有 label 时返回 None + fm2 = FactorManager(selected_factors=[], factor_definitions={}, label_factor={}) + assert fm2.get_label_col() is None +``` + +**Step 3: Run tests** + +```bash +uv run pytest tests/test_factor_manager.py -v +``` + +Expected: All tests pass + +**Step 4: Commit** + +```bash +git add src/training/factor_manager.py tests/test_factor_manager.py +git commit -m "feat(training): add FactorManager component + +- Manage factors from multiple sources (metadata, DSL, label, excluded) +- Register factors to FactorEngine with proper ordering +- Support factor exclusion +- Add comprehensive tests" +``` + +--- + +## Task 4: 创建 DataPipeline 组件 + +**Files:** +- Create: `src/training/pipeline.py` +- Test: `tests/test_pipeline.py` + +**Step 1: Create the DataPipeline implementation** + +```python +"""数据流水线 + +完整的数据处理流程: +1. 因子注册和数据准备 +2. 应用过滤器(STFilter 等) +3. 股票池筛选(自定义函数) +4. 数据质量检查 +5. 数据划分(train/val/test) +6. 
数据预处理(fit_transform/transform) +""" + +from typing import Any, Callable, Dict, List, Optional, Tuple +import polars as pl +import numpy as np + +from src.factors import FactorEngine +from src.training.factor_manager import FactorManager +from src.training.components.base import BaseProcessor +from src.training.core.stock_pool_manager import StockPoolManager + + +class DataPipeline: + """数据流水线 + + 执行完整的数据处理流程,返回标准化的数据字典。 + + Attributes: + factor_manager: 因子管理器 + filters: 类形式的过滤器列表(如 STFilter) + stock_pool_filter_func: 函数形式的股票池筛选器 + processors: 数据处理器列表 + stock_pool_required_columns: 股票池筛选所需的额外列 + fitted_processors: 已拟合的处理器列表(训练后填充) + """ + + def __init__( + self, + factor_manager: FactorManager, + processors: List[BaseProcessor], + filters: Optional[List[Any]] = None, + stock_pool_filter_func: Optional[Callable] = None, + stock_pool_required_columns: Optional[List[str]] = None, + ): + """初始化数据流水线 + + Args: + factor_manager: 因子管理器实例 + processors: 数据处理器列表(顺序执行) + filters: 类形式的过滤器列表(如 [STFilter]) + stock_pool_filter_func: 函数形式的股票池筛选器 + stock_pool_required_columns: 股票池筛选所需的额外列 + """ + self.factor_manager = factor_manager + self.processors = processors or [] + self.filters = filters or [] + self.stock_pool_filter_func = stock_pool_filter_func + self.stock_pool_required_columns = stock_pool_required_columns or [] + self.fitted_processors: List[BaseProcessor] = [] + + def prepare_data( + self, + engine: FactorEngine, + date_range: Dict[str, Tuple[str, str]], + label_name: str, + verbose: bool = True, + ) -> Dict[str, Dict[str, Any]]: + """执行完整数据流程 + + 流程: + 1. 注册因子并准备数据 + 2. 应用类过滤器(STFilter) + 3. 应用股票池筛选(函数形式) + 4. 数据质量检查 + 5. 数据划分 + 6. 数据预处理 + + Args: + engine: FactorEngine 实例 + date_range: 日期范围字典 {"train": (start, end), "val": ..., "test": ...} + label_name: Label 列名 + verbose: 是否打印处理信息 + + Returns: + 标准化的数据字典: + { + "train": { + "X": pl.DataFrame, # 特征矩阵 + "y": pl.Series, # 目标变量 + "raw_data": pl.DataFrame, # 原始数据(保留完整信息) + "feature_cols": List[str], # 特征列名 + }, + "val": {...}, + "test": {...}, + } + """ + if verbose: + print("\n" + "=" * 80) + print("数据流水线") + print("=" * 80) + + # Step 1: 注册因子并准备数据 + if verbose: + print("\n[1/6] 注册因子并准备数据...") + + feature_cols = self.factor_manager.register_to_engine(engine, verbose=verbose) + + # 计算完整日期范围 + all_start = min(date_range["train"][0], date_range["val"][0], date_range["test"][0]) + all_end = max(date_range["train"][1], date_range["val"][1], date_range["test"][1]) + + # 准备数据 + data = engine.compute( + factors=feature_cols + [label_name], + start_date=all_start, + end_date=all_end, + ) + + if verbose: + print(f" 原始数据规模: {data.shape}") + print(f" 特征数: {len(feature_cols)}") + + # Step 2: 应用类过滤器(STFilter) + if self.filters: + if verbose: + print(f"\n[2/6] 应用过滤器({len(self.filters)}个)...") + + for filter_obj in self.filters: + data_before = len(data) + data = filter_obj.filter(data) + data_after = len(data) + + if verbose: + print(f" {filter_obj.__class__.__name__}:") + print(f" 过滤前: {data_before}, 过滤后: {data_after}") + print(f" 删除: {data_before - data_after}") + + # Step 3: 应用股票池筛选(函数形式) + if self.stock_pool_filter_func: + if verbose: + print(f"\n[3/6] 股票池筛选...") + + data_before = len(data) + + # 创建 StockPoolManager + pool_manager = StockPoolManager( + filter_func=self.stock_pool_filter_func, + required_columns=self.stock_pool_required_columns, + data_router=engine.router, + ) + + data = pool_manager.filter_and_select_daily(data) + data_after = len(data) + + if verbose: + print(f" 筛选前: {data_before}, 筛选后: {data_after}") + print(f" 删除: {data_before - 
data_after}") + + # Step 4: 数据质量检查 + if verbose: + print(f"\n[4/6] 数据质量检查...") + + self._check_data_quality(data, feature_cols, verbose=verbose) + + # Step 5: 数据划分 + if verbose: + print(f"\n[5/6] 数据划分...") + + split_data = self._split_data(data, date_range, feature_cols, label_name, verbose=verbose) + + # Step 6: 数据预处理 + if verbose: + print(f"\n[6/6] 数据预处理...") + + split_data = self._preprocess(split_data, verbose=verbose) + + if verbose: + print("\n" + "=" * 80) + print("数据流水线完成") + print("=" * 80) + + return split_data + + def _check_data_quality( + self, + data: pl.DataFrame, + feature_cols: List[str], + verbose: bool = True, + ) -> None: + """检查数据质量 + + Args: + data: 数据框 + feature_cols: 特征列名列表 + verbose: 是否打印信息 + """ + # 检查缺失值 + null_counts = {} + for col in feature_cols: + null_count = data[col].null_count() + if null_count > 0: + null_counts[col] = null_count + + if null_counts and verbose: + print(f" [警告] 发现缺失值:") + for col, count in list(null_counts.items())[:5]: # 只显示前5个 + pct = count / len(data) * 100 + print(f" {col}: {count} ({pct:.2f}%)") + + def _split_data( + self, + data: pl.DataFrame, + date_range: Dict[str, Tuple[str, str]], + feature_cols: List[str], + label_name: str, + verbose: bool = True, + ) -> Dict[str, Dict[str, Any]]: + """划分数据集 + + Args: + data: 完整数据 + date_range: 日期范围字典 + feature_cols: 特征列名 + label_name: Label 列名 + verbose: 是否打印信息 + + Returns: + 划分后的数据字典 + """ + result = {} + + for split_name, (start, end) in date_range.items(): + mask = (data["trade_date"] >= start) & (data["trade_date"] <= end) + split_df = data.filter(mask) + + result[split_name] = { + "X": split_df.select(feature_cols), + "y": split_df[label_name], + "raw_data": split_df, + "feature_cols": feature_cols, + } + + if verbose: + print(f" {split_name}: {len(split_df)} 条记录") + + return result + + def _preprocess( + self, + split_data: Dict[str, Dict[str, Any]], + verbose: bool = True, + ) -> Dict[str, Dict[str, Any]]: + """预处理数据 + + 训练集使用 fit_transform,验证集和测试集使用 transform + + Args: + split_data: 划分后的数据字典 + verbose: 是否打印信息 + + Returns: + 预处理后的数据字典 + """ + if not self.processors: + return split_data + + self.fitted_processors = [] + + # 训练集:fit_transform + if verbose: + print(f" 训练集预处理(fit_transform)...") + + train_data = split_data["train"]["raw_data"] + for processor in self.processors: + train_data = processor.fit_transform(train_data) + self.fitted_processors.append(processor) + + # 更新训练集 + split_data["train"]["raw_data"] = train_data + split_data["train"]["X"] = train_data.select(split_data["train"]["feature_cols"]) + split_data["train"]["y"] = train_data[split_data["train"]["y"].name] + + # 验证集和测试集:transform + for split_name in ["val", "test"]: + if split_name in split_data: + if verbose: + print(f" {split_name}集预处理(transform)...") + + split_df = split_data[split_name]["raw_data"] + for processor in self.fitted_processors: + split_df = processor.transform(split_df) + + split_data[split_name]["raw_data"] = split_df + split_data[split_name]["X"] = split_df.select(split_data[split_name]["feature_cols"]) + split_data[split_name]["y"] = split_df[split_data[split_name]["y"].name] + + return split_data + + def get_fitted_processors(self) -> List[BaseProcessor]: + """获取已拟合的处理器列表 + + Returns: + 已拟合的处理器列表(用于模型保存) + """ + return self.fitted_processors +``` + +**Step 2: Create test file** + +```python +"""DataPipeline 测试""" + +import pytest +from unittest.mock import Mock, MagicMock +import polars as pl + +from src.training.pipeline import DataPipeline +from src.training.factor_manager import 
FactorManager +from src.training.components.processors import NullFiller + + +class TestDataPipeline: + """测试 DataPipeline""" + + def test_init(self): + """测试初始化""" + fm = Mock(spec=FactorManager) + processors = [NullFiller(feature_cols=["f1"])] + + pipeline = DataPipeline( + factor_manager=fm, + processors=processors, + ) + + assert pipeline.factor_manager == fm + assert pipeline.processors == processors + assert pipeline.fitted_processors == [] + + def test_get_fitted_processors(self): + """测试获取已拟合处理器""" + pipeline = DataPipeline( + factor_manager=Mock(), + processors=[], + ) + + # 模拟已拟合处理器 + pipeline.fitted_processors = [Mock()] + + assert len(pipeline.get_fitted_processors()) == 1 +``` + +**Step 3: Commit** + +```bash +git add src/training/pipeline.py tests/test_pipeline.py +git commit -m "feat(training): add DataPipeline component + +- Complete data processing pipeline: register factors, filter, split, preprocess +- Support both class filters (STFilter) and function filters (stock_pool_filter) +- Proper fit_transform/transform separation for processors +- Add comprehensive tests" +``` + +--- + +## Task 5: 创建 Task 策略组件 + +**Files:** +- Create: `src/training/tasks/base.py` +- Create: `src/training/tasks/regression_task.py` +- Create: `src/training/tasks/rank_task.py` +- Create: `src/training/tasks/__init__.py` +- Test: `tests/test_tasks.py` + +**Step 1: Create base Task protocol** + +```python +"""任务抽象基类 + +定义 Task 接口,所有具体任务必须实现此接口。 +""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional +import numpy as np + + +class BaseTask(ABC): + """任务抽象基类 + + 所有训练任务(回归、排序学习、分类等)必须继承此类。 + 提供统一的接口:Label处理、模型训练、预测、评估。 + + Attributes: + label_name: Label 列名 + model_params: 模型参数字典 + """ + + def __init__(self, model_params: Dict[str, Any], label_name: str): + """初始化任务 + + Args: + model_params: 模型参数字典 + label_name: Label 列名 + """ + self.model_params = model_params + self.label_name = label_name + self.model = None + + @abstractmethod + def prepare_labels(self, data: Dict[str, Dict]) -> Dict[str, Dict]: + """准备标签 + + 子类可实现特定的 Label 转换逻辑(如排序学习的分位数转换)。 + + Args: + data: 数据字典 + + Returns: + 处理后的数据字典 + """ + raise NotImplementedError + + @abstractmethod + def fit(self, train_data: Dict, val_data: Dict) -> None: + """训练模型 + + Args: + train_data: 训练数据字典 {"X": DataFrame, "y": Series, ...} + val_data: 验证数据字典 + """ + raise NotImplementedError + + @abstractmethod + def predict(self, test_data: Dict) -> np.ndarray: + """生成预测 + + Args: + test_data: 测试数据字典 + + Returns: + 预测结果数组 + """ + raise NotImplementedError + + def get_model(self): + """获取底层模型 + + Returns: + 训练后的模型实例 + """ + return self.model + + def plot_training_metrics(self) -> None: + """绘制训练指标曲线(可选)""" + pass +``` + +**Step 2: Create RegressionTask** + +```python +"""回归任务实现 + +实现回归任务的训练流程: +- Label 无需转换(保持连续值) +- 使用 LightGBM 回归模型 +- 支持 MAE/RMSE 评估 +""" + +from typing import Any, Dict, Optional +import numpy as np +import polars as pl + +from src.training.tasks.base import BaseTask +from src.training.components.models.lightgbm import LightGBMModel + + +class RegressionTask(BaseTask): + """回归任务 + + 使用 LightGBM 进行回归训练,支持早停和训练曲线。 + """ + + def __init__( + self, + model_params: Dict[str, Any], + label_name: str = "future_return_5", + ): + """初始化回归任务 + + Args: + model_params: LightGBM 参数字典 + label_name: Label 列名 + """ + super().__init__(model_params, label_name) + self.evals_result: Optional[Dict] = None + + def prepare_labels(self, data: Dict[str, Dict]) -> Dict[str, Dict]: + """准备标签(回归任务无需转换) + + Args: + data: 数据字典 + + Returns: + 
原样返回数据字典 + """ + # 回归任务不需要转换 Label + return data + + def fit(self, train_data: Dict, val_data: Dict) -> None: + """训练回归模型 + + Args: + train_data: 训练数据 {"X": DataFrame, "y": Series} + val_data: 验证数据 + """ + self.model = LightGBMModel(params=self.model_params) + + X_train = train_data["X"] + y_train = train_data["y"] + X_val = val_data["X"] + y_val = val_data["y"] + + self.model.fit( + X_train, y_train, + eval_set=(X_val, y_val) if X_val is not None else None + ) + + def predict(self, test_data: Dict) -> np.ndarray: + """生成预测 + + Args: + test_data: 测试数据 + + Returns: + 预测结果 + """ + return self.model.predict(test_data["X"]) + + def plot_training_metrics(self) -> None: + """绘制训练指标曲线""" + if self.model and hasattr(self.model, 'model') and self.model.model: + try: + import lightgbm as lgb + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(figsize=(10, 6)) + lgb.plot_metric(self.model.model, ax=ax) + plt.title("Training Metrics", fontsize=12, fontweight="bold") + plt.tight_layout() + plt.show() + except Exception as e: + print(f"[警告] 无法绘制训练曲线: {e}") +``` + +**Step 3: Create RankTask** + +```python +"""排序学习任务实现 + +实现排序学习任务的训练流程: +- Label 转换为分位数标签 +- 生成 group 数组 +- 使用 LightGBM LambdaRank +- 支持 NDCG@k 评估 +""" + +from typing import Any, Dict, List, Optional +import numpy as np +import polars as pl + +from src.training.tasks.base import BaseTask +from src.training.components.models.lightgbm_lambdarank import LightGBMLambdaRankModel + + +class RankTask(BaseTask): + """排序学习任务 + + 使用 LightGBM LambdaRank 进行排序学习训练。 + 将连续收益率转换为分位数标签进行训练。 + """ + + def __init__( + self, + model_params: Dict[str, Any], + label_name: str = "future_return_5", + n_quantiles: int = 20, + ): + """初始化排序学习任务 + + Args: + model_params: LightGBM 参数字典 + label_name: Label 列名 + n_quantiles: 分位数数量 + """ + super().__init__(model_params, label_name) + self.n_quantiles = n_quantiles + + def prepare_labels(self, data: Dict[str, Dict]) -> Dict[str, Dict]: + """准备标签(转换为分位数标签) + + 将连续收益率转换为分位数标签,并生成 group 数组。 + + Args: + data: 数据字典 + + Returns: + 处理后的数据字典(添加了 y_rank 和 groups) + """ + for split in ["train", "val", "test"]: + if split not in data: + continue + + df = data[split]["raw_data"] + + # 分位数转换 + rank_col = f"{self.label_name}_rank" + df_ranked = ( + df.with_columns( + pl.col(self.label_name) + .rank(method="min") + .over("trade_date") + .alias("_rank") + ) + .with_columns( + ((pl.col("_rank") - 1) / pl.len().over("trade_date") * self.n_quantiles) + .floor() + .cast(pl.Int64) + .clip(0, self.n_quantiles - 1) + .alias(rank_col) + ) + .drop("_rank") + ) + + # 更新数据 + data[split]["raw_data"] = df_ranked + data[split]["y"] = df_ranked[rank_col] + data[split]["y_raw"] = df_ranked[self.label_name] # 保留原始值 + + # 生成 group 数组 + data[split]["groups"] = self._compute_group_array(df_ranked, "trade_date") + + return data + + def _compute_group_array( + self, + df: pl.DataFrame, + date_col: str = "trade_date", + ) -> np.ndarray: + """计算 group 数组 + + Args: + df: 数据框 + date_col: 日期列名 + + Returns: + group 数组(每个日期的样本数) + """ + group_counts = df.group_by(date_col, maintain_order=True).agg( + pl.count().alias("count") + ) + return group_counts["count"].to_numpy() + + def fit(self, train_data: Dict, val_data: Dict) -> None: + """训练排序模型 + + Args: + train_data: 训练数据 + val_data: 验证数据 + """ + self.model = LightGBMLambdaRankModel(params=self.model_params) + + self.model.fit( + train_data["X"], train_data["y"], + group=train_data["groups"], + eval_set=(val_data["X"], val_data["y"], val_data["groups"]) if val_data else None + ) + + def predict(self, test_data: 
Dict) -> np.ndarray: + """生成预测 + + Args: + test_data: 测试数据 + + Returns: + 预测结果 + """ + return self.model.predict(test_data["X"]) + + def evaluate_ndcg( + self, + test_data: Dict, + k_list: List[int] = None, + ) -> Dict[str, float]: + """评估 NDCG@k + + Args: + test_data: 测试数据 + k_list: k 值列表,默认 [1, 5, 10, 20] + + Returns: + NDCG 分数字典 {"ndcg@1": score, ...} + """ + if k_list is None: + k_list = [1, 5, 10, 20] + + y_true = test_data["y_raw"] + y_pred = self.predict(test_data) + groups = test_data["groups"] + + from sklearn.metrics import ndcg_score + + results = {} + + # 按 group 拆分 + start_idx = 0 + y_true_groups = [] + y_pred_groups = [] + + for group_size in groups: + end_idx = start_idx + group_size + y_true_groups.append(y_true.to_numpy()[start_idx:end_idx]) + y_pred_groups.append(y_pred[start_idx:end_idx]) + start_idx = end_idx + + # 计算每个 k 的 NDCG + for k in k_list: + ndcg_scores = [] + for yt, yp in zip(y_true_groups, y_pred_groups): + if len(yt) > 1: + try: + score = ndcg_score([yt], [yp], k=k) + ndcg_scores.append(score) + except ValueError: + pass + + results[f"ndcg@{k}"] = float(np.mean(ndcg_scores)) if ndcg_scores else 0.0 + + return results + + def plot_training_metrics(self) -> None: + """绘制训练指标曲线(NDCG)""" + if self.model: + try: + self.model.plot_all_metrics() + except Exception as e: + print(f"[警告] 无法绘制训练曲线: {e}") +``` + +**Step 4: Create tasks/__init__.py** + +```python +"""Tasks 模块 + +提供各种训练任务的实现。 +""" + +from src.training.tasks.base import BaseTask +from src.training.tasks.regression_task import RegressionTask +from src.training.tasks.rank_task import RankTask + +__all__ = [ + "BaseTask", + "RegressionTask", + "RankTask", +] +``` + +**Step 5: Create test file** + +```python +"""Task 测试""" + +import pytest +from unittest.mock import Mock, patch +import numpy as np +import polars as pl + +from src.training.tasks import RegressionTask, RankTask + + +class TestRegressionTask: + """测试 RegressionTask""" + + def test_init(self): + """测试初始化""" + task = RegressionTask( + model_params={"objective": "regression"}, + label_name="target", + ) + + assert task.model_params == {"objective": "regression"} + assert task.label_name == "target" + assert task.model is None + + def test_prepare_labels(self): + """测试 Label 准备(回归无需转换)""" + task = RegressionTask(model_params={}, label_name="target") + + data = {"train": {"y": Mock()}} + result = task.prepare_labels(data) + + # 回归任务应该原样返回 + assert result == data + + +class TestRankTask: + """测试 RankTask""" + + def test_init(self): + """测试初始化""" + task = RankTask( + model_params={"objective": "lambdarank"}, + label_name="target", + n_quantiles=10, + ) + + assert task.n_quantiles == 10 + + def test_compute_group_array(self): + """测试 group 数组计算""" + task = RankTask(model_params={}, label_name="target") + + # 创建测试数据 + df = pl.DataFrame({ + "trade_date": ["20240101", "20240101", "20240102", "20240102", "20240102"], + "value": [1, 2, 3, 4, 5], + }) + + groups = task._compute_group_array(df, "trade_date") + + assert len(groups) == 2 # 两个日期 + assert groups[0] == 2 # 第一个日期2条 + assert groups[1] == 3 # 第二个日期3条 +``` + +**Step 6: Commit** + +```bash +git add src/training/tasks/ +git add tests/test_tasks.py +git commit -m "feat(training): add Task strategy components + +- Add BaseTask abstract base class +- Add RegressionTask for regression training +- Add RankTask for learning-to-rank with LambdaRank +- Support quantile label conversion and NDCG evaluation +- Add comprehensive tests" +``` + +--- + +## Task 6: 创建 ResultAnalyzer 组件 + +**Files:** +- Create: 
`src/training/result_analyzer.py` +- Test: `tests/test_result_analyzer.py` + +**Step 1: Create the ResultAnalyzer implementation** + +```python +"""结果分析器 + +训练后的分析和结果处理: +1. 特征重要性分析(Top N、零贡献特征) +2. 结果组装(生成每日 Top N) +3. 结果保存 +""" + +from typing import Any, Dict, List, Optional +import os +import polars as pl +import pandas as pd +import numpy as np + + +class ResultAnalyzer: + """结果分析器 + + 分析训练结果,生成报告并保存。 + """ + + def analyze_feature_importance( + self, + model, + feature_cols: List[str], + top_n: int = 20, + verbose: bool = True, + ) -> Dict[str, Any]: + """分析特征重要性 + + Args: + model: 训练好的模型 + feature_cols: 特征列名列表 + top_n: 显示 Top N 特征 + verbose: 是否打印信息 + + Returns: + 分析结果字典 + """ + importance = model.feature_importance() + + if importance is None: + if verbose: + print("[警告] 无法获取特征重要性") + return {} + + # 按重要性排序 + importance_sorted = importance.sort_values(ascending=False) + + # 计算百分比 + total_importance = importance_sorted.sum() + importance_pct = (importance_sorted / total_importance * 100).round(2) + + # 识别零贡献特征 + zero_importance_features = importance_sorted[importance_sorted == 0].index.tolist() + + if verbose: + print("\n" + "=" * 80) + print("特征重要性分析") + print("=" * 80) + + # 打印 Top N + print(f"\nTop {top_n} 特征:") + print("-" * 80) + print(f"{'排名':<6}{'特征名':<35}{'重要性':<15}{'占比':<10}") + print("-" * 80) + + for i, (feature, score) in enumerate(importance_sorted.head(top_n).items(), 1): + pct = importance_pct[feature] + if pct >= 10: + marker = " [高贡献]" + elif pct >= 1: + marker = " [中贡献]" + else: + marker = " [低贡献]" + print(f"{i:<6}{feature:<35}{score:<15.2f}{pct:<8.2f}%{marker}") + + # 打印零贡献特征 + if zero_importance_features: + print("\n" + "-" * 80) + print(f"[警告] 贡献为0的特征(共 {len(zero_importance_features)} 个):") + for i, feature in enumerate(zero_importance_features, 1): + print(f" {i}. 
{feature}") + + # 统计摘要 + print("\n" + "=" * 80) + print("统计摘要:") + print("-" * 80) + print(f" 特征总数: {len(importance_sorted)}") + print(f" 有贡献特征数: {len(importance_sorted) - len(zero_importance_features)}") + print(f" 零贡献特征数: {len(zero_importance_features)}") + if len(importance_sorted) > 0: + print(f" 零贡献占比: {len(zero_importance_features) / len(importance_sorted) * 100:.1f}%") + print(f" Top {top_n} 累计占比: {importance_pct.head(top_n).sum():.1f}%") + print("=" * 80) + + return { + "importance": importance_sorted, + "importance_pct": importance_pct, + "zero_importance_features": zero_importance_features, + "top_n": importance_sorted.head(top_n), + } + + def assemble_results( + self, + test_data: Dict[str, Any], + predictions: np.ndarray, + top_n: int = 50, + verbose: bool = True, + ) -> pl.DataFrame: + """组装结果 + + 生成每日 Top N 股票推荐列表。 + + Args: + test_data: 测试数据字典 + predictions: 预测结果数组 + top_n: 每日选择的股票数 + verbose: 是否打印信息 + + Returns: + 结果数据框 + """ + # 添加预测列 + raw_data = test_data["raw_data"] + results = raw_data.with_columns([ + pl.Series("prediction", predictions) + ]) + + # 按日期分组取 Top N + unique_dates = results["trade_date"].unique().sort() + topn_by_date = [] + + for date in unique_dates: + day_data = results.filter(results["trade_date"] == date) + topn = day_data.sort("prediction", descending=True).head(top_n) + topn_by_date.append(topn) + + # 合并所有日期的 Top N + topn_results = pl.concat(topn_by_date) + + if verbose: + print(f"\n生成每日 Top {top_n} 股票列表:") + print(f" 交易日数: {len(unique_dates)}") + print(f" 总推荐数: {len(topn_results)}") + + return topn_results + + def save_results( + self, + results: pl.DataFrame, + output_path: str, + verbose: bool = True, + ) -> None: + """保存结果 + + Args: + results: 结果数据框 + output_path: 输出路径 + verbose: 是否打印信息 + """ + # 格式化日期并调整列顺序 + formatted = results.select([ + (pl.col("trade_date").str.slice(0, 4) + "-" + + pl.col("trade_date").str.slice(4, 2) + "-" + + pl.col("trade_date").str.slice(6, 2)).alias("date"), + pl.col("prediction").alias("score"), + pl.col("ts_code"), + ]) + + # 确保目录存在 + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + # 保存 CSV + formatted.write_csv(output_path, include_header=True) + + if verbose: + print(f" 保存路径: {output_path}") + print(f" 保存行数: {len(formatted)}") +``` + +**Step 2: Create test file** + +```python +"""ResultAnalyzer 测试""" + +import pytest +from unittest.mock import Mock +import polars as pl +import pandas as pd +import numpy as np + +from src.training.result_analyzer import ResultAnalyzer + + +class TestResultAnalyzer: + """测试 ResultAnalyzer""" + + def test_init(self): + """测试初始化""" + analyzer = ResultAnalyzer() + assert analyzer is not None + + def test_analyze_feature_importance(self): + """测试特征重要性分析""" + analyzer = ResultAnalyzer() + + # 创建 mock model + model = Mock() + importance = pd.Series( + [100, 50, 0, 0, 30], + index=["feat1", "feat2", "feat3", "feat4", "feat5"] + ) + model.feature_importance.return_value = importance + + result = analyzer.analyze_feature_importance( + model=model, + feature_cols=["feat1", "feat2", "feat3", "feat4", "feat5"], + top_n=3, + verbose=False, + ) + + assert "importance" in result + assert "zero_importance_features" in result + assert len(result["zero_importance_features"]) == 2 # feat3, feat4 + + def test_assemble_results(self): + """测试结果组装""" + analyzer = ResultAnalyzer() + + # 创建测试数据 + test_data = { + "raw_data": pl.DataFrame({ + "trade_date": ["20240101", "20240101", "20240102", "20240102"], + "ts_code": ["000001.SZ", "000002.SZ", "000001.SZ", "000002.SZ"], + }) + } + predictions = 
np.array([0.5, 0.3, 0.8, 0.2]) + + results = analyzer.assemble_results( + test_data=test_data, + predictions=predictions, + top_n=1, + verbose=False, + ) + + assert len(results) == 2 # 每天选1个,共2天 +``` + +**Step 3: Commit** + +```bash +git add src/training/result_analyzer.py tests/test_result_analyzer.py +git commit -m "feat(training): add ResultAnalyzer component + +- Analyze feature importance with top N and zero-contribution features +- Assemble daily Top N stock recommendations +- Save results to CSV with proper formatting +- Add comprehensive tests" +``` + +--- + +## Task 7: 重构 Trainer 为调度引擎 + +**Files:** +- Create: `src/training/core/trainer_new.py` (新实现) +- Modify: `src/training/__init__.py` - 添加新导出 + +**Step 1: Create new Trainer implementation** + +```python +"""训练调度引擎 + +协调 FactorManager、DataPipeline、Task 和 ResultAnalyzer 完成训练流程。 +""" + +from typing import Any, Callable, Dict, List, Optional, Tuple +import os +from datetime import datetime + +import polars as pl + +from src.factors import FactorEngine +from src.training.pipeline import DataPipeline +from src.training.tasks.base import BaseTask +from src.training.result_analyzer import ResultAnalyzer + + +class Trainer: + """训练调度引擎 + + 协调各个组件执行完整训练流程: + 1. 准备数据(DataPipeline) + 2. 处理标签(Task) + 3. 训练模型(Task) + 4. 绘制指标(Task) + 5. 生成预测(Task) + 6. 分析结果(ResultAnalyzer) + 7. 保存结果 + + Attributes: + data_pipeline: 数据流水线 + task: 任务实例(RegressionTask/RankTask) + analyzer: 结果分析器 + output_config: 输出配置 + verbose: 是否打印详细信息 + results: 训练结果 + """ + + def __init__( + self, + data_pipeline: DataPipeline, + task: BaseTask, + analyzer: Optional[ResultAnalyzer] = None, + output_config: Optional[Dict[str, Any]] = None, + verbose: bool = True, + ): + """初始化训练器 + + Args: + data_pipeline: 数据流水线实例 + task: 任务实例(RegressionTask 或 RankTask) + analyzer: 结果分析器(可选,默认创建新实例) + output_config: 输出配置字典 + verbose: 是否打印详细信息 + """ + self.data_pipeline = data_pipeline + self.task = task + self.analyzer = analyzer or ResultAnalyzer() + self.output_config = output_config or {} + self.verbose = verbose + self.results: Optional[pl.DataFrame] = None + + def run( + self, + engine: FactorEngine, + date_range: Dict[str, Tuple[str, str]], + ) -> pl.DataFrame: + """执行完整训练流程 + + Args: + engine: FactorEngine 实例 + date_range: 日期范围字典 + { + "train": (start_date, end_date), + "val": (start_date, end_date), + "test": (start_date, end_date), + } + + Returns: + 训练结果数据框 + """ + if self.verbose: + print("\n" + "=" * 80) + print(f"开始训练: {self.task.__class__.__name__}") + print("=" * 80) + + # Step 1: 准备数据 + if self.verbose: + print("\n[Step 1/7] 准备数据...") + + data = self.data_pipeline.prepare_data( + engine=engine, + date_range=date_range, + label_name=self.task.label_name, + verbose=self.verbose, + ) + + # Step 2: 处理标签 + if self.verbose: + print("\n[Step 2/7] 处理标签...") + + data = self.task.prepare_labels(data) + + # Step 3: 训练模型 + if self.verbose: + print("\n[Step 3/7] 训练模型...") + + self.task.fit(data["train"], data["val"]) + + # Step 4: 绘制训练指标 + if self.verbose: + print("\n[Step 4/7] 绘制训练指标...") + + self.task.plot_training_metrics() + + # Step 5: 生成预测 + if self.verbose: + print("\n[Step 5/7] 生成预测...") + + predictions = self.task.predict(data["test"]) + + # Step 6: 分析结果 + if self.verbose: + print("\n[Step 6/7] 分析结果...") + + # 特征重要性 + self.analyzer.analyze_feature_importance( + model=self.task.get_model(), + feature_cols=data["test"]["feature_cols"], + top_n=20, + verbose=self.verbose, + ) + + # NDCG 评估(排序任务特有) + if hasattr(self.task, 'evaluate_ndcg'): + ndcg_scores = 
self.task.evaluate_ndcg(data["test"]) + if self.verbose: + print("\nNDCG 评估结果:") + for metric, score in ndcg_scores.items(): + print(f" {metric}: {score:.4f}") + + # 组装结果 + self.results = self.analyzer.assemble_results( + test_data=data["test"], + predictions=predictions, + top_n=self.output_config.get("top_n", 50), + verbose=self.verbose, + ) + + # Step 7: 保存结果 + if self.verbose: + print("\n[Step 7/7] 保存结果...") + + if self.output_config.get("save_predictions", True): + self._save_predictions() + + if self.output_config.get("save_model", False): + self._save_model() + + if self.verbose: + print("\n" + "=" * 80) + print("训练完成!") + print("=" * 80) + + return self.results + + def _save_predictions(self) -> None: + """保存预测结果""" + output_dir = self.output_config.get("output_dir", "experiment/output") + output_filename = self.output_config.get("output_filename", "output.csv") + output_path = os.path.join(output_dir, output_filename) + + self.analyzer.save_results( + results=self.results, + output_path=output_path, + verbose=self.verbose, + ) + + def _save_model(self) -> None: + """保存模型""" + model_save_path = self.output_config.get("model_save_path") + if not model_save_path: + return + + # 确保目录存在 + os.makedirs(os.path.dirname(model_save_path), exist_ok=True) + + # 获取模型和相关信息 + model = self.task.get_model() + + # 保存模型 + model.save(model_save_path) + + if self.verbose: + print(f" 模型保存路径: {model_save_path}") + + def get_results(self) -> Optional[pl.DataFrame]: + """获取训练结果 + + Returns: + 训练结果数据框,如果尚未训练则返回 None + """ + return self.results + + def get_task(self) -> BaseTask: + """获取任务实例 + + Returns: + 任务实例 + """ + return self.task +``` + +**Step 2: Update __init__.py to export new components** + +Add to `src/training/__init__.py`: + +```python +# 新增导出(模块化 Trainer 组件) +from src.training.factor_manager import FactorManager +from src.training.pipeline import DataPipeline +from src.training.result_analyzer import ResultAnalyzer +from src.training.tasks import RegressionTask, RankTask +# 可以选择性地导出新的 Trainer,或者保持原有 Trainer 不变 +# from src.training.core.trainer_new import Trainer as ModularTrainer + +__all__ = [ + # 原有导出 + "Trainer", + "DateSplitter", + "StockPoolManager", + "check_data_quality", + "STFilter", + "Winsorizer", + "NullFiller", + "StandardScaler", + "CrossSectionalStandardScaler", + "TrainingConfig", + # 新增导出 + "FactorManager", + "DataPipeline", + "ResultAnalyzer", + "RegressionTask", + "RankTask", +] +``` + +**Step 3: Run basic import tests** + +```bash +uv run python -c "from src.training import FactorManager, DataPipeline, RegressionTask, RankTask, ResultAnalyzer; print('All imports successful')" +``` + +Expected: All imports successful + +**Step 4: Commit** + +```bash +git add src/training/core/trainer_new.py +git add src/training/__init__.py +git add src/training/factor_manager.py +git add src/training/pipeline.py +git add src/training/result_analyzer.py +git add src/training/tasks/ +git commit -m "feat(training): add modular Trainer architecture + +- Add FactorManager for unified factor management +- Add DataPipeline for complete data processing workflow +- Add Task strategy components (RegressionTask, RankTask) +- Add ResultAnalyzer for post-training analysis +- Add new Trainer as orchestration engine +- Update __init__.py exports" +``` + +--- + +## Task 8: 重写 regression.py 使用新架构 + +**Files:** +- Create: `src/experiment/regression_v2.py` (新实现) +- Keep: `src/experiment/regression.py` (原文件保留,添加注释说明已迁移) + +**Step 1: Create new regression.py with new architecture** + +```python +# %% md +# # 
LightGBM 回归训练流程(模块化版本) +# +# 使用新的模块化 Trainer 架构 +# %% md +# ## 1. 导入依赖 +# %% +from src.training import ( + Trainer, + DataPipeline, + FactorManager, + RegressionTask, + NullFiller, + Winsorizer, + StandardScaler, +) +from src.training.components.filters import STFilter +from src.experiment.common import ( + create_training_config, + create_regression_config, + FactorEngine, +) + +# %% md +# ## 2. 配置参数 +# %% +# 创建统一配置 +training_config = create_training_config() +model_config = create_regression_config() + +print("训练配置:") +print(f" 训练期: {training_config.train_start} - {training_config.train_end}") +print(f" 验证期: {training_config.val_start} - {training_config.val_end}") +print(f" 测试期: {training_config.test_start} - {training_config.test_end}") +print(f" 特征数: {len(training_config.selected_factors)}") +print(f" Label: {model_config.label_name}") + +# %% md +# ## 3. 创建组件 +# %% +# 1. 创建 FactorEngine +engine = FactorEngine() + +# 2. 创建 FactorManager +factor_manager = FactorManager( + selected_factors=training_config.selected_factors, + factor_definitions=training_config.factor_definitions, + label_factor=training_config.label_factor, + excluded_factors=training_config.excluded_factors, +) + +# 3. 创建 DataPipeline +processors = [ + NullFiller(strategy="mean"), + Winsorizer(lower=0.01, upper=0.99), + StandardScaler(), +] + +filters = [STFilter(data_router=engine.router)] if training_config.st_filter_enabled else [] + +pipeline = DataPipeline( + factor_manager=factor_manager, + processors=processors, + filters=filters, + stock_pool_filter_func=training_config.stock_pool_filter, + stock_pool_required_columns=training_config.stock_pool_required_columns, +) + +# 4. 创建 Task +task = RegressionTask( + model_params=model_config.model_params, + label_name=model_config.label_name, +) + +# 5. 创建 Trainer +output_config = { + "output_dir": training_config.output_dir, + "output_filename": "regression_output.csv", + "save_predictions": training_config.save_predictions, + "save_model": training_config.save_model, + "model_save_path": f"{training_config.output_dir}/regression_model.txt", + "top_n": training_config.top_n, +} + +trainer = Trainer( + data_pipeline=pipeline, + task=task, + output_config=output_config, + verbose=True, +) + +# %% md +# ## 4. 执行训练 +# %% +results = trainer.run( + engine=engine, + date_range=training_config.date_range, +) + +# %% md +# ## 5. 
额外分析(可选) +# %% +# 获取模型进行进一步分析 +model = task.get_model() + +# 可以在这里添加自定义可视化 +print("\n训练完成!") +print(f"结果保存路径: {output_config['output_dir']}/regression_output.csv") +``` + +**Step 2: Add deprecation notice to old regression.py** + +在原有 `regression.py` 文件顶部添加: + +```python +# 注意:此文件已迁移到 regression_v2.py +# 新文件使用模块化 Trainer 架构 +# 此文件保留用于参考和对比 +``` + +**Step 3: Test new regression script** + +```bash +# 注意:这会实际运行训练,可能需要较长时间 +# 建议先用小数据测试 +uv run python src/experiment/regression_v2.py +``` + +**Step 4: Commit** + +```bash +git add src/experiment/regression_v2.py +git add src/experiment/regression.py # 已添加弃用注释 +git commit -m "feat(experiment): add modular regression training script + +- Create regression_v2.py using new modular Trainer architecture +- Reduce code from 640 lines to ~80 lines +- Add deprecation notice to old regression.py +- All functionality preserved" +``` + +--- + +## Task 9: 重写 learn_to_rank.py 使用新架构 + +**Files:** +- Create: `src/experiment/learn_to_rank_v2.py` (新实现) +- Keep: `src/experiment/learn_to_rank.py` (原文件保留,添加注释说明已迁移) + +**Step 1: Create new learn_to_rank.py with new architecture** + +```python +# %% md +# # LightGBM LambdaRank 排序学习训练流程(模块化版本) +# +# 使用新的模块化 Trainer 架构 +# %% md +# ## 1. 导入依赖 +# %% +from src.training import ( + Trainer, + DataPipeline, + FactorManager, + RankTask, + NullFiller, + Winsorizer, + CrossSectionalStandardScaler, +) +from src.training.components.filters import STFilter +from src.experiment.common import ( + create_training_config, + create_rank_config, + FactorEngine, +) + +# %% md +# ## 2. 配置参数 +# %% +# 创建统一配置 +training_config = create_training_config() +model_config = create_rank_config() + +print("训练配置:") +print(f" 训练期: {training_config.train_start} - {training_config.train_end}") +print(f" 验证期: {training_config.val_start} - {training_config.val_end}") +print(f" 测试期: {training_config.test_start} - {training_config.test_end}") +print(f" 特征数: {len(training_config.selected_factors)}") +print(f" Label: {model_config.label_name}") +print(f" 分位数: {model_config.n_quantiles}") + +# %% md +# ## 3. 创建组件 +# %% +# 1. 创建 FactorEngine +engine = FactorEngine() + +# 2. 创建 FactorManager +factor_manager = FactorManager( + selected_factors=training_config.selected_factors, + factor_definitions=training_config.factor_definitions, + label_factor=training_config.label_factor, + excluded_factors=training_config.excluded_factors, +) + +# 3. 创建 DataPipeline(使用截面标准化) +processors = [ + NullFiller(strategy="mean"), + Winsorizer(lower=0.01, upper=0.99), + CrossSectionalStandardScaler(), +] + +filters = [STFilter(data_router=engine.router)] if training_config.st_filter_enabled else [] + +pipeline = DataPipeline( + factor_manager=factor_manager, + processors=processors, + filters=filters, + stock_pool_filter_func=training_config.stock_pool_filter, + stock_pool_required_columns=training_config.stock_pool_required_columns, +) + +# 4. 创建 Task(排序学习特有 n_quantiles) +task = RankTask( + model_params=model_config.model_params, + label_name=model_config.label_name, + n_quantiles=model_config.n_quantiles, +) + +# 5. 创建 Trainer +output_config = { + "output_dir": training_config.output_dir, + "output_filename": "rank_output.csv", + "save_predictions": training_config.save_predictions, + "save_model": training_config.save_model, + "model_save_path": f"{training_config.output_dir}/rank_model.txt", + "top_n": training_config.top_n, +} + +trainer = Trainer( + data_pipeline=pipeline, + task=task, + output_config=output_config, + verbose=True, +) + +# %% md +# ## 4. 
执行训练 +# %% +results = trainer.run( + engine=engine, + date_range=training_config.date_range, +) + +# %% md +# ## 5. 额外分析(NDCG) +# %% +# NDCG 评估已在 Trainer.run() 中自动执行 +# 可以在这里添加额外的可视化 + +print("\n训练完成!") +print(f"结果保存路径: {output_config['output_dir']}/rank_output.csv") +``` + +**Step 2: Add deprecation notice to old learn_to_rank.py** + +在原有 `learn_to_rank.py` 文件顶部添加: + +```python +# 注意:此文件已迁移到 learn_to_rank_v2.py +# 新文件使用模块化 Trainer 架构 +# 此文件保留用于参考和对比 +``` + +**Step 3: Test new learn_to_rank script** + +```bash +# 注意:这会实际运行训练 +uv run python src/experiment/learn_to_rank_v2.py +``` + +**Step 4: Commit** + +```bash +git add src/experiment/learn_to_rank_v2.py +git add src/experiment/learn_to_rank.py # 已添加弃用注释 +git commit -m "feat(experiment): add modular learn-to-rank training script + +- Create learn_to_rank_v2.py using new modular Trainer architecture +- Reduce code from 876 lines to ~80 lines +- Add deprecation notice to old learn_to_rank.py +- All functionality preserved including NDCG evaluation" +``` + +--- + +## Task 10: 验证和对比 + +**Files:** +- Test both implementations + +**Step 1: Compare outputs** + +```bash +# 运行旧版本(如果数据已存在,可以直接比较输出) +# 注意:这会运行实际训练,需要较长时间 + +# 运行新版本 +uv run python src/experiment/regression_v2.py 2>&1 | tee regression_v2.log +uv run python src/experiment/learn_to_rank_v2.py 2>&1 | tee rank_v2.log + +# 检查输出文件 +ls -lh experiment/output/ +# 应该生成 regression_output.csv 和 rank_output.csv +``` + +**Step 2: Validate feature importance output** + +确保特征重要性分析输出格式正确: +- Top 20 特征列表 +- 零贡献特征列表 +- 统计摘要 + +**Step 3: Validate NDCG evaluation (learn_to_rank)** + +确保 NDCG@k 评估正确执行: +- ndcg@1, ndcg@5, ndcg@10, ndcg@20 都计算 + +**Step 4: Code statistics** + +```bash +# 对比代码行数 +echo "=== Old implementation ===" +wc -l src/experiment/regression.py src/experiment/learn_to_rank.py + +echo "=== New implementation ===" +wc -l src/experiment/regression_v2.py src/experiment/learn_to_rank_v2.py + +echo "=== New components ===" +wc -l src/training/factor_manager.py src/training/pipeline.py src/training/result_analyzer.py +find src/training/tasks -name "*.py" -exec wc -l {} + +``` + +Expected: +- Old: ~640 + ~876 = ~1516 lines +- New: ~80 + ~80 = ~160 lines +- New components: ~500-800 lines (reusable) + +**Step 5: Commit final changes** + +```bash +git add -A +git commit -m "refactor(training): complete modular Trainer architecture + +- Implement FactorManager, DataPipeline, Task strategies, ResultAnalyzer +- Rewrite regression.py (640 -> 80 lines) +- Rewrite learn_to_rank.py (876 -> 80 lines) +- Preserve all functionality: + * Factor management (metadata, DSL, label, exclusion) + * Data filtering (STFilter, stock_pool_filter) + * Data preprocessing (NullFiller, Winsorizer, Scaler) + * Model training with early stopping + * Feature importance analysis + * NDCG evaluation for ranking + * Result saving (predictions, model) +- Add comprehensive tests for all components +- Code reduction: 94% less duplication in experiment scripts" +``` + +--- + +## Summary + +### 代码结构变化 + +``` +Before: +├── src/experiment/regression.py (640 lines) - 独立完整实现 +├── src/experiment/learn_to_rank.py (876 lines) - 独立完整实现 +└── 重复代码: 80%+ + +After: +├── src/experiment/regression_v2.py (80 lines) - 配置+运行 +├── src/experiment/learn_to_rank_v2.py (80 lines) - 配置+运行 +├── src/training/factor_manager.py - 因子管理(可复用) +├── src/training/pipeline.py - 数据流水线(可复用) +├── src/training/tasks/ +│ ├── base.py - 任务接口 +│ ├── regression_task.py - 回归任务 +│ └── rank_task.py - 排序任务 +├── src/training/result_analyzer.py - 结果分析(可复用) +└── 
src/training/core/trainer_new.py - 调度引擎 +``` + +### 新增训练类型的工作量 + +添加**分类任务**: +1. 创建 `ClassificationTask` 类(继承 BaseTask,实现3个方法) +2. 在实验脚本中使用(80行,与回归/排序类似) + +无需复制任何数据流程代码! + +### 测试覆盖 + +- FactorManager: ✓ +- DataPipeline: ✓ +- Tasks: ✓ +- ResultAnalyzer: ✓ + +--- + +## 后续可选优化 + +1. **完全移除旧文件**:验证新文件工作正常后,可以删除 regression.py 和 learn_to_rank.py,将 v2 文件重命名 +2. **添加更多测试**:集成测试、端到端测试 +3. **文档更新**:更新 README,添加新架构使用说明 +4. **配置优化**:支持从 YAML/JSON 文件加载配置 diff --git a/src/experiment/learn_to_rank.py b/src/experiment/learn_to_rank.py index 32ba724..f97ad5b 100644 --- a/src/experiment/learn_to_rank.py +++ b/src/experiment/learn_to_rank.py @@ -1,14 +1,7 @@ # %% md -# # Learn-to-Rank 排序学习训练流程 -# # -# 本 Notebook 实现基于 LightGBM LambdaRank 的排序学习训练,用于股票排序任务。 -# # -# ## 核心特点 -# # -# 1. **Label 转换**: 将 `future_return_5` 按每日进行 20 分位数划分(qcut) -# 2. **排序学习**: 使用 LambdaRank 目标函数,学习每日股票排序 -# 3. **NDCG 评估**: 使用 NDCG@1/5/10/20 评估排序质量 -# 4. **策略回测**: 基于排序分数构建 Top-k 选股策略 +# # LightGBM LambdaRank 排序学习训练流程(模块化版本) +# +# 使用新的模块化 Trainer 架构,代码更简洁、可维护性更高。 # %% md # ## 1. 导入依赖 # %% @@ -20,30 +13,22 @@ import numpy as np import polars as pl import pandas as pd import matplotlib.pyplot as plt -from sklearn.metrics import ndcg_score from src.factors import FactorEngine from src.training import ( - DateSplitter, - STFilter, - StockPoolManager, - Trainer, - Winsorizer, + FactorManager, + DataPipeline, + RankTask, NullFiller, - StandardScaler, - check_data_quality, + Winsorizer, CrossSectionalStandardScaler, ) -from src.training.components.models import LightGBMLambdaRankModel -from src.training.config import TrainingConfig - -# 从 common 模块导入共用配置和函数 +from src.training.trainer_v2 import Trainer +from src.training.components.filters import STFilter from src.experiment.common import ( SELECTED_FACTORS, FACTOR_DEFINITIONS, get_label_factor, - register_factors, - prepare_data, TRAIN_START, TRAIN_END, VAL_START, @@ -63,814 +48,292 @@ from src.experiment.common import ( # 训练类型标识 TRAINING_TYPE = "rank" - # %% md -# ## 2. 本地辅助函数 +# ## 2. 
训练特定配置 # %% -# 注意:register_factors 和 prepare_data 已从 common 模块导入 - - -def prepare_ranking_data( - df: pl.DataFrame, - label_col: str = "future_return_5", - date_col: str = "trade_date", - n_quantiles: int = 20, -) -> Tuple[pl.DataFrame, str]: - """准备排序学习数据 - - 将连续 label 转换为分位数标签,用于排序学习任务。 - - Args: - df: 原始数据 - label_col: 原始标签列名 - date_col: 日期列名 - n_quantiles: 分位数数量 - - Returns: - (处理后的 DataFrame, 新的标签列名) - """ - print("\n" + "=" * 80) - print(f"准备排序学习数据(将 {label_col} 转换为 {n_quantiles} 分位数标签)") - print("=" * 80) - - # 新的标签列名 - rank_col = f"{label_col}_rank" - - # 按日期分组进行分位数划分 - # 使用 rank 生成 0, 1, 2, ..., n_quantiles-1 的标签 - # 方法: 计算每天内的排名,然后映射到 n_quantiles 个分位数组 - df_ranked = ( - df.with_columns( - # 计算每天内的排名 (1-based) - pl.col(label_col).rank(method="min").over(date_col).alias("_rank") - ) - .with_columns( - # 将排名转换为分位数标签 (0 to n_quantiles-1) - ((pl.col("_rank") - 1) / pl.len().over(date_col) * n_quantiles) - .floor() - .cast(pl.Int64) - .clip(0, n_quantiles - 1) - .alias(rank_col) - ) - .drop("_rank") - ) - - # 检查转换结果 - print(f"\n原始 {label_col} 统计:") - print(df_ranked[label_col].describe()) - - print(f"\n转换后 {rank_col} 统计:") - print(df_ranked[rank_col].describe()) - - # 检查每日样本分布 - print(f"\n每日样本数统计:") - daily_counts = df_ranked.group_by(date_col).agg(pl.count().alias("count")) - print(daily_counts["count"].describe()) - - # 检查分位数分布(应该是均匀的) - print(f"\n分位数标签分布:") - rank_dist = df_ranked[rank_col].value_counts().sort(rank_col) - print(rank_dist) - - return df_ranked, rank_col - - -def compute_group_array(df: pl.DataFrame, date_col: str = "trade_date") -> np.ndarray: - """计算 group 数组用于 LambdaRank - - 每个日期作为一个 query,group 数组表示每个 query 的样本数。 - - Args: - df: 数据框 - date_col: 日期列名 - - Returns: - group 数组 - """ - group_counts = df.group_by(date_col, maintain_order=True).agg( - pl.count().alias("count") - ) - return group_counts["count"].to_numpy() - - -def evaluate_ndcg_at_k( - y_true: np.ndarray, - y_pred: np.ndarray, - group: np.ndarray, - k_list: List[int] = [1, 5, 10, 20], -) -> dict: - """计算 NDCG@k 指标 - - Args: - y_true: 真实标签 - y_pred: 预测分数 - group: 分组数组 - k_list: 要计算的 k 值列表 - - Returns: - NDCG 指标字典 - """ - results = {} - - # 按 group 拆分数据 - start_idx = 0 - y_true_groups = [] - y_pred_groups = [] - - for group_size in group: - end_idx = start_idx + group_size - y_true_groups.append(y_true[start_idx:end_idx]) - y_pred_groups.append(y_pred[start_idx:end_idx]) - start_idx = end_idx - - # 计算每个 k 值的平均 NDCG - for k in k_list: - ndcg_scores = [] - for yt, yp in zip(y_true_groups, y_pred_groups): - if len(yt) > 1: - try: - score = ndcg_score([yt], [yp], k=k) - ndcg_scores.append(score) - except ValueError: - # 标签都相同,无法计算 - pass - - results[f"ndcg@{k}"] = np.mean(ndcg_scores) if ndcg_scores else 0.0 - - return results - - -# %% md -# ## 3. 
配置参数 -# # -# ### 3.1 因子与日期配置 -# %% -# 注意:SELECTED_FACTORS, FACTOR_DEFINITIONS, 日期配置等已从 common 模块导入 -# 本脚本特有的配置: - -# Label 名称(排序学习使用原始收益率,会后续转换为分位数标签) +# Label 配置 LABEL_NAME = "future_return_5" - -# 获取 Label 因子定义 LABEL_FACTOR = get_label_factor(LABEL_NAME) # 分位数配置 -N_QUANTILES = 20 # 将 label 分为 20 组 +N_QUANTILES = 20 - -# 分位数配置 -N_QUANTILES = 20 # 将 label 分为 20 组 +# 排除的因子列表 +EXCLUDED_FACTORS = [ + "volatility_5", + "volume_ratio_5_20", + "capital_retention_20", + "volatility_squeeze_5_60", + "drawdown_from_high_60", + "ma_ratio_5_20", + "bias_10", + "high_low_ratio", + "bbi_ratio", + "volatility_20", + "std_return_20", + "sharpe_ratio_20", + "ma_5", + "max_ret_20", + "CP", + "net_profit_yoy", + "debt_to_equity", + "EP_rank", + "turnover_rank", + "return_5_rank", + "ebit_rank", + "BP", + "EP", + "amihud_illiq_20", + "profit_margin", + "return_5", + "return_20", + "kaufman_ER_20", + "GTJA_alpha043", + "GTJA_alpha042", + "GTJA_alpha041", + "GTJA_alpha040", + "GTJA_alpha039", + "GTJA_alpha037", + "GTJA_alpha036", + "GTJA_alpha035", + "GTJA_alpha033", + "GTJA_alpha032", + "GTJA_alpha031", + "GTJA_alpha028", + "GTJA_alpha026", + "GTJA_alpha027", + "GTJA_alpha023", + "GTJA_alpha024", + "GTJA_alpha009", + "GTJA_alpha011", + "GTJA_alpha022", + "GTJA_alpha020", + "GTJA_alpha018", + "GTJA_alpha019", + "GTJA_alpha014", + "GTJA_alpha013", + "GTJA_alpha010", + "GTJA_alpha001", + "GTJA_alpha003", + "GTJA_alpha002", + "GTJA_alpha004", + "GTJA_alpha005", + "GTJA_alpha006", + "GTJA_alpha008", + "turnover_deviation", + "turnover_cv_20", + "roa", + "GTJA_alpha073", + "GTJA_alpha078", + "GTJA_alpha077", + "GTJA_alpha076", + "GTJA_alpha067", + "GTJA_alpha085", + "GTJA_alpha084", + "GTJA_alpha087", + "GTJA_alpha088", + "GTJA_alpha090", + "GTJA_alpha083", + "GTJA_alpha079", + "GTJA_alpha080", + "GTJA_alpha094", + "GTJA_alpha092", + "GTJA_alpha089", + "GTJA_alpha095", + "GTJA_alpha064", + "GTJA_alpha065", + "GTJA_alpha066", + "GTJA_alpha063", + "GTJA_alpha060", + "GTJA_alpha058", + "GTJA_alpha057", + "GTJA_alpha056", + "GTJA_alpha046", + "GTJA_alpha044", + "GTJA_alpha049", + "GTJA_alpha050", + "GTJA_alpha110", + "GTJA_alpha107", + "GTJA_alpha104", + "GTJA_alpha106", + "GTJA_alpha103", + "GTJA_alpha100", + "GTJA_alpha101", + "GTJA_alpha102", + "GTJA_alpha098", + "GTJA_alpha097", + "GTJA_alpha096", + "GTJA_alpha099", + "GTJA_alpha117", + "GTJA_alpha118", + "GTJA_alpha114", + "GTJA_alpha111", + "GTJA_alpha129", + "GTJA_alpha130", + "GTJA_alpha132", + "GTJA_alpha131", + "GTJA_alpha134", + "GTJA_alpha135", + "GTJA_alpha136", + "GTJA_alpha112", + "GTJA_alpha120", + "GTJA_alpha119", + "GTJA_alpha122", + "GTJA_alpha124", + "GTJA_alpha126", + "GTJA_alpha127", + "GTJA_alpha128", + "GTJA_alpha115", + "GTJA_alpha153", + "GTJA_alpha152", + "GTJA_alpha151", + "GTJA_alpha150", + "GTJA_alpha148", + "GTJA_alpha142", + "GTJA_alpha141", + "GTJA_alpha139", + "GTJA_alpha133", + "GTJA_alpha161", + "GTJA_alpha164", + "GTJA_alpha162", + "GTJA_alpha157", + "GTJA_alpha156", + "GTJA_alpha160", + "GTJA_alpha155", + "GTJA_alpha170", + "GTJA_alpha169", + "GTJA_alpha168", + "GTJA_alpha166", + "GTJA_alpha163", + "GTJA_alpha176", + "GTJA_alpha175", + "GTJA_alpha174", + "GTJA_alpha178", + "GTJA_alpha177", + "GTJA_alpha185", + "GTJA_alpha180", + "GTJA_alpha187", + "GTJA_alpha188", + "GTJA_alpha189", + "GTJA_alpha191", +] # LambdaRank 模型参数配置 -# MODEL_PARAMS = { -# "objective": "lambdarank", -# "metric": "ndcg", -# "ndcg_at": 15, -# "learning_rate": 0.001, -# "num_leaves": 32, -# "max_depth": 5, -# "min_data_in_leaf": 32, -# "n_estimators": 1000, -# 
"early_stopping_round": 150, -# "subsample": 0.6, -# "colsample_bytree": 0.6, -# "reg_alpha": 1, -# "reg_lambda": 3.0, -# "verbose": -1, -# "random_state": 42, -# "lambdarank_truncation_level": 30, -# "label_gain": [ -# i for i in range(1, N_QUANTILES + 1) -# ], # 如果收益率被分为了比如 5 档,建议用[0, 1, 3, 7, 15] 这种指数型 gain -# } - MODEL_PARAMS = { "objective": "lambdarank", "metric": "ndcg", - "ndcg_at": 25, # 根据你实际持仓数量调整,如果是前50只股票,改成50 - "learning_rate": 0.1, # 【修改】提高学习率,配合合理的早停 + "ndcg_at": 25, + "learning_rate": 0.1, "n_estimators": 1000, - "early_stopping_round": 50, # 【修改】验证集一旦不降,50轮内尽早停下 - # --- 1. 防止过拟合的核心约束 --- - "max_depth": 4, # 【修改】金融数据不需要太深,3~4 足够了 - "num_leaves": 32, # 【修改】大幅减少叶子数,避免过拟合 (2^4 = 16,取12限制生长) - "min_data_in_leaf": 256, # 【修改】极度重要!强制每个叶子必须代表一个大的股票群体(如500只) - # --- 2. 随机采样(增加鲁棒性) --- - "subsample": 0.4, # 每棵树使用 70% 的样本 - "subsample_freq": 1, # 每 1 轮进行一次 subsample - "colsample_bytree": 0.4, # 【修改】降低特征采样率,迫使模型不要只依赖那几个头部 Alpha 因子,增加树的多样性 - # --- 3. 正则化惩罚 --- - "reg_alpha": 10.0, # 【修改】增加 L1 正则化,帮助剔除无效的 GTJA 噪音因子 - "reg_lambda": 50.0, # 【修改】增加 L2 正则化 - # --- 4. Lambdarank 专属配置 --- + "early_stopping_round": 50, + # 防止过拟合的核心约束 + "max_depth": 4, + "num_leaves": 32, + "min_data_in_leaf": 256, + # 随机采样(增加鲁棒性) + "subsample": 0.4, + "subsample_freq": 1, + "colsample_bytree": 0.4, + # 正则化惩罚 + "reg_alpha": 10.0, + "reg_lambda": 50.0, + # Lambdarank 专属配置 "lambdarank_truncation_level": 50, - "label_gain": [ - i * i for i in range(1, N_QUANTILES + 1) - ], # 如果收益率被分为了比如 5 档,建议用[0, 1, 3, 7, 15] 这种指数型 gain + "label_gain": [i * i for i in range(1, N_QUANTILES + 1)], "verbose": -1, "random_state": 42, } -# 注意:stock_pool_filter, STOCK_FILTER_REQUIRED_COLUMNS, OUTPUT_DIR 等配置 -# 已从 common 模块导入 -# %% md -# ## 4. 训练流程 -# %% -print("\n" + "=" * 80) -print("LightGBM LambdaRank 排序学习训练") -print("=" * 80) +# 日期范围配置 +date_range = { + "train": (TRAIN_START, TRAIN_END), + "val": (VAL_START, VAL_END), + "test": (TEST_START, TEST_END), +} -# 1. 
创建 FactorEngine(启用 metadata 功能) -print("\n[1] 创建 FactorEngine") -engine = FactorEngine() +# 输出配置 +output_config = { + "output_dir": OUTPUT_DIR, + "output_filename": "rank_output.csv", + "save_predictions": SAVE_PREDICTIONS, + "save_model": SAVE_MODEL, + "model_save_path": get_model_save_path(TRAINING_TYPE), + "top_n": TOP_N, +} -EXCLUDED_FACTORS = ['volatility_5', -'volume_ratio_5_20', -'capital_retention_20', -'volatility_squeeze_5_60', -'drawdown_from_high_60', -'ma_ratio_5_20', -'bias_10', -'high_low_ratio', -'bbi_ratio', -'volatility_20', -'std_return_20', -'sharpe_ratio_20', -'ma_5', -'max_ret_20', -'CP', -'net_profit_yoy', -'debt_to_equity', -'EP_rank', -'turnover_rank', -'return_5_rank', -'ebit_rank', -'BP', -'EP', -'amihud_illiq_20', -'profit_margin', -'return_5', -'return_20', -'kaufman_ER_20', -'GTJA_alpha043', -'GTJA_alpha042', -'GTJA_alpha041', -'GTJA_alpha040', -'GTJA_alpha039', -'GTJA_alpha037', -'GTJA_alpha036', -'GTJA_alpha035', -'GTJA_alpha033', -'GTJA_alpha032', -'GTJA_alpha031', -'GTJA_alpha028', -'GTJA_alpha026', -'GTJA_alpha027', -'GTJA_alpha023', -'GTJA_alpha024', -'GTJA_alpha009', -'GTJA_alpha011', -'GTJA_alpha022', -'GTJA_alpha020', -'GTJA_alpha018', -'GTJA_alpha019', -'GTJA_alpha014', -'GTJA_alpha013', -'GTJA_alpha010', -'GTJA_alpha001', -'GTJA_alpha003', -'GTJA_alpha002', -'GTJA_alpha004', -'GTJA_alpha005', -'GTJA_alpha006', -'GTJA_alpha008', -'turnover_deviation', -'turnover_cv_20', -'roa', -'GTJA_alpha073', -'GTJA_alpha078', -'GTJA_alpha077', -'GTJA_alpha076', -'GTJA_alpha067', -'GTJA_alpha085', -'GTJA_alpha084', -'GTJA_alpha087', -'GTJA_alpha088', -'GTJA_alpha090', -'GTJA_alpha083', -'GTJA_alpha079', -'GTJA_alpha080', -'GTJA_alpha094', -'GTJA_alpha092', -'GTJA_alpha089', -'GTJA_alpha095', -'GTJA_alpha064', -'GTJA_alpha065', -'GTJA_alpha066', -'GTJA_alpha063', -'GTJA_alpha060', -'GTJA_alpha058', -'GTJA_alpha057', -'GTJA_alpha056', -'GTJA_alpha046', -'GTJA_alpha044', -'GTJA_alpha049', -'GTJA_alpha050', -'GTJA_alpha110', -'GTJA_alpha107', -'GTJA_alpha104', -'GTJA_alpha106', -'GTJA_alpha103', -'GTJA_alpha100', -'GTJA_alpha101', -'GTJA_alpha102', -'GTJA_alpha098', -'GTJA_alpha097', -'GTJA_alpha096', -'GTJA_alpha099', -'GTJA_alpha117', -'GTJA_alpha118', -'GTJA_alpha114', -'GTJA_alpha111', -'GTJA_alpha129', -'GTJA_alpha130', -'GTJA_alpha132', -'GTJA_alpha131', -'GTJA_alpha134', -'GTJA_alpha135', -'GTJA_alpha136', -'GTJA_alpha112', -'GTJA_alpha120', -'GTJA_alpha119', -'GTJA_alpha122', -'GTJA_alpha124', -'GTJA_alpha126', -'GTJA_alpha127', -'GTJA_alpha128', -'GTJA_alpha115', -'GTJA_alpha153', -'GTJA_alpha152', -'GTJA_alpha151', -'GTJA_alpha150', -'GTJA_alpha148', -'GTJA_alpha142', -'GTJA_alpha141', -'GTJA_alpha139', -'GTJA_alpha133', -'GTJA_alpha161', -'GTJA_alpha164', -'GTJA_alpha162', -'GTJA_alpha157', -'GTJA_alpha156', -'GTJA_alpha160', -'GTJA_alpha155', -'GTJA_alpha170', -'GTJA_alpha169', -'GTJA_alpha168', -'GTJA_alpha166', -'GTJA_alpha163', -'GTJA_alpha176', -'GTJA_alpha175', -'GTJA_alpha174', -'GTJA_alpha178', -'GTJA_alpha177', -'GTJA_alpha185', -'GTJA_alpha180', -'GTJA_alpha187', -'GTJA_alpha188', -'GTJA_alpha189', -'GTJA_alpha191',] -# 2. 使用 metadata 定义因子 -print("\n[2] 定义因子(从 metadata 注册)") -feature_cols = register_factors( - engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR, EXCLUDED_FACTORS -) - -# 3. 准备数据 -print("\n[3] 准备数据") -data = prepare_data( - engine=engine, - feature_cols=feature_cols, - start_date=TRAIN_START, - end_date=TEST_END, - label_name=LABEL_NAME, -) - -# 4. 
转换为排序学习格式(分位数标签) -print("\n[4] 转换为排序学习格式") -data, target_col = prepare_ranking_data( - df=data, - label_col=LABEL_NAME, - n_quantiles=N_QUANTILES, -) - -# 5. 打印配置信息 -print(f"\n[配置] 训练期: {TRAIN_START} - {TRAIN_END}") -print(f"[配置] 验证期: {VAL_START} - {VAL_END}") -print(f"[配置] 测试期: {TEST_START} - {TEST_END}") -print(f"[配置] 特征数: {len(feature_cols)}") -print(f"[配置] 目标变量: {target_col}({N_QUANTILES}分位数)") - -# 6. 创建排序学习模型 -model = LightGBMLambdaRankModel(params=MODEL_PARAMS) - -# 7. 创建数据处理器(使用函数返回的完整特征列表) -processors = [ - NullFiller(feature_cols=feature_cols, strategy="mean"), - Winsorizer(feature_cols=feature_cols, lower=0.01, upper=0.99), - CrossSectionalStandardScaler(feature_cols=feature_cols), -] - -# 8. 创建数据划分器 -splitter = DateSplitter( - train_start=TRAIN_START, - train_end=TRAIN_END, - val_start=VAL_START, - val_end=VAL_END, - test_start=TEST_START, - test_end=TEST_END, -) - -# 9. 创建股票池管理器 -pool_manager = StockPoolManager( - filter_func=stock_pool_filter, - required_columns=STOCK_FILTER_REQUIRED_COLUMNS, - data_router=engine.router, -) - -# 10. 创建 ST 过滤器 -st_filter = STFilter(data_router=engine.router) - -# 11. 创建训练器(禁用自动保存,我们将在训练后手动保存以包含因子信息) -trainer = Trainer( - model=model, - pool_manager=pool_manager, - processors=processors, - filters=[st_filter], - splitter=splitter, - target_col=target_col, - feature_cols=feature_cols, - persist_model=False, # 禁用自动保存,手动保存以包含因子信息 -) -# %% md -# ### 4.1 股票池筛选 -# %% -print("\n" + "=" * 80) -print("股票池筛选") -print("=" * 80) - -# 先执行 ST 过滤(在股票池筛选之前,与 Trainer.train() 保持一致) -if st_filter: - print("\n[过滤] 应用 ST 过滤器...") - data = st_filter.filter(data) - print(f" ST 过滤后数据规模: {data.shape}") - -if pool_manager: - print("\n执行每日独立筛选股票池...") - filtered_data = pool_manager.filter_and_select_daily(data) - print(f" 筛选前数据规模: {data.shape}") - print(f" 筛选后数据规模: {filtered_data.shape}") - print(f" 筛选前股票数: {data['ts_code'].n_unique()}") - print(f" 筛选后股票数: {filtered_data['ts_code'].n_unique()}") - print(f" 删除记录数: {len(data) - len(filtered_data)}") -else: - filtered_data = data - print(" 未配置股票池管理器,跳过筛选") -# %% md -# ### 4.2 数据划分 -# %% -print("\n" + "=" * 80) -print("数据划分") -print("=" * 80) - -if splitter: - train_data, val_data, test_data = splitter.split(filtered_data) - print(f"\n训练集数据规模: {train_data.shape}") - print(f"验证集数据规模: {val_data.shape}") - print(f"测试集数据规模: {test_data.shape}") - - # 计算各集的 group 数组 - train_group = compute_group_array(train_data) - val_group = compute_group_array(val_data) - test_group = compute_group_array(test_data) - - print(f"\n训练集 group 数量: {len(train_group)}") - print(f"验证集 group 数量: {len(val_group)}") - print(f"测试集 group 数量: {len(test_group)}") - print(f"训练集日均样本数: {np.mean(train_group):.1f}") - print(f"验证集日均样本数: {np.mean(val_group):.1f}") - print(f"测试集日均样本数: {np.mean(test_group):.1f}") -else: - raise ValueError("必须配置数据划分器") -# %% md -# ### 4.3 数据质量检查 -# %% -print("\n" + "=" * 80) -print("数据质量检查(必须在预处理之前)") -print("=" * 80) - -print("\n检查训练集...") -check_data_quality(train_data, feature_cols, raise_on_error=False) - -print("\n检查验证集...") -check_data_quality(val_data, feature_cols, raise_on_error=True) - -print("\n检查测试集...") -check_data_quality(test_data, feature_cols, raise_on_error=True) - -print("[成功] 数据质量检查通过,未发现异常") - -# %% md -# ### 4.4 数据预处理 -# %% -print("\n" + "=" * 80) -print("数据预处理") -print("=" * 80) - -fitted_processors = [] -if processors: - print("\n训练集处理...") - for i, processor in enumerate(processors, 1): - print(f" [{i}/{len(processors)}] {processor.__class__.__name__}") - train_data = processor.fit_transform(train_data) - 
fitted_processors.append(processor) - - print("\n验证集处理...") - for processor in fitted_processors: - val_data = processor.transform(val_data) - - print("\n测试集处理...") - for processor in fitted_processors: - test_data = processor.transform(test_data) - -print(f"\n处理后训练集形状: {train_data.shape}") -print(f"处理后验证集形状: {val_data.shape}") -print(f"处理后测试集形状: {test_data.shape}") -# %% md -# ### 4.4 训练 LambdaRank 模型 -# %% -print("\n" + "=" * 80) -print("训练 LambdaRank 模型") -print("=" * 80) - -# 准备数据 -X_train = train_data.select(feature_cols) -y_train = train_data.select(target_col).to_series() - -X_val = val_data.select(feature_cols) -y_val = val_data.select(target_col).to_series() - -print(f"\n训练样本数: {len(X_train)}") -print(f"验证样本数: {len(X_val)}") -print(f"特征数: {len(feature_cols)}") -print(f"目标变量: {target_col}") - -print("\n目标变量统计(训练集):") -print(y_train.describe()) - -print("\n开始训练...") -model.fit( - X=X_train, - y=y_train, - group=train_group, - eval_set=(X_val, y_val, val_group), -) -print("训练完成!") -# %% md -# ### 4.5 训练指标曲线 -# %% -print("\n" + "=" * 80) -print("训练指标曲线") -print("=" * 80) - -# 从模型获取训练评估结果 -evals_result = model.get_evals_result() - -if evals_result is None or not evals_result: - print("[警告] 没有可用的训练指标,请确保训练时使用了 eval_set 参数") -else: - print("[成功] 已从模型获取训练评估结果") - - # 获取评估的 NDCG 指标 - ndcg_metrics = [k for k in evals_result["train"].keys() if "ndcg" in k] - print(f"\n评估的 NDCG 指标: {ndcg_metrics}") - - # 显示早停信息 - actual_rounds = len(list(evals_result["train"].values())[0]) - expected_rounds = MODEL_PARAMS.get("n_estimators", 1000) - print(f"\n[早停信息]") - print(f" 配置的最大轮数: {expected_rounds}") - print(f" 实际训练轮数: {actual_rounds}") - - best_iter = model.get_best_iteration() - if best_iter is not None and best_iter < actual_rounds: - print(f" 早停状态: 已触发(最佳迭代: {best_iter})") - else: - print(f" 早停状态: 未触发(达到最大轮数)") - - # 显示各 NDCG 指标的最终值 - print(f"\n最终 NDCG 指标:") - for metric in ndcg_metrics: - train_ndcg = evals_result["train"][metric][-1] - val_ndcg = evals_result["val"][metric][-1] - print(f" {metric}: 训练集={train_ndcg:.4f}, 验证集={val_ndcg:.4f}") - - # 使用封装好的方法绘制所有指标 - print("\n[绘图] 使用 LightGBM 原生接口绘制训练曲线...") - fig = model.plot_all_metrics(metrics=ndcg_metrics[:4], figsize=(14, 10)) - plt.show() - - print(f"\n[指标分析]") - print(f" 各NDCG指标在验证集上的最佳值:") - for metric in ndcg_metrics: - val_metric_list = evals_result["val"][metric] - best_iter_metric = val_metric_list.index(max(val_metric_list)) - best_val = max(val_metric_list) - print(f" {metric}: {best_val:.4f} (迭代 {best_iter_metric + 1})") - print(f"\n[重要提醒] 验证集仅用于早停/调参,测试集完全独立于训练过程!") -# %% md -# ### 4.6 模型评估 -# %% -print("\n" + "=" * 80) -print("模型评估") -print("=" * 80) - -# 准备测试集 -X_test = test_data.select(feature_cols) -y_test = test_data.select(target_col).to_series() - -# 预测 -print("\n生成预测...") -predictions = model.predict(X_test) - -# 添加预测列 -test_data = test_data.with_columns([pl.Series("prediction", predictions)]) - -# 计算 NDCG 指标 -print("\n计算 NDCG 指标...") -ndcg_results = evaluate_ndcg_at_k( - y_true=y_test.to_numpy(), - y_pred=predictions, - group=test_group, - k_list=[1, 5, 10, 20], -) - -print("\nNDCG 评估结果:") -print("-" * 40) -for metric, value in ndcg_results.items(): - print(f" {metric}: {value:.4f}") - - # 特征重要性 - print("\n特征重要性分析:") - print("=" * 80) - importance = model.feature_importance() - if importance is not None: - # 按重要性降序排列 - importance_sorted = importance.sort_values(ascending=False) - - # 计算总重要性和百分比 - total_importance = importance_sorted.sum() - importance_pct = (importance_sorted / total_importance * 100).round(2) - - # 找出贡献为0的特征 
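# [示意] 重构后,这段手写的特征重要性统计与零贡献特征筛查由本 diff 新增的 ResultAnalyzer 统一完成。
# 下面是等价调用的草稿写法(接口以本 diff 中 result_analyzer.py 为准;_analyzer / _fi / _zero_feats
# 均为说明用的临时变量,并非旧脚本原有代码):
from src.training.result_analyzer import ResultAnalyzer

_analyzer = ResultAnalyzer()
_fi = _analyzer.analyze_feature_importance(model, feature_cols, top_n=20, verbose=True)
_zero_feats = _fi["zero_importance_features"]  # 含义等同于下文旧代码中的 zero_importance_features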
- zero_importance_features = importance_sorted[ - importance_sorted == 0 - ].index.tolist() - - # 打印所有特征重要性(带百分比) - print(f"\n所有特征重要性(共 {len(importance_sorted)} 个):") - print("-" * 80) - print(f"{'排名':<6}{'特征名':<35}{'重要性':<15}{'占比':<10}") - print("-" * 80) - for i, (feature, score) in enumerate(importance_sorted.items(), 1): - pct = importance_pct[feature] - if score == 0: - marker = " [零贡献]" - elif pct >= 10: - marker = " [高贡献]" - elif pct >= 1: - marker = " [中贡献]" - else: - marker = " [低贡献]" - print(f"{i:<6}{feature:<35}{score:<15.2f}{pct:<8.2f}%{marker}") - - # 打印贡献为0的特征 - print("\n" + "=" * 80) - if zero_importance_features: - print(f"[警告] 贡献为0的特征(共 {len(zero_importance_features)} 个):") - print("-" * 80) - for i, feature in enumerate(zero_importance_features, 1): - print(f"'{feature}',") - else: - print("[信息] 所有特征都有贡献(无零贡献特征)") - - # 打印统计摘要 - print("\n" + "=" * 80) - print("特征重要性统计摘要:") - print("-" * 80) - print(f" 特征总数: {len(importance_sorted)}") - print( - f" 有贡献特征数: {len(importance_sorted) - len(zero_importance_features)}" - ) - print(f" 零贡献特征数: {len(zero_importance_features)}") - print( - f" 零贡献占比: {len(zero_importance_features) / len(importance_sorted) * 100:.1f}%" - ) - print(f" Top 10特征累计占比: {importance_pct.head(10).sum():.1f}%") - print(f" Top 20特征累计占比: {importance_pct.head(20).sum():.1f}%") -# %% -# 确保输出目录存在 -os.makedirs(OUTPUT_DIR, exist_ok=True) - -# 生成时间戳 -start_dt = datetime.strptime(TEST_START, "%Y%m%d") -end_dt = datetime.strptime(TEST_END, "%Y%m%d") -date_str = f"{start_dt.strftime('%Y%m%d')}_{end_dt.strftime('%Y%m%d')}" - -# 保存每日 Top N -print(f"\n[1/1] 保存每日 Top {TOP_N} 股票...") -topn_output_path = os.path.join(OUTPUT_DIR, "rank_output.csv") - -# 按日期分组,取每日 top N -topn_by_date = [] -unique_dates = test_data["trade_date"].unique().sort() -for date in unique_dates: - day_data = test_data.filter(test_data["trade_date"] == date) - # 按 prediction 降序排序,取前 N - topn = day_data.sort("prediction", descending=True).head(TOP_N) - topn_by_date.append(topn) - -# 合并所有日期的 top N -topn_results = pl.concat(topn_by_date) - -# 格式化日期并调整列顺序:日期、分数、股票 -topn_to_save = topn_results.select( - [ - pl.col("trade_date").str.slice(0, 4) - + "-" - + pl.col("trade_date").str.slice(4, 2) - + "-" - + pl.col("trade_date").str.slice(6, 2).alias("date"), - pl.col("prediction").alias("score"), - pl.col("ts_code"), - ] -) -topn_to_save.write_csv(topn_output_path, include_header=True) -print(f" 保存路径: {topn_output_path}") -print( - f" 保存行数: {len(topn_to_save)}({len(unique_dates)}个交易日 x 每日top{TOP_N})" -) -print(f"\n 预览(前15行):") -print(topn_to_save.head(15)) - -# 保存模型和因子信息(如果启用) -if SAVE_MODEL: +def main(): + """主函数""" print("\n" + "=" * 80) - print("保存模型和因子信息") + print("LightGBM LambdaRank 排序学习训练(模块化版本)") print("=" * 80) - model_save_path = get_model_save_path(TRAINING_TYPE) - if model_save_path: + + # 1. 创建 FactorEngine + print("\n[1] 创建 FactorEngine") + engine = FactorEngine() + + # 2. 创建 FactorManager + print("\n[2] 创建 FactorManager") + factor_manager = FactorManager( + selected_factors=SELECTED_FACTORS, + factor_definitions=FACTOR_DEFINITIONS, + label_factor=LABEL_FACTOR, + excluded_factors=EXCLUDED_FACTORS, + ) + + # 3. 
创建 DataPipeline + print("\n[3] 创建 DataPipeline") + pipeline = DataPipeline( + factor_manager=factor_manager, + processor_configs=[ + (NullFiller, {"strategy": "mean"}), + (Winsorizer, {"lower": 0.01, "upper": 0.99}), + (CrossSectionalStandardScaler, {}), + ], + filters=[STFilter(data_router=engine.router)], + stock_pool_filter_func=stock_pool_filter, + stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS, + ) + + # 4. 创建 RankTask + print("\n[4] 创建 RankTask") + task = RankTask( + model_params=MODEL_PARAMS, + label_name=LABEL_NAME, + n_quantiles=N_QUANTILES, + ) + + # 5. 创建 Trainer + print("\n[5] 创建 Trainer") + trainer = Trainer( + data_pipeline=pipeline, + task=task, + output_config=output_config, + verbose=True, + ) + + # 6. 执行训练 + print("\n[6] 执行训练") + results = trainer.run(engine=engine, date_range=date_range) + + # 7. 保存模型和因子信息(如果启用) + if SAVE_MODEL: + print("\n[7] 保存模型和因子信息") save_model_with_factors( - model=model, - model_path=model_save_path, + model=task.get_model(), + model_path=output_config["model_save_path"], selected_factors=SELECTED_FACTORS, factor_definitions=FACTOR_DEFINITIONS, - fitted_processors=fitted_processors, + fitted_processors=pipeline.get_fitted_processors(), ) -print("\n训练流程完成!") -# %% md -# ## 5. 总结 -# # -# 本 Notebook 实现了完整的 Learn-to-Rank 训练流程: -# # -# ### 核心步骤 -# # -# 1. **数据准备**: 计算 49 个特征因子,将 `future_return_5` 转换为 20 分位数标签 -# 2. **模型训练**: 使用 LightGBM LambdaRank 学习每日股票排序 -# 3. **模型评估**: 使用 NDCG@1/5/10/20 评估排序质量 -# 4. **策略分析**: 基于排序分数构建 Top-k 选股策略 -# # -# ### 关键参数 -# # -# - **Objective**: lambdarank -# - **Metric**: ndcg -# - **Learning Rate**: 0.05 -# - **Num Leaves**: 31 -# - **N Quantiles**: 20 -# # -# ### 输出结果 -# # -# - rank_output.csv: 每日Top-N推荐股票(格式:date, score, ts_code) -# - 特征重要性排名 -# - Top-k 策略统计和图表 -# - NDCG训练指标曲线 -# # -# ### 后续优化方向 -# # -# 1. **特征工程**: 尝试更多因子组合 -# 2. **超参数调优**: 使用网格搜索优化 LambdaRank 参数 -# 3. **模型集成**: 结合多个排序模型的预测 -# 4. **更复杂的分组**: 考虑按行业分组排序 -# + print("\n" + "=" * 80) + print("训练流程完成!") + print(f"结果保存路径: {os.path.join(OUTPUT_DIR, 'rank_output.csv')}") + print("=" * 80) + + return results + + +if __name__ == "__main__": + main() diff --git a/src/experiment/regression.py b/src/experiment/regression.py index 0e713fd..2dd05f6 100644 --- a/src/experiment/regression.py +++ b/src/experiment/regression.py @@ -1,4 +1,8 @@ # %% md +# # LightGBM 回归训练流程(模块化版本) +# +# 使用新的模块化 Trainer 架构,代码更简洁、可维护性更高。 +# %% md # ## 1. 导入依赖 # %% import os @@ -8,26 +12,19 @@ import polars as pl from src.factors import FactorEngine from src.training import ( - DateSplitter, - LightGBMModel, - STFilter, - StandardScaler, - StockPoolManager, - Trainer, - Winsorizer, + FactorManager, + DataPipeline, + RegressionTask, NullFiller, - check_data_quality, - CrossSectionalStandardScaler, + Winsorizer, + StandardScaler, ) -from src.training.config import TrainingConfig - -# 从 common 模块导入共用配置和函数 +from src.training.trainer_v2 import Trainer +from src.training.components.filters import STFilter from src.experiment.common import ( SELECTED_FACTORS, FACTOR_DEFINITIONS, get_label_factor, - register_factors, - prepare_data, TRAIN_START, TRAIN_END, VAL_START, @@ -47,594 +44,152 @@ from src.experiment.common import ( # 训练类型标识 TRAINING_TYPE = "regression" - # %% md -# ## 2. 配置参数 -# -# ### 2.1 标签定义 +# ## 2. 
训练特定配置 # %% -# Label 名称(回归任务使用连续收益率) +# Label 配置 LABEL_NAME = "future_return_5" - -# 获取 Label 因子定义 LABEL_FACTOR = get_label_factor(LABEL_NAME) +# 排除的因子列表 +EXCLUDED_FACTORS = [ + "GTJA_alpha010", + "GTJA_alpha005", + "GTJA_alpha036", + "GTJA_alpha027", + "GTJA_alpha044", + "GTJA_alpha073", + "GTJA_alpha104", + "GTJA_alpha103", + "GTJA_alpha105", + "GTJA_alpha092", + "GTJA_alpha087", + "GTJA_alpha085", + "GTJA_alpha062", + "GTJA_alpha124", + "GTJA_alpha133", + "GTJA_alpha131", + "GTJA_alpha117", + "GTJA_alpha157", + "GTJA_alpha162", + "GTJA_alpha177", + "GTJA_alpha180", + "GTJA_alpha191", +] + # 模型参数配置 MODEL_PARAMS = { # 基础设置 - "objective": "regression_l1", # LightGBM 中 MAE 对应的目标函数推荐写 regression_l1 + "objective": "regression_l1", "metric": "mae", - # 1. 修复树结构冲突:深度设为5,叶子数必须<=32。 - # 推荐设定为稍微小于满二叉树的数值(如 15~31),以增加树的不对称性,防止过拟合 + # 树结构约束 "max_depth": 5, - "num_leaves": 24, # 修改:从 63 降为 24 - "min_data_in_leaf": 100, # 修改:适当增大,金融数据噪音大,叶子节点数据越多越抗噪 - # 2. 学习参数 + "num_leaves": 24, + "min_data_in_leaf": 100, + # 学习参数 "learning_rate": 0.01, - "n_estimators": 1500, # 修改:配合小学习率,树可以再多一点 - # 3. 修复采样抖动:改为每棵树都重新采样 + "n_estimators": 1500, + # 随机采样 "subsample": 0.8, - "subsample_freq": 1, # 【关键修改】:从 5 改为 1。每轮都重采样,让抖动均匀化,而不是5轮来一次大抖动 + "subsample_freq": 1, "colsample_bytree": 0.8, - # 正则化(金融量化等高噪场景可适当加大) - "reg_alpha": 0.5, # 修改:适当提高L1,强迫模型只选最有效的因子 + # 正则化 + "reg_alpha": 0.5, "reg_lambda": 1.0, # 杂项 "verbose": -1, "random_state": 42, } -# %% md -# ## 4. 训练流程 -# -# ### 4.1 初始化组件 -# %% -print("\n" + "=" * 80) -print("LightGBM 回归模型训练") -print("=" * 80) -# 1. 创建 FactorEngine(启用 metadata 功能) -print("\n[1] 创建 FactorEngine") -engine = FactorEngine() +# 日期范围配置 +date_range = { + "train": (TRAIN_START, TRAIN_END), + "val": (VAL_START, VAL_END), + "test": (TEST_START, TEST_END), +} -EXCLUDED_FACTORS = [ - 'GTJA_alpha010', - 'GTJA_alpha005', - 'GTJA_alpha036', - 'GTJA_alpha027', - 'GTJA_alpha044', - 'GTJA_alpha073', - 'GTJA_alpha104', - 'GTJA_alpha103', - 'GTJA_alpha105', - 'GTJA_alpha092', - 'GTJA_alpha087', - 'GTJA_alpha085', - 'GTJA_alpha062', - 'GTJA_alpha124', - 'GTJA_alpha133', - 'GTJA_alpha131', - 'GTJA_alpha117', - 'GTJA_alpha157', - 'GTJA_alpha162', - 'GTJA_alpha177', - 'GTJA_alpha180', - 'GTJA_alpha191', -] +# 输出配置 +output_config = { + "output_dir": OUTPUT_DIR, + "output_filename": "regression_output.csv", + "save_predictions": SAVE_PREDICTIONS, + "save_model": SAVE_MODEL, + "model_save_path": get_model_save_path(TRAINING_TYPE), + "top_n": TOP_N, +} -# 2. 使用 metadata 定义因子 -print("\n[2] 定义因子(从 metadata 注册)") -feature_cols = register_factors( - engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR, EXCLUDED_FACTORS -) -target_col = LABEL_NAME -# 3. 准备数据(使用模块级别的日期配置) -print("\n[3] 准备数据") - -data = prepare_data( - engine=engine, - feature_cols=feature_cols, - start_date=TRAIN_START, - end_date=TEST_END, - label_name=LABEL_NAME, -) - -# 4. 打印配置信息 -print(f"\n[配置] 训练期: {TRAIN_START} - {TRAIN_END}") -print(f"[配置] 验证期: {VAL_START} - {VAL_END}") -print(f"[配置] 测试期: {TEST_START} - {TEST_END}") -print(f"[配置] 特征数: {len(feature_cols)}") -print(f"[配置] 目标变量: {target_col}") - -# 5. 创建模型 -model = LightGBMModel(params=MODEL_PARAMS) - -# 6. 创建数据处理器(使用函数返回的完整特征列表) -processors = [ - NullFiller(feature_cols=feature_cols, strategy="mean"), - Winsorizer(feature_cols=feature_cols, lower=0.01, upper=0.99), - StandardScaler(feature_cols=feature_cols + [LABEL_NAME]), -] - -# 7. 
创建数据划分器(正确的 train/val/test 三分法) -# Train: 训练模型参数 | Val: 验证/早停 | Test: 最终评估 -splitter = DateSplitter( - train_start=TRAIN_START, - train_end=TRAIN_END, - val_start=VAL_START, - val_end=VAL_END, - test_start=TEST_START, - test_end=TEST_END, -) - -# 8. 创建股票池管理器 -# 使用新的 API:传入自定义筛选函数和所需列 -pool_manager = StockPoolManager( - filter_func=stock_pool_filter, - required_columns=STOCK_FILTER_REQUIRED_COLUMNS, # 筛选所需的额外列 - # required_factors=STOCK_FILTER_REQUIRED_FACTORS, # 可选:筛选所需的因子 - data_router=engine.router, -) -print("[股票池筛选] 使用自定义函数进行股票池筛选") -print(f"[股票池筛选] 所需基础列: {STOCK_FILTER_REQUIRED_COLUMNS}") -print("[股票池筛选] 筛选逻辑: 排除创业板/科创板/北交所后,每日选市值最小的500只") -# print(f"[股票池筛选] 所需因子: {list(STOCK_FILTER_REQUIRED_FACTORS.keys())}") - -# 9. 创建 ST 股票过滤器 -st_filter = STFilter( - data_router=engine.router, -) - -# 10. 创建训练器(禁用自动保存,我们将在训练后手动保存以包含因子信息) -trainer = Trainer( - model=model, - pool_manager=pool_manager, - processors=processors, - filters=[st_filter], # 使用STFilter过滤ST股票 - splitter=splitter, - target_col=target_col, - feature_cols=feature_cols, - persist_model=False, # 禁用自动保存,手动保存以包含因子信息 -) -# %% md -# ### 4.2 执行训练 -# %% -print("\n" + "=" * 80) -print("开始训练") -print("=" * 80) - -# 步骤 1: 应用过滤器(ST股票过滤等) -print("\n[步骤 1/7] 应用数据过滤器") -print("-" * 60) -filtered_data = data -if st_filter: - print(" 应用 ST 股票过滤器...") - data_before = len(filtered_data) - filtered_data = st_filter.filter(filtered_data) - data_after = len(filtered_data) - print(f" 过滤前记录数: {data_before}") - print(f" 过滤后记录数: {data_after}") - print(f" 删除 ST 股票记录数: {data_before - data_after}") -else: - print(" 未配置 ST 过滤器,跳过") - -# 步骤 2: 股票池筛选 -print("\n[步骤 2/7] 股票池筛选") -print("-" * 60) -if pool_manager: - print(" 执行每日独立筛选股票池...") - pool_data_before = len(filtered_data) - filtered_data = pool_manager.filter_and_select_daily(filtered_data) - pool_data_after = len(filtered_data) - print(f" 筛选前数据规模: {pool_data_before}") - print(f" 筛选后数据规模: {pool_data_after}") - print(f" 删除记录数: {pool_data_before - pool_data_after}") -else: - print(" 未配置股票池管理器,跳过筛选") -# %% -# 步骤 3: 划分训练/验证/测试集(正确的三分法) -print("\n[步骤 3/7] 划分训练集、验证集和测试集") -print("-" * 60) -if splitter: - # 正确的三分法:train用于训练,val用于验证/早停,test仅用于最终评估 - train_data, val_data, test_data = splitter.split(filtered_data) - print(f" 训练集数据规模: {train_data.shape}") - print(f" 验证集数据规模: {val_data.shape}") - print(f" 测试集数据规模: {test_data.shape}") - print(f" 训练集股票数: {train_data['ts_code'].n_unique()}") - print(f" 验证集股票数: {val_data['ts_code'].n_unique()}") - print(f" 测试集股票数: {test_data['ts_code'].n_unique()}") - print( - f" 训练集日期范围: {train_data['trade_date'].min()} - {train_data['trade_date'].max()}" - ) - print( - f" 验证集日期范围: {val_data['trade_date'].min()} - {val_data['trade_date'].max()}" - ) - print( - f" 测试集日期范围: {test_data['trade_date'].min()} - {test_data['trade_date'].max()}" - ) - - print("\n 训练集前5行预览:") - print(train_data.head()) - print("\n 验证集前5行预览:") - print(val_data.head()) - print("\n 测试集前5行预览:") - print(test_data.head()) -else: - train_data = filtered_data - test_data = filtered_data - print(" 未配置划分器,全部作为训练集") -# %% -# 步骤 4: 数据质量检查(必须在预处理之前) -print("\n[步骤 4/7] 数据质量检查") -print("-" * 60) -print(" [说明] 此检查在 fillna 等处理之前执行,用于发现数据问题") - -print("\n 检查训练集...") -check_data_quality(train_data, feature_cols, raise_on_error=False) - -if "val_data" in locals() and val_data is not None: - print("\n 检查验证集...") - check_data_quality(val_data, feature_cols, raise_on_error=True) - -print("\n 检查测试集...") -check_data_quality(test_data, feature_cols, raise_on_error=True) - -print(" [成功] 数据质量检查通过,未发现异常") - -# %% -# 步骤 5: 训练集数据处理 
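# [示意] 重构后,"训练集 fit_transform、验证/测试集 transform" 的手写流程收敛到 DataPipeline 内部,
# 调用方只需一次 prepare_data 即可拿到已预处理的三段数据(草稿写法,接口以本 diff 中 pipeline.py 为准;
# pipeline / date_range / LABEL_NAME 指上文新版脚本中的定义,X_train_new / y_train_new 为说明用变量):
split_data = pipeline.prepare_data(
    engine=engine,
    date_range=date_range,
    label_name=LABEL_NAME,
    verbose=True,
)
X_train_new, y_train_new = split_data["train"]["X"], split_data["train"]["y"]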
-print("\n[步骤 5/7] 训练集数据处理") -print("-" * 60) -fitted_processors = [] -if processors: - for i, processor in enumerate(processors, 1): - print(f" [{i}/{len(processors)}] 应用处理器: {processor.__class__.__name__}") - train_data_before = len(train_data) - train_data = processor.fit_transform(train_data) - train_data_after = len(train_data) - fitted_processors.append(processor) - print(f" 处理前记录数: {train_data_before}") - print(f" 处理后记录数: {train_data_after}") - if train_data_before != train_data_after: - print(f" 删除记录数: {train_data_before - train_data_after}") - -print("\n 训练集处理后前5行预览:") -print(train_data.head()) -print(f"\n 训练集特征统计:") -print(f" 特征数: {len(feature_cols)}") -print(f" 样本数: {len(train_data)}") -print(f" 缺失值统计:") -for col in feature_cols[:5]: # 只显示前5个特征的缺失值 - null_count = train_data[col].null_count() - if null_count > 0: - print(f" {col}: {null_count} ({null_count / len(train_data) * 100:.2f}%)") -# %% -# 步骤 5: 训练模型 -print("\n[步骤 5/7] 训练模型") -print("-" * 60) -print(f" 模型类型: LightGBM") -print(f" 训练样本数: {len(train_data)}") -print(f" 特征数: {len(feature_cols)}") -print(f" 目标变量: {target_col}") - -X_train = train_data.select(feature_cols) -y_train = train_data.select(target_col).to_series() - -print(f"\n 目标变量统计:") -print(f" 均值: {y_train.mean():.6f}") -print(f" 标准差: {y_train.std():.6f}") -print(f" 最小值: {y_train.min():.6f}") -print(f" 最大值: {y_train.max():.6f}") -print(f" 缺失值: {y_train.null_count()}") - -print("\n 开始训练...") -model.fit(X_train, y_train) -print(" 训练完成!") -# %% -# 步骤 6: 测试集数据处理 -print("\n[步骤 6/7] 测试集数据处理") -print("-" * 60) -if processors and test_data is not train_data: - for i, processor in enumerate(fitted_processors, 1): - print( - f" [{i}/{len(fitted_processors)}] 应用处理器: {processor.__class__.__name__}" - ) - test_data_before = len(test_data) - test_data = processor.transform(test_data) - test_data_after = len(test_data) - print(f" 处理前记录数: {test_data_before}") - print(f" 处理后记录数: {test_data_after}") -else: - print(" 跳过测试集处理") -# %% -# 步骤 7: 生成预测 -print("\n[步骤 7/7] 生成预测") -print("-" * 60) -X_test = test_data.select(feature_cols) -print(f" 测试样本数: {len(X_test)}") -print(" 预测中...") -predictions = model.predict(X_test) -print(f" 预测完成!") - -print(f"\n 预测结果统计:") -print(f" 均值: {predictions.mean():.6f}") -print(f" 标准差: {predictions.std():.6f}") -print(f" 最小值: {predictions.min():.6f}") -print(f" 最大值: {predictions.max():.6f}") - -# 保存结果到 trainer -trainer.results = test_data.with_columns([pl.Series("prediction", predictions)]) -# %% md -# ### 4.3 训练指标曲线 -# %% -print("\n" + "=" * 80) -print("训练指标曲线") -print("=" * 80) - -# 重新训练以收集指标(因为之前的训练没有保存评估结果) -print("\n重新训练模型以收集训练指标...") - -import lightgbm as lgb - -# 准备数据(使用 val 做验证,test 不参与训练过程) -X_train_np = X_train.to_numpy() -y_train_np = y_train.to_numpy() -X_val_np = val_data.select(feature_cols).to_numpy() -y_val_np = val_data.select(target_col).to_series().to_numpy() - -# 创建数据集 -train_dataset = lgb.Dataset(X_train_np, label=y_train_np) -val_dataset = lgb.Dataset(X_val_np, label=y_val_np, reference=train_dataset) - -# 用于存储评估结果 -evals_result = {} - -# 使用与原模型相同的参数重新训练 -# 正确的三分法:train用于训练,val用于验证,test不参与训练过程 -# 添加早停:如果验证指标连续100轮没有改善则停止训练 -booster_with_eval = lgb.train( - MODEL_PARAMS, - train_dataset, - num_boost_round=MODEL_PARAMS.get("n_estimators", 100), - valid_sets=[train_dataset, val_dataset], - valid_names=["train", "val"], - callbacks=[ - lgb.record_evaluation(evals_result), - lgb.early_stopping(stopping_rounds=100, verbose=True), - ], -) - -print("训练完成,指标已收集") - -# 获取指标名称 -metric_name = list(evals_result["train"].keys())[0] -print(f"\n评估指标: 
{metric_name}") - -# 提取训练和验证指标 -train_metric = evals_result["train"][metric_name] -val_metric = evals_result["val"][metric_name] - -# 显示早停信息 -actual_rounds = len(train_metric) -expected_rounds = MODEL_PARAMS.get("n_estimators", 100) -print(f"\n[早停信息]") -print(f" 配置的最大轮数: {expected_rounds}") -print(f" 实际训练轮数: {actual_rounds}") -if actual_rounds < expected_rounds: - print(f" 早停状态: 已触发(连续100轮验证指标未改善)") -else: - print(f" 早停状态: 未触发(达到最大轮数)") - -print(f"\n最终指标:") -print(f" 训练 {metric_name}: {train_metric[-1]:.6f}") -print(f" 验证 {metric_name}: {val_metric[-1]:.6f}") -# %% -# 绘制训练指标曲线 -import matplotlib.pyplot as plt - -fig, ax = plt.subplots(figsize=(12, 6)) - -# 绘制训练集和验证集的指标曲线(注意:val用于验证,test不参与训练) -iterations = range(1, len(train_metric) + 1) -ax.plot( - iterations, train_metric, label=f"Train {metric_name}", linewidth=2, color="blue" -) -ax.plot( - iterations, val_metric, label=f"Validation {metric_name}", linewidth=2, color="red" -) - -ax.set_xlabel("Iteration", fontsize=12) -ax.set_ylabel(metric_name.upper(), fontsize=12) -ax.set_title( - f"Training and Validation {metric_name.upper()} Curve", - fontsize=14, - fontweight="bold", -) -ax.legend(fontsize=10) -ax.grid(True, alpha=0.3) - -# 标记最佳验证指标点(用于早停决策) -best_iter = val_metric.index(min(val_metric)) -best_metric = min(val_metric) -ax.axvline( - x=best_iter + 1, - color="green", - linestyle="--", - alpha=0.7, - label=f"Best Iteration ({best_iter + 1})", -) -ax.scatter([best_iter + 1], [best_metric], color="green", s=100, zorder=5) -ax.annotate( - f"Best: {best_metric:.6f}\nIter: {best_iter + 1}", - xy=(best_iter + 1, best_metric), - xytext=(best_iter + 1 + len(iterations) * 0.1, best_metric), - fontsize=9, - arrowprops=dict(arrowstyle="->", color="green", alpha=0.7), -) - -plt.tight_layout() -plt.show() - -print(f"\n[指标分析]") -print(f" 最佳验证 {metric_name}: {best_metric:.6f}") -print(f" 最佳迭代轮数: {best_iter + 1}") -print(f" 早停建议: 如果验证指标连续10轮不下降,建议在第 {best_iter + 1} 轮停止训练") -print(f"\n[重要提醒] 验证集仅用于早停/调参,测试集完全独立于训练过程!") -# %% md -# ### 4.4 查看结果 -# %% -print("\n" + "=" * 80) -print("训练结果") -print("=" * 80) - -results = trainer.results - -print(f"\n结果数据形状: {results.shape}") -print(f"结果列: {results.columns}") -print(f"\n结果前10行预览:") -print(results.head(10)) -print(f"\n结果后5行预览:") -print(results.tail()) - -print(f"\n每日预测样本数统计:") -daily_counts = results.group_by("trade_date").agg(pl.len()).sort("trade_date") -print(f" 最小: {daily_counts['len'].min()}") -print(f" 最大: {daily_counts['len'].max()}") -print(f" 平均: {daily_counts['len'].mean():.2f}") - -# 展示某一天的前10个预测结果 -sample_date = results["trade_date"][0] -sample_data = results.filter(results["trade_date"] == sample_date).head(10) -print(f"\n示例日期 {sample_date} 的前10条预测:") -print(sample_data.select(["ts_code", "trade_date", target_col, "prediction"])) -# %% md -# ### 4.4 保存结果 -# %% -print("\n" + "=" * 80) -print("保存预测结果") -print("=" * 80) - -# 确保输出目录存在 -os.makedirs(OUTPUT_DIR, exist_ok=True) - -# 生成时间戳 -start_dt = datetime.strptime(TEST_START, "%Y%m%d") -end_dt = datetime.strptime(TEST_END, "%Y%m%d") -date_str = f"{start_dt.strftime('%Y%m%d')}_{end_dt.strftime('%Y%m%d')}" - -# 保存每日 Top N -print(f"\n[1/1] 保存每日 Top {TOP_N} 股票...") -topn_output_path = os.path.join(OUTPUT_DIR, f"regression_output.csv") - -# 按日期分组,取每日 top N -topn_by_date = [] -unique_dates = results["trade_date"].unique().sort() -for date in unique_dates: - day_data = results.filter(results["trade_date"] == date) - # 按 prediction 降序排序,取前 N - topn = day_data.sort("prediction", descending=True).head(TOP_N) - topn_by_date.append(topn) - -# 合并所有日期的 top N 
-topn_results = pl.concat(topn_by_date) - -# 格式化日期并调整列顺序:日期、分数、股票 -topn_to_save = topn_results.select( - [ - pl.col("trade_date").str.slice(0, 4) - + "-" - + pl.col("trade_date").str.slice(4, 2) - + "-" - + pl.col("trade_date").str.slice(6, 2).alias("date"), - pl.col("prediction").alias("score"), - pl.col("ts_code"), - ] -) -topn_to_save.write_csv(topn_output_path, include_header=True) -print(f" 保存路径: {topn_output_path}") -print( - f" 保存行数: {len(topn_to_save)}({len(unique_dates)}个交易日 × 每日top{TOP_N})" -) -print(f"\n 预览(前15行):") -print(topn_to_save.head(15)) -# %% md -# ### 4.5 特征重要性 -# %% -importance = model.feature_importance() -if importance is not None: - print("\n特征重要性:") - print(importance.sort_values(ascending=False)) - -print("\n" + "=" * 80) -print("训练完成!") -print("=" * 80) -# %% md -# ## 5. 可视化分析 -# -# 使用训练好的模型直接绘图。 -# - **特征重要性图**:辅助特征选择 -# - **决策树图**:理解决策逻辑 -# %% -# 导入可视化库 -import matplotlib.pyplot as plt -import lightgbm as lgb -import pandas as pd - -# 从封装的model中取出底层Booster -booster = model.model -print(f"模型类型: {type(booster)}") -print(f"特征数量: {len(feature_cols)}") -# %% md -# ### 5.1 绘制特征重要性(辅助特征选择) -# -# **解读**: -# - 重要性高的特征对模型贡献大 -# - 重要性为0的特征可以考虑删除 -# - 可以帮助理解哪些因子最有效 -# %% -print("绘制特征重要性...") - -fig, ax = plt.subplots(figsize=(10, 8)) -lgb.plot_importance( - booster, - max_num_features=20, - importance_type="gain", - title="Feature Importance (Gain)", - ax=ax, -) -ax.set_xlabel("Importance (Gain)") -plt.tight_layout() -plt.show() - -# 打印重要性排名 -importance_gain = pd.Series( - booster.feature_importance(importance_type="gain"), index=feature_cols -).sort_values(ascending=False) - -print("\n[特征重要性排名 - Gain]") -print(importance_gain) - -# 识别低重要性特征 -zero_importance = importance_gain[importance_gain == 0].index.tolist() -if zero_importance: - print(f"\n[低重要性特征] 以下{len(zero_importance)}个特征重要性为0,可考虑删除:") - for feat in zero_importance: - print(f"'{feat}',") -else: - print("\n所有特征都有一定重要性") - -# 保存模型和因子信息(如果启用) -if SAVE_MODEL: +def main(): + """主函数""" print("\n" + "=" * 80) - print("保存模型和因子信息") + print("LightGBM 回归模型训练(模块化版本)") print("=" * 80) - model_save_path = get_model_save_path(TRAINING_TYPE) - if model_save_path: + + # 1. 创建 FactorEngine + print("\n[1] 创建 FactorEngine") + engine = FactorEngine() + + # 2. 创建 FactorManager + print("\n[2] 创建 FactorManager") + factor_manager = FactorManager( + selected_factors=SELECTED_FACTORS, + factor_definitions=FACTOR_DEFINITIONS, + label_factor=LABEL_FACTOR, + excluded_factors=EXCLUDED_FACTORS, + ) + + # 3. 创建 DataPipeline + print("\n[3] 创建 DataPipeline") + pipeline = DataPipeline( + factor_manager=factor_manager, + processor_configs=[ + (NullFiller, {"strategy": "mean"}), + (Winsorizer, {"lower": 0.01, "upper": 0.99}), + (StandardScaler, {}), + ], + filters=[STFilter(data_router=engine.router)], + stock_pool_filter_func=stock_pool_filter, + stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS, + ) + + # 4. 创建 RegressionTask + print("\n[4] 创建 RegressionTask") + task = RegressionTask( + model_params=MODEL_PARAMS, + label_name=LABEL_NAME, + ) + + # 5. 创建 Trainer + print("\n[5] 创建 Trainer") + trainer = Trainer( + data_pipeline=pipeline, + task=task, + output_config=output_config, + verbose=True, + ) + + # 6. 执行训练 + print("\n[6] 执行训练") + results = trainer.run(engine=engine, date_range=date_range) + + # 7. 
保存模型和因子信息(如果启用) + if SAVE_MODEL: + print("\n[7] 保存模型和因子信息") save_model_with_factors( - model=model, - model_path=model_save_path, + model=task.get_model(), + model_path=output_config["model_save_path"], selected_factors=SELECTED_FACTORS, factor_definitions=FACTOR_DEFINITIONS, - fitted_processors=fitted_processors, + fitted_processors=pipeline.get_fitted_processors(), ) + + print("\n" + "=" * 80) + print("训练流程完成!") + print(f"结果保存路径: {os.path.join(OUTPUT_DIR, 'regression_output.csv')}") + print("=" * 80) + + return results + + +if __name__ == "__main__": + main() diff --git a/src/training/__init__.py b/src/training/__init__.py index 0b78554..beb2e9f 100644 --- a/src/training/__init__.py +++ b/src/training/__init__.py @@ -43,6 +43,12 @@ from src.training.utils import check_data_quality # 配置 from src.training.config import TrainingConfig +# 新增:模块化 Trainer 组件 +from src.training.factor_manager import FactorManager +from src.training.pipeline import DataPipeline +from src.training.result_analyzer import ResultAnalyzer +from src.training.tasks import BaseTask, RegressionTask, RankTask + __all__ = [ # 基础抽象类 "BaseModel", @@ -74,4 +80,11 @@ __all__ = [ "check_data_quality", # 配置 "TrainingConfig", + # 新增:模块化 Trainer 组件 + "FactorManager", + "DataPipeline", + "ResultAnalyzer", + "BaseTask", + "RegressionTask", + "RankTask", ] diff --git a/src/training/components/models/lightgbm_lambdarank.py b/src/training/components/models/lightgbm_lambdarank.py index 38f6851..971c631 100644 --- a/src/training/components/models/lightgbm_lambdarank.py +++ b/src/training/components/models/lightgbm_lambdarank.py @@ -185,131 +185,6 @@ class LightGBMLambdaRankModel(BaseModel): return None return self.model.best_score - def plot_metric( - self, - metric: Optional[str] = None, - figsize: tuple = (10, 6), - title: Optional[str] = None, - ax=None, - ): - """绘制训练指标曲线 - - Args: - metric: 要绘制的指标名称,如 'ndcg@5' - figsize: 图大小,默认 (10, 6) - title: 图表标题 - ax: matplotlib Axes 对象 - - Returns: - matplotlib Axes 对象 - """ - if self.model is None: - raise RuntimeError("模型尚未训练,请先调用 fit()") - - if self.evals_result_ is None or not self.evals_result_: - raise RuntimeError("没有可用的评估结果") - - import lightgbm as lgb - import matplotlib.pyplot as plt - - if metric is None: - available_metrics = list(self.evals_result_.get("train", {}).keys()) - ndcg_metrics = [m for m in available_metrics if "ndcg" in m.lower()] - if ndcg_metrics: - metric = ndcg_metrics[0] - elif available_metrics: - metric = available_metrics[0] - else: - raise ValueError("没有可用的评估指标") - - if metric not in self.evals_result_.get("train", {}): - available = list(self.evals_result_.get("train", {}).keys()) - raise ValueError(f"指标 '{metric}' 不存在。可用的指标: {available}") - - if ax is None: - _, ax = plt.subplots(figsize=figsize) - - lgb.plot_metric(self.evals_result_, metric=metric, ax=ax) - - if title is None: - assert metric is not None - title = f"Training Metric ({metric.upper()}) over Iterations" - ax.set_title(title, fontsize=12, fontweight="bold") - - return ax - - def plot_all_metrics( - self, - metrics: Optional[list] = None, - figsize: tuple = (14, 10), - max_cols: int = 2, - ): - """绘制所有训练指标曲线 - - Args: - metrics: 要绘制的指标列表 - figsize: 图大小,默认 (14, 10) - max_cols: 每行最多显示的子图数,默认 2 - - Returns: - matplotlib Figure 对象 - """ - if self.model is None: - raise RuntimeError("模型尚未训练,请先调用 fit()") - - if self.evals_result_ is None or not self.evals_result_: - raise RuntimeError("没有可用的评估结果") - - import lightgbm as lgb - import matplotlib.pyplot as plt - - available_metrics = 
list(self.evals_result_.get("train", {}).keys()) - - if metrics is None: - ndcg_metrics = [m for m in available_metrics if "ndcg" in m.lower()] - metrics = ndcg_metrics[:4] if ndcg_metrics else available_metrics[:4] - - if not metrics: - raise ValueError("没有可用的评估指标") - - n_metrics = len(metrics) - n_cols = min(max_cols, n_metrics) - n_rows = (n_metrics + n_cols - 1) // n_cols - - fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize) - if n_metrics == 1: - axes = [axes] - else: - axes = ( - axes.flatten() - if n_rows > 1 - else [axes] - if n_cols == 1 - else axes.flatten() - ) - - for idx, metric in enumerate(metrics): - if idx < len(axes): - ax = axes[idx] - if metric in available_metrics: - self.plot_metric(metric=metric, ax=ax) - ax.set_title(f"{metric.upper()}", fontsize=11, fontweight="bold") - else: - ax.text( - 0.5, - 0.5, - f"Metric '{metric}' not found", - ha="center", - va="center", - transform=ax.transAxes, - ) - - for idx in range(n_metrics, len(axes)): - axes[idx].axis("off") - - plt.tight_layout() - return fig - def feature_importance(self) -> Optional[pd.Series]: """返回特征重要性 diff --git a/src/training/factor_manager.py b/src/training/factor_manager.py new file mode 100644 index 0000000..5a7ac56 --- /dev/null +++ b/src/training/factor_manager.py @@ -0,0 +1,163 @@ +"""因子管理器 + +管理多种来源的因子: +- metadata 中注册的因子 +- DSL 表达式定义的因子 +- Label 因子 +- 排除的因子列表 +""" + +from typing import Dict, List, Optional, Any +import polars as pl + +from src.factors import FactorEngine + + +class FactorManager: + """因子管理器 + + 统一管理多种来源的因子注册和准备: + 1. metadata 中已注册的因子(通过名称引用) + 2. DSL 表达式定义的因子(动态注册) + 3. Label 因子(通过表达式定义) + 4. 排除的因子列表(从最终列表中移除) + + Attributes: + selected_factors: 从 metadata 中选择的因子名称列表 + factor_definitions: DSL 表达式定义的因子字典 {name: dsl_expression} + label_factor: Label 因子定义 {name: dsl_expression} + excluded_factors: 需要排除的因子名称列表 + registered_factors: 已注册到 FactorEngine 的因子列表 + """ + + def __init__( + self, + selected_factors: List[str], + factor_definitions: Dict[str, str], + label_factor: Dict[str, str], + excluded_factors: Optional[List[str]] = None, + ): + """初始化因子管理器 + + Args: + selected_factors: 从 metadata 中选择的因子名称列表 + factor_definitions: DSL 表达式定义的因子字典 + label_factor: Label 因子定义字典 + excluded_factors: 需要排除的因子名称列表 + """ + self.selected_factors = selected_factors or [] + self.factor_definitions = factor_definitions or {} + self.label_factor = label_factor or {} + self.excluded_factors = excluded_factors or [] + self.registered_factors: List[str] = [] + + def register_to_engine( + self, + engine: FactorEngine, + verbose: bool = True, + ) -> List[str]: + """注册所有因子到 FactorEngine + + 按以下顺序注册: + 1. metadata 中的因子(通过名称从 metadata 加载) + 2. DSL 表达式定义的因子(使用 add_factor 注册) + 3. Label 因子(使用 add_factor 注册) + 4. 
排除指定的因子 + + Args: + engine: FactorEngine 实例 + verbose: 是否打印注册信息 + + Returns: + 最终的特征列名列表(已排除指定因子) + """ + if verbose: + print("\n" + "=" * 80) + print("因子注册") + print("=" * 80) + + # Step 1: 从 metadata 注册选中的因子 + if verbose: + print(f"\n[1/4] 从 metadata 注册 {len(self.selected_factors)} 个因子...") + + feature_cols = [] + for factor_name in self.selected_factors: + try: + engine.add_factor(factor_name) + feature_cols.append(factor_name) + if verbose: + print(f" [OK] {factor_name}") + except Exception as e: + if verbose: + print(f" [FAIL] {factor_name}: {e}") + + # Step 2: 注册 DSL 定义的因子 + if self.factor_definitions: + if verbose: + print(f"\n[2/4] 注册 {len(self.factor_definitions)} 个 DSL 定义因子...") + + for factor_name, dsl_expr in self.factor_definitions.items(): + if factor_name not in self.excluded_factors: + try: + engine.add_factor(factor_name, dsl_expr) + feature_cols.append(factor_name) + if verbose: + print(f" ✓ {factor_name}: {dsl_expr[:50]}...") + except Exception as e: + if verbose: + print(f" ✗ {factor_name}: {e}") + + # Step 3: 注册 Label 因子 + if self.label_factor: + if verbose: + print(f"\n[3/4] 注册 Label 因子...") + + for factor_name, dsl_expr in self.label_factor.items(): + try: + engine.add_factor(factor_name, dsl_expr) + if verbose: + print(f" ✓ Label: {factor_name}") + except Exception as e: + if verbose: + print(f" ✗ Label {factor_name}: {e}") + + # Step 4: 排除指定因子 + if self.excluded_factors: + if verbose: + print(f"\n[4/4] 排除 {len(self.excluded_factors)} 个因子...") + + original_count = len(feature_cols) + feature_cols = [f for f in feature_cols if f not in self.excluded_factors] + excluded_count = original_count - len(feature_cols) + + if verbose: + print(f" 排除 {excluded_count} 个因子") + for f in self.excluded_factors: + if f in self.selected_factors or f in self.factor_definitions: + print(f" - {f}") + + self.registered_factors = feature_cols + + if verbose: + print(f"\n[结果] 最终特征数: {len(feature_cols)}") + print("=" * 80) + + return feature_cols + + def get_feature_cols(self) -> List[str]: + """获取已注册的特征列名列表 + + Returns: + 特征列名列表 + """ + return self.registered_factors + + def get_label_col(self) -> Optional[str]: + """获取 Label 列名 + + Returns: + Label 列名,如果没有则返回 None + """ + if self.label_factor: + return list(self.label_factor.keys())[0] + return None diff --git a/src/training/pipeline.py b/src/training/pipeline.py new file mode 100644 index 0000000..570b15c --- /dev/null +++ b/src/training/pipeline.py @@ -0,0 +1,309 @@ +"""数据流水线 + +完整的数据处理流程: +1. 因子注册和数据准备 +2. 应用过滤器(STFilter 等) +3. 股票池筛选(自定义函数) +4. 数据质量检查 +5. 数据划分(train/val/test) +6. 
数据预处理(fit_transform/transform) +""" + +from typing import Any, Callable, Dict, List, Optional, Tuple, Type +import polars as pl +import numpy as np + +from src.factors import FactorEngine +from src.training.factor_manager import FactorManager +from src.training.components.base import BaseProcessor +from src.training.core.stock_pool_manager import StockPoolManager + + +class DataPipeline: + """数据流水线 + + 执行完整的数据处理流程,返回标准化的数据字典。 + + Attributes: + factor_manager: 因子管理器 + filters: 类形式的过滤器列表(如 STFilter) + stock_pool_filter_func: 函数形式的股票池筛选器 + processor_configs: 数据处理器配置列表(类+参数) + stock_pool_required_columns: 股票池筛选所需的额外列 + fitted_processors: 已拟合的处理器列表(训练后填充) + """ + + def __init__( + self, + factor_manager: FactorManager, + processor_configs: List[Tuple[Type[BaseProcessor], Dict[str, Any]]], + filters: Optional[List[Any]] = None, + stock_pool_filter_func: Optional[Callable] = None, + stock_pool_required_columns: Optional[List[str]] = None, + ): + """初始化数据流水线 + + Args: + factor_manager: 因子管理器实例 + processor_configs: 数据处理器配置列表,每个元素为 (ProcessorClass, kwargs) + 例如:[(NullFiller, {"strategy": "mean"}), (Winsorizer, {"lower": 0.01, "upper": 0.99})] + filters: 类形式的过滤器列表(如 [STFilter]) + stock_pool_filter_func: 函数形式的股票池筛选器 + stock_pool_required_columns: 股票池筛选所需的额外列 + """ + self.factor_manager = factor_manager + self.processor_configs = processor_configs or [] + self.filters = filters or [] + self.stock_pool_filter_func = stock_pool_filter_func + self.stock_pool_required_columns = stock_pool_required_columns or [] + self.fitted_processors: List[BaseProcessor] = [] + + def prepare_data( + self, + engine: FactorEngine, + date_range: Dict[str, Tuple[str, str]], + label_name: str, + verbose: bool = True, + ) -> Dict[str, Dict[str, Any]]: + """执行完整数据流程 + + 流程: + 1. 注册因子并准备数据 + 2. 应用类过滤器(STFilter) + 3. 应用股票池筛选(函数形式) + 4. 数据质量检查 + 5. 数据划分 + 6. 
数据预处理 + + Args: + engine: FactorEngine 实例 + date_range: 日期范围字典 {"train": (start, end), "val": ..., "test": ...} + label_name: Label 列名 + verbose: 是否打印处理信息 + + Returns: + 标准化的数据字典 + """ + if verbose: + print("\n" + "=" * 80) + print("数据流水线") + print("=" * 80) + + # Step 1: 注册因子并准备数据 + if verbose: + print("\n[1/6] 注册因子并准备数据...") + + feature_cols = self.factor_manager.register_to_engine(engine, verbose=verbose) + + # 计算完整日期范围 + all_start = min( + date_range["train"][0], date_range["val"][0], date_range["test"][0] + ) + all_end = max( + date_range["train"][1], date_range["val"][1], date_range["test"][1] + ) + + # 准备数据 + data = engine.compute( + factor_names=feature_cols + [label_name], + start_date=all_start, + end_date=all_end, + ) + + if verbose: + print(f" 原始数据规模: {data.shape}") + print(f" 特征数: {len(feature_cols)}") + + # Step 2: 应用类过滤器(STFilter) + if self.filters: + if verbose: + print(f"\n[2/6] 应用过滤器({len(self.filters)}个)...") + + for filter_obj in self.filters: + data_before = len(data) + data = filter_obj.filter(data) + data_after = len(data) + + if verbose: + print(f" {filter_obj.__class__.__name__}:") + print(f" 过滤前: {data_before}, 过滤后: {data_after}") + print(f" 删除: {data_before - data_after}") + + # Step 3: 应用股票池筛选(函数形式) + if self.stock_pool_filter_func: + if verbose: + print(f"\n[3/6] 股票池筛选...") + + data_before = len(data) + + # 创建 StockPoolManager + pool_manager = StockPoolManager( + filter_func=self.stock_pool_filter_func, + required_columns=self.stock_pool_required_columns, + data_router=engine.router, + ) + + data = pool_manager.filter_and_select_daily(data) + data_after = len(data) + + if verbose: + print(f" 筛选前: {data_before}, 筛选后: {data_after}") + print(f" 删除: {data_before - data_after}") + + # Step 4: 数据质量检查 + if verbose: + print(f"\n[4/6] 数据质量检查...") + + self._check_data_quality(data, feature_cols, verbose=verbose) + + # Step 5: 数据划分 + if verbose: + print(f"\n[5/6] 数据划分...") + + split_data = self._split_data( + data, date_range, feature_cols, label_name, verbose=verbose + ) + + # Step 6: 数据预处理 + if verbose: + print(f"\n[6/6] 数据预处理...") + + split_data = self._preprocess(split_data, feature_cols, verbose=verbose) + + if verbose: + print("\n" + "=" * 80) + print("数据流水线完成") + print("=" * 80) + + return split_data + + def _check_data_quality( + self, + data: pl.DataFrame, + feature_cols: List[str], + verbose: bool = True, + ) -> None: + """检查数据质量 + + Args: + data: 数据框 + feature_cols: 特征列名列表 + verbose: 是否打印信息 + """ + # 检查缺失值 + null_counts = {} + for col in feature_cols[:10]: # 只检查前10个特征 + null_count = data[col].null_count() + if null_count > 0: + null_counts[col] = null_count + + if null_counts and verbose: + print(f" [警告] 发现缺失值(仅显示前10个特征):") + for col, count in list(null_counts.items())[:5]: + pct = count / len(data) * 100 + print(f" {col}: {count} ({pct:.2f}%)") + + def _split_data( + self, + data: pl.DataFrame, + date_range: Dict[str, Tuple[str, str]], + feature_cols: List[str], + label_name: str, + verbose: bool = True, + ) -> Dict[str, Dict[str, Any]]: + """划分数据集 + + Args: + data: 完整数据 + date_range: 日期范围字典 + feature_cols: 特征列名 + label_name: Label 列名 + verbose: 是否打印信息 + + Returns: + 划分后的数据字典 + """ + result = {} + + for split_name, (start, end) in date_range.items(): + mask = (data["trade_date"] >= start) & (data["trade_date"] <= end) + split_df = data.filter(mask) + + result[split_name] = { + "X": split_df.select(feature_cols), + "y": split_df[label_name], + "raw_data": split_df, + "feature_cols": feature_cols, + } + + if verbose: + print(f" {split_name}: {len(split_df)} 
条记录") + + return result + + def _preprocess( + self, + split_data: Dict[str, Dict[str, Any]], + feature_cols: List[str], + verbose: bool = True, + ) -> Dict[str, Dict[str, Any]]: + """预处理数据 + + 训练集使用 fit_transform,验证集和测试集使用 transform + + Args: + split_data: 划分后的数据字典 + feature_cols: 特征列名列表 + verbose: 是否打印信息 + + Returns: + 预处理后的数据字典 + """ + if not self.processor_configs: + return split_data + + self.fitted_processors = [] + + # 实例化 processors(传入 feature_cols) + processors = [] + for proc_class, proc_kwargs in self.processor_configs: + proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols} + processors.append(proc_class(**proc_kwargs_with_cols)) + + # 训练集:fit_transform + if verbose: + print(f" 训练集预处理(fit_transform)...") + + train_data = split_data["train"]["raw_data"] + for processor in processors: + train_data = processor.fit_transform(train_data) + self.fitted_processors.append(processor) + + # 更新训练集 + split_data["train"]["raw_data"] = train_data + split_data["train"]["X"] = train_data.select(feature_cols) + split_data["train"]["y"] = train_data[split_data["train"]["y"].name] + + # 验证集和测试集:transform + for split_name in ["val", "test"]: + if split_name in split_data: + if verbose: + print(f" {split_name}集预处理(transform)...") + + split_df = split_data[split_name]["raw_data"] + for processor in self.fitted_processors: + split_df = processor.transform(split_df) + + split_data[split_name]["raw_data"] = split_df + split_data[split_name]["X"] = split_df.select(feature_cols) + split_data[split_name]["y"] = split_df[split_data[split_name]["y"].name] + + return split_data + + def get_fitted_processors(self) -> List[BaseProcessor]: + """获取已拟合的处理器列表 + + Returns: + 已拟合的处理器列表(用于模型保存) + """ + return self.fitted_processors diff --git a/src/training/result_analyzer.py b/src/training/result_analyzer.py new file mode 100644 index 0000000..29b5587 --- /dev/null +++ b/src/training/result_analyzer.py @@ -0,0 +1,191 @@ +"""结果分析器 + +训练后的分析和结果处理: +1. 特征重要性分析(Top N、零贡献特征) +2. 结果组装(生成每日 Top N) +3. 
结果保存 +""" + +from typing import Any, Dict, List, Optional +import os +import polars as pl +import pandas as pd +import numpy as np + + +class ResultAnalyzer: + """结果分析器 + + 分析训练结果,生成报告并保存。 + """ + + def analyze_feature_importance( + self, + model, + feature_cols: List[str], + top_n: int = 20, + verbose: bool = True, + ) -> Dict[str, Any]: + """分析特征重要性 + + Args: + model: 训练好的模型 + feature_cols: 特征列名列表 + top_n: 显示 Top N 特征 + verbose: 是否打印信息 + + Returns: + 分析结果字典 + """ + importance = model.feature_importance() + + if importance is None: + if verbose: + print("[警告] 无法获取特征重要性") + return {} + + # 按重要性排序 + importance_sorted = importance.sort_values(ascending=False) + + # 计算百分比 + total_importance = importance_sorted.sum() + importance_pct = (importance_sorted / total_importance * 100).round(2) + + # 识别零贡献特征 + zero_importance_features = importance_sorted[ + importance_sorted == 0 + ].index.tolist() + + if verbose: + print("\n" + "=" * 80) + print("特征重要性分析") + print("=" * 80) + + # 打印 Top N + print(f"\nTop {top_n} 特征:") + print("-" * 80) + print(f"{'排名':<6}{'特征名':<35}{'重要性':<15}{'占比':<10}") + print("-" * 80) + + for i, (feature, score) in enumerate( + importance_sorted.head(top_n).items(), 1 + ): + pct = importance_pct[feature] + if pct >= 10: + marker = " [高贡献]" + elif pct >= 1: + marker = " [中贡献]" + else: + marker = " [低贡献]" + print(f"{i:<6}{feature:<35}{score:<15.2f}{pct:<8.2f}%{marker}") + + # 打印零贡献特征 + if zero_importance_features: + print("\n" + "-" * 80) + print(f"[警告] 贡献为0的特征(共 {len(zero_importance_features)} 个):") + for i, feature in enumerate(zero_importance_features, 1): + print(f" {i}. {feature}") + + # 统计摘要 + print("\n" + "=" * 80) + print("统计摘要:") + print("-" * 80) + print(f" 特征总数: {len(importance_sorted)}") + print( + f" 有贡献特征数: {len(importance_sorted) - len(zero_importance_features)}" + ) + print(f" 零贡献特征数: {len(zero_importance_features)}") + if len(importance_sorted) > 0: + print( + f" 零贡献占比: {len(zero_importance_features) / len(importance_sorted) * 100:.1f}%" + ) + print(f" Top {top_n} 累计占比: {importance_pct.head(top_n).sum():.1f}%") + print("=" * 80) + + return { + "importance": importance_sorted, + "importance_pct": importance_pct, + "zero_importance_features": zero_importance_features, + "top_n": importance_sorted.head(top_n), + } + + def assemble_results( + self, + test_data: Dict[str, Any], + predictions: np.ndarray, + top_n: int = 50, + verbose: bool = True, + ) -> pl.DataFrame: + """组装结果 + + 生成每日 Top N 股票推荐列表。 + + Args: + test_data: 测试数据字典 + predictions: 预测结果数组 + top_n: 每日选择的股票数 + verbose: 是否打印信息 + + Returns: + 结果数据框 + """ + # 添加预测列 + raw_data = test_data["raw_data"] + results = raw_data.with_columns([pl.Series("prediction", predictions)]) + + # 按日期分组取 Top N + unique_dates = results["trade_date"].unique().sort() + topn_by_date = [] + + for date in unique_dates: + day_data = results.filter(results["trade_date"] == date) + topn = day_data.sort("prediction", descending=True).head(top_n) + topn_by_date.append(topn) + + # 合并所有日期的 Top N + topn_results = pl.concat(topn_by_date) + + if verbose: + print(f"\n生成每日 Top {top_n} 股票列表:") + print(f" 交易日数: {len(unique_dates)}") + print(f" 总推荐数: {len(topn_results)}") + + return topn_results + + def save_results( + self, + results: pl.DataFrame, + output_path: str, + verbose: bool = True, + ) -> None: + """保存结果 + + Args: + results: 结果数据框 + output_path: 输出路径 + verbose: 是否打印信息 + """ + # 格式化日期并调整列顺序 + formatted = results.select( + [ + ( + pl.col("trade_date").str.slice(0, 4) + + "-" + + pl.col("trade_date").str.slice(4, 2) + + "-" + + 
pl.col("trade_date").str.slice(6, 2) + ).alias("date"), + pl.col("prediction").alias("score"), + pl.col("ts_code"), + ] + ) + + # 确保目录存在 + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + # 保存 CSV + formatted.write_csv(output_path, include_header=True) + + if verbose: + print(f" 保存路径: {output_path}") + print(f" 保存行数: {len(formatted)}") diff --git a/src/training/tasks/__init__.py b/src/training/tasks/__init__.py new file mode 100644 index 0000000..09b7442 --- /dev/null +++ b/src/training/tasks/__init__.py @@ -0,0 +1,14 @@ +"""Tasks 模块 + +提供各种训练任务的实现。 +""" + +from src.training.tasks.base import BaseTask +from src.training.tasks.regression_task import RegressionTask +from src.training.tasks.rank_task import RankTask + +__all__ = [ + "BaseTask", + "RegressionTask", + "RankTask", +] diff --git a/src/training/tasks/base.py b/src/training/tasks/base.py new file mode 100644 index 0000000..679a67c --- /dev/null +++ b/src/training/tasks/base.py @@ -0,0 +1,79 @@ +"""任务抽象基类 + +定义 Task 接口,所有具体任务必须实现此接口。 +""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional +import numpy as np + + +class BaseTask(ABC): + """任务抽象基类 + + 所有训练任务(回归、排序学习、分类等)必须继承此类。 + 提供统一的接口:Label处理、模型训练、预测、评估。 + + Attributes: + label_name: Label 列名 + model_params: 模型参数字典 + """ + + def __init__(self, model_params: Dict[str, Any], label_name: str): + """初始化任务 + + Args: + model_params: 模型参数字典 + label_name: Label 列名 + """ + self.model_params = model_params + self.label_name = label_name + self.model = None + + @abstractmethod + def prepare_labels(self, data: Dict[str, Dict]) -> Dict[str, Dict]: + """准备标签 + + 子类可实现特定的 Label 转换逻辑(如排序学习的分位数转换)。 + + Args: + data: 数据字典 + + Returns: + 处理后的数据字典 + """ + raise NotImplementedError + + @abstractmethod + def fit(self, train_data: Dict, val_data: Dict) -> None: + """训练模型 + + Args: + train_data: 训练数据字典 {"X": DataFrame, "y": Series, ...} + val_data: 验证数据字典 + """ + raise NotImplementedError + + @abstractmethod + def predict(self, test_data: Dict) -> np.ndarray: + """生成预测 + + Args: + test_data: 测试数据字典 + + Returns: + 预测结果数组 + """ + raise NotImplementedError + + def get_model(self): + """获取底层模型 + + Returns: + 训练后的模型实例 + """ + return self.model + + def plot_training_metrics(self) -> None: + """绘制训练指标曲线(可选)""" + pass diff --git a/src/training/tasks/rank_task.py b/src/training/tasks/rank_task.py new file mode 100644 index 0000000..4a31fdc --- /dev/null +++ b/src/training/tasks/rank_task.py @@ -0,0 +1,198 @@ +"""排序学习任务实现 + +实现排序学习任务的训练流程: +- Label 转换为分位数标签 +- 生成 group 数组 +- 使用 LightGBM LambdaRank +- 支持 NDCG@k 评估 +""" + +from typing import Any, Dict, List, Optional +import numpy as np +import polars as pl + +from src.training.tasks.base import BaseTask +from src.training.components.models.lightgbm_lambdarank import LightGBMLambdaRankModel + + +class RankTask(BaseTask): + """排序学习任务 + + 使用 LightGBM LambdaRank 进行排序学习训练。 + 将连续收益率转换为分位数标签进行训练。 + """ + + def __init__( + self, + model_params: Dict[str, Any], + label_name: str = "future_return_5", + n_quantiles: int = 20, + ): + """初始化排序学习任务 + + Args: + model_params: LightGBM 参数字典 + label_name: Label 列名 + n_quantiles: 分位数数量 + """ + super().__init__(model_params, label_name) + self.n_quantiles = n_quantiles + + def prepare_labels(self, data: Dict[str, Dict]) -> Dict[str, Dict]: + """准备标签(转换为分位数标签) + + 将连续收益率转换为分位数标签,并生成 group 数组。 + + Args: + data: 数据字典 + + Returns: + 处理后的数据字典(添加了 y_rank 和 groups) + """ + for split in ["train", "val", "test"]: + if split not in data: + continue + + df = data[split]["raw_data"] + + # 分位数转换 + rank_col = 
f"{self.label_name}_rank" + df_ranked = ( + df.with_columns( + pl.col(self.label_name) + .rank(method="min") + .over("trade_date") + .alias("_rank") + ) + .with_columns( + ( + (pl.col("_rank") - 1) + / pl.len().over("trade_date") + * self.n_quantiles + ) + .floor() + .cast(pl.Int64) + .clip(0, self.n_quantiles - 1) + .alias(rank_col) + ) + .drop("_rank") + ) + + # 更新数据 + data[split]["raw_data"] = df_ranked + data[split]["y"] = df_ranked[rank_col] + data[split]["y_raw"] = df_ranked[self.label_name] # 保留原始值 + + # 生成 group 数组 + data[split]["groups"] = self._compute_group_array(df_ranked, "trade_date") + + return data + + def _compute_group_array( + self, + df: pl.DataFrame, + date_col: str = "trade_date", + ) -> np.ndarray: + """计算 group 数组 + + Args: + df: 数据框 + date_col: 日期列名 + + Returns: + group 数组(每个日期的样本数) + """ + group_counts = df.group_by(date_col, maintain_order=True).agg( + pl.count().alias("count") + ) + return group_counts["count"].to_numpy() + + def fit(self, train_data: Dict, val_data: Dict) -> None: + """训练排序模型 + + Args: + train_data: 训练数据 + val_data: 验证数据 + """ + self.model = LightGBMLambdaRankModel(params=self.model_params) + + self.model.fit( + train_data["X"], + train_data["y"], + group=train_data["groups"], + eval_set=(val_data["X"], val_data["y"], val_data["groups"]) + if val_data + else None, + ) + + def predict(self, test_data: Dict) -> np.ndarray: + """生成预测 + + Args: + test_data: 测试数据 + + Returns: + 预测结果 + """ + return self.model.predict(test_data["X"]) + + def evaluate_ndcg( + self, + test_data: Dict, + k_list: List[int] = None, + ) -> Dict[str, float]: + """评估 NDCG@k + + Args: + test_data: 测试数据 + k_list: k 值列表,默认 [1, 5, 10, 20] + + Returns: + NDCG 分数字典 {"ndcg@1": score, ...} + """ + if k_list is None: + k_list = [1, 5, 10, 20] + + y_true = test_data["y_raw"] + y_pred = self.predict(test_data) + groups = test_data["groups"] + + from sklearn.metrics import ndcg_score + + results = {} + + # 按 group 拆分 + start_idx = 0 + y_true_groups = [] + y_pred_groups = [] + + for group_size in groups: + end_idx = start_idx + group_size + y_true_groups.append(y_true.to_numpy()[start_idx:end_idx]) + y_pred_groups.append(y_pred[start_idx:end_idx]) + start_idx = end_idx + + # 计算每个 k 的 NDCG + for k in k_list: + ndcg_scores = [] + for yt, yp in zip(y_true_groups, y_pred_groups): + if len(yt) > 1: + try: + score = ndcg_score([yt], [yp], k=k) + ndcg_scores.append(score) + except ValueError: + pass + + results[f"ndcg@{k}"] = float(np.mean(ndcg_scores)) if ndcg_scores else 0.0 + + return results + + def plot_training_metrics(self) -> None: + """绘制训练指标曲线(NDCG)""" + if self.model and hasattr(self.model, "model") and self.model.model: + try: + import lightgbm as lgb + + lgb.plot_metric(self.model.model) + except Exception as e: + print(f"[警告] 无法绘制训练曲线: {e}") diff --git a/src/training/tasks/regression_task.py b/src/training/tasks/regression_task.py new file mode 100644 index 0000000..a3ceb79 --- /dev/null +++ b/src/training/tasks/regression_task.py @@ -0,0 +1,86 @@ +"""回归任务实现 + +实现回归任务的训练流程: +- Label 无需转换(保持连续值) +- 使用 LightGBM 回归模型 +- 支持 MAE/RMSE 评估 +""" + +from typing import Any, Dict, Optional +import numpy as np +import polars as pl + +from src.training.tasks.base import BaseTask +from src.training.components.models.lightgbm import LightGBMModel + + +class RegressionTask(BaseTask): + """回归任务 + + 使用 LightGBM 进行回归训练,支持早停和训练曲线。 + """ + + def __init__( + self, + model_params: Dict[str, Any], + label_name: str = "future_return_5", + ): + """初始化回归任务 + + Args: + model_params: LightGBM 参数字典 + label_name: 
Label 列名 + """ + super().__init__(model_params, label_name) + self.evals_result: Optional[Dict] = None + + def prepare_labels(self, data: Dict[str, Dict]) -> Dict[str, Dict]: + """准备标签(回归任务无需转换) + + Args: + data: 数据字典 + + Returns: + 原样返回数据字典 + """ + # 回归任务不需要转换 Label + return data + + def fit(self, train_data: Dict, val_data: Dict) -> None: + """训练回归模型 + + Args: + train_data: 训练数据 {"X": DataFrame, "y": Series} + val_data: 验证数据 + """ + self.model = LightGBMModel(params=self.model_params) + + X_train = train_data["X"] + y_train = train_data["y"] + X_val = val_data["X"] + y_val = val_data["y"] + + self.model.fit( + X_train, y_train, eval_set=(X_val, y_val) if X_val is not None else None + ) + + def predict(self, test_data: Dict) -> np.ndarray: + """生成预测 + + Args: + test_data: 测试数据 + + Returns: + 预测结果 + """ + return self.model.predict(test_data["X"]) + + def plot_training_metrics(self) -> None: + """绘制训练指标曲线""" + if self.model and hasattr(self.model, "model") and self.model.model: + try: + import lightgbm as lgb + + lgb.plot_metric(self.model.model) + except Exception as e: + print(f"[警告] 无法绘制训练曲线: {e}") diff --git a/src/training/trainer_v2.py b/src/training/trainer_v2.py new file mode 100644 index 0000000..63679ac --- /dev/null +++ b/src/training/trainer_v2.py @@ -0,0 +1,211 @@ +"""训练调度引擎 + +协调 FactorManager、DataPipeline、Task 和 ResultAnalyzer 完成训练流程。 +""" + +from typing import Any, Callable, Dict, List, Optional, Tuple +import os +from datetime import datetime + +import polars as pl + +from src.factors import FactorEngine +from src.training.pipeline import DataPipeline +from src.training.tasks.base import BaseTask +from src.training.result_analyzer import ResultAnalyzer + + +class Trainer: + """训练调度引擎 + + 协调各个组件执行完整训练流程: + 1. 准备数据(DataPipeline) + 2. 处理标签(Task) + 3. 训练模型(Task) + 4. 绘制指标(Task) + 5. 生成预测(Task) + 6. 分析结果(ResultAnalyzer) + 7. 
保存结果 + + Attributes: + data_pipeline: 数据流水线 + task: 任务实例(RegressionTask/RankTask) + analyzer: 结果分析器 + output_config: 输出配置 + verbose: 是否打印详细信息 + results: 训练结果 + """ + + def __init__( + self, + data_pipeline: DataPipeline, + task: BaseTask, + analyzer: Optional[ResultAnalyzer] = None, + output_config: Optional[Dict[str, Any]] = None, + verbose: bool = True, + ): + """初始化训练器 + + Args: + data_pipeline: 数据流水线实例 + task: 任务实例(RegressionTask 或 RankTask) + analyzer: 结果分析器(可选,默认创建新实例) + output_config: 输出配置字典 + verbose: 是否打印详细信息 + """ + self.data_pipeline = data_pipeline + self.task = task + self.analyzer = analyzer or ResultAnalyzer() + self.output_config = output_config or {} + self.verbose = verbose + self.results: Optional[pl.DataFrame] = None + + def run( + self, + engine: FactorEngine, + date_range: Dict[str, Tuple[str, str]], + ) -> pl.DataFrame: + """执行完整训练流程 + + Args: + engine: FactorEngine 实例 + date_range: 日期范围字典 + { + "train": (start_date, end_date), + "val": (start_date, end_date), + "test": (start_date, end_date), + } + + Returns: + 训练结果数据框 + """ + if self.verbose: + print("\n" + "=" * 80) + print(f"开始训练: {self.task.__class__.__name__}") + print("=" * 80) + + # Step 1: 准备数据 + if self.verbose: + print("\n[Step 1/7] 准备数据...") + + data = self.data_pipeline.prepare_data( + engine=engine, + date_range=date_range, + label_name=self.task.label_name, + verbose=self.verbose, + ) + + # Step 2: 处理标签 + if self.verbose: + print("\n[Step 2/7] 处理标签...") + + data = self.task.prepare_labels(data) + + # Step 3: 训练模型 + if self.verbose: + print("\n[Step 3/7] 训练模型...") + + self.task.fit(data["train"], data["val"]) + + # Step 4: 绘制训练指标 + if self.verbose: + print("\n[Step 4/7] 绘制训练指标...") + + self.task.plot_training_metrics() + + # Step 5: 生成预测 + if self.verbose: + print("\n[Step 5/7] 生成预测...") + + predictions = self.task.predict(data["test"]) + + # Step 6: 分析结果 + if self.verbose: + print("\n[Step 6/7] 分析结果...") + + # 特征重要性 + self.analyzer.analyze_feature_importance( + model=self.task.get_model(), + feature_cols=data["test"]["feature_cols"], + top_n=20, + verbose=self.verbose, + ) + + # NDCG 评估(排序任务特有) + if hasattr(self.task, "evaluate_ndcg"): + ndcg_scores = self.task.evaluate_ndcg(data["test"]) + if self.verbose: + print("\nNDCG 评估结果:") + for metric, score in ndcg_scores.items(): + print(f" {metric}: {score:.4f}") + + # 组装结果 + self.results = self.analyzer.assemble_results( + test_data=data["test"], + predictions=predictions, + top_n=self.output_config.get("top_n", 50), + verbose=self.verbose, + ) + + # Step 7: 保存结果 + if self.verbose: + print("\n[Step 7/7] 保存结果...") + + if self.output_config.get("save_predictions", True): + self._save_predictions() + + if self.output_config.get("save_model", False): + self._save_model() + + if self.verbose: + print("\n" + "=" * 80) + print("训练完成!") + print("=" * 80) + + return self.results + + def _save_predictions(self) -> None: + """保存预测结果""" + output_dir = self.output_config.get("output_dir", "experiment/output") + output_filename = self.output_config.get("output_filename", "output.csv") + output_path = os.path.join(output_dir, output_filename) + + self.analyzer.save_results( + results=self.results, + output_path=output_path, + verbose=self.verbose, + ) + + def _save_model(self) -> None: + """保存模型""" + model_save_path = self.output_config.get("model_save_path") + if not model_save_path: + return + + # 确保目录存在 + os.makedirs(os.path.dirname(model_save_path), exist_ok=True) + + # 获取模型和相关信息 + model = self.task.get_model() + + # 保存模型 + model.save(model_save_path) + + if 
self.verbose: + print(f" 模型保存路径: {model_save_path}") + + def get_results(self) -> Optional[pl.DataFrame]: + """获取训练结果 + + Returns: + 训练结果数据框,如果尚未训练则返回 None + """ + return self.results + + def get_task(self) -> BaseTask: + """获取任务实例 + + Returns: + 任务实例 + """ + return self.task
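
The new modules above are only shown in isolation in this diff; the short sketch below illustrates how they are intended to be wired together. The APIs it uses (`Trainer`, `RegressionTask`, `RankTask`, `ResultAnalyzer`, the `date_range` dict shape and the `output_config` keys) are taken directly from the code above; the `FactorEngine` construction and the `DataPipeline` constructor arguments are **not** shown in this diff, so those lines are illustrative assumptions only, as are the model parameters, dates, and file name.

```python
# Usage sketch (not part of the diff): wiring the new components end to end.
# Lines marked "hypothetical" use names or arguments not confirmed by this diff.

from src.factors import FactorEngine
from src.training.pipeline import DataPipeline
from src.training.tasks import RegressionTask, RankTask
from src.training.result_analyzer import ResultAnalyzer
from src.training.trainer_v2 import Trainer

# Hypothetical: how a configured FactorEngine is obtained is not shown here.
engine = FactorEngine()

# Hypothetical constructor arguments; the diff only shows DataPipeline's
# prepare_data/_preprocess methods, not its __init__ signature.
pipeline = DataPipeline(processor_configs=[])

# Regression flow: task only needs model params and the label column name.
task = RegressionTask(
    model_params={"objective": "regression", "verbose": -1},  # illustrative params
    label_name="future_return_5",
)

trainer = Trainer(
    data_pipeline=pipeline,
    task=task,
    analyzer=ResultAnalyzer(),
    output_config={
        "output_dir": "experiment/output",
        "output_filename": "regression_topn.csv",  # hypothetical file name
        "save_predictions": True,
        "save_model": False,
        "top_n": 50,
    },
    verbose=True,
)

# date_range follows the shape documented in Trainer.run();
# the dates here are placeholders, real values come from common.py.
results = trainer.run(
    engine=engine,
    date_range={
        "train": ("20180101", "20221231"),
        "val": ("20230101", "20230630"),
        "test": ("20230701", "20231231"),
    },
)

# Switching to the ranking flow only swaps the task instance; everything
# else (pipeline, analyzer, output_config) stays the same.
rank_task = RankTask(
    model_params={"objective": "lambdarank", "metric": "ndcg", "verbose": -1},
    label_name="future_return_5",
    n_quantiles=20,
)
```

That the two flows differ only in the `BaseTask` implementation passed to `Trainer` is the intended payoff of the composition-based design: the duplicated logic in regression.py and learn_to_rank.py collapses into two small task classes plus shared pipeline, analyzer, and scheduling code.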