feat(factors): integrate metadata module; support registering factors by name

- Add add_factor_by_name(), which looks up a DSL expression in metadata and registers it
- FactorEngine now accepts an optional metadata_path argument at initialization
- Convert regression.ipynb and learn_to_rank.ipynb to Python scripts
- Add test file test_factor_engine_metadata.py
2026-03-11 22:54:52 +08:00
parent 038f5f1722
commit 2bb7718dd1
7 changed files with 2085 additions and 3101 deletions


@@ -0,0 +1,792 @@
# %% md
# # Learn-to-Rank Training Pipeline
#
# This notebook implements learn-to-rank training with LightGBM LambdaRank for a stock-ranking task.
#
# ## Key Features
#
# 1. **Label conversion**: bucket `future_return_5` into 20 per-day quantiles (qcut-style)
# 2. **Ranking objective**: use the LambdaRank objective to learn the daily stock ordering
# 3. **NDCG evaluation**: assess ranking quality with NDCG@1/5/10/20
# 4. **Strategy backtest**: build a top-k stock-selection strategy from the ranking scores
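# %% md
# > Editor's note (illustrative, not part of the pipeline): the per-day bucketing
# > below is implemented in polars with `rank` + `floor`; on a single day's data
# > it is roughly what pandas `qcut` with integer labels would produce:
# >
# > ```python
# > import pandas as pd
# > rets = pd.Series([0.01, -0.02, 0.03, 0.00, -0.01, 0.02, 0.05, -0.03])
# > labels = pd.qcut(rets.rank(method="first"), q=4, labels=False)  # values in 0..3
# > ```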
# %% md
# ## 1. Imports
# %%
import os
from datetime import datetime
from typing import List, Tuple, Optional
import numpy as np
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import ndcg_score
from src.factors import FactorEngine
from src.training import (
    DateSplitter,
    STFilter,
    StockPoolManager,
    Trainer,
    Winsorizer,
    NullFiller,
    StandardScaler,
)
from src.training.components.models import LightGBMLambdaRankModel
from src.training.config import TrainingConfig
# %% md
# ## 2. Helper Functions
# %%
def create_factors_with_metadata(
    engine: FactorEngine, factor_definitions: dict, label_factor: dict
) -> List[str]:
    """Register factors via metadata (features by name, the label by expression)."""
    print("=" * 80)
    print("Registering factors via metadata")
    print("=" * 80)
    # Register every feature factor by its metadata name
    print("\nRegistering feature factors (from metadata):")
    for name in factor_definitions.keys():
        engine.add_factor_by_name(name)
        print(f" - {name}")
    # Register the label factor by expression (labels are not kept in metadata)
    print("\nRegistering label factor (by expression):")
    for name, expr in label_factor.items():
        engine.add_factor(name, expr)
        print(f" - {name}: {expr}")
    # Derive the feature column list from the definitions dict
    feature_cols = list(factor_definitions.keys())
    print(f"\nNumber of feature factors: {len(feature_cols)}")
    print(f"Label: {list(label_factor.keys())[0]}")
    print(f"Total factors registered: {len(engine.list_registered())}")
    return feature_cols


def prepare_data(
    engine: FactorEngine,
    feature_cols: List[str],
    start_date: str,
    end_date: str,
) -> pl.DataFrame:
    """Prepare the dataset."""
    print("\n" + "=" * 80)
    print("Preparing data")
    print("=" * 80)
    # Compute factors over the full market
    print(f"\nComputing factors: {start_date} - {end_date}")
    factor_names = feature_cols + [LABEL_NAME]  # include the label
    data = engine.compute(
        factor_names=factor_names,
        start_date=start_date,
        end_date=end_date,
    )
    print(f"Data shape: {data.shape}")
    print(f"Columns: {data.columns}")
    print("\nFirst 5 rows:")
    print(data.head())
    return data
def prepare_ranking_data(
    df: pl.DataFrame,
    label_col: str = "future_return_5",
    date_col: str = "trade_date",
    n_quantiles: int = 20,
) -> Tuple[pl.DataFrame, str]:
    """Prepare learn-to-rank data.

    Converts a continuous label into per-day quantile labels for the ranking task.

    Args:
        df: Raw data
        label_col: Name of the raw label column
        date_col: Name of the date column
        n_quantiles: Number of quantile buckets

    Returns:
        (transformed DataFrame, new label column name)
    """
    print("\n" + "=" * 80)
    print(f"Preparing ranking data (bucketing {label_col} into {n_quantiles} quantile labels)")
    print("=" * 80)
    # Name of the new label column
    rank_col = f"{label_col}_rank"
    # Quantile bucketing within each date:
    # rank generates labels 0, 1, 2, ..., n_quantiles-1 —
    # compute the within-day rank, then map it onto n_quantiles buckets
    df_ranked = (
        df.with_columns(
            # 1-based rank within each day
            pl.col(label_col).rank(method="min").over(date_col).alias("_rank")
        )
        .with_columns(
            # Map the rank to a quantile label (0 to n_quantiles-1)
            ((pl.col("_rank") - 1) / pl.len().over(date_col) * n_quantiles)
            .floor()
            .cast(pl.Int64)
            .clip(0, n_quantiles - 1)
            .alias(rank_col)
        )
        .drop("_rank")
    )
    # Sanity-check the conversion
    print(f"\nRaw {label_col} stats:")
    print(df_ranked[label_col].describe())
    print(f"\nConverted {rank_col} stats:")
    print(df_ranked[rank_col].describe())
    # Per-day sample counts
    print("\nDaily sample count stats:")
    daily_counts = df_ranked.group_by(date_col).agg(pl.len().alias("count"))
    print(daily_counts["count"].describe())
    # Label distribution (should be roughly uniform)
    print("\nQuantile label distribution:")
    rank_dist = df_ranked[rank_col].value_counts().sort(rank_col)
    print(rank_dist)
    return df_ranked, rank_col
def compute_group_array(df: pl.DataFrame, date_col: str = "trade_date") -> np.ndarray:
    """Build the group array for LambdaRank.

    Each date is one query; the group array holds the sample count per query.

    Args:
        df: Data frame
        date_col: Name of the date column

    Returns:
        Group array
    """
    group_counts = df.group_by(date_col, maintain_order=True).agg(
        pl.len().alias("count")
    )
    return group_counts["count"].to_numpy()
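# %%
# Quick illustrative check (editor's addition): with two trading days of 3 and 2
# rows, compute_group_array() yields [3, 2] — one LambdaRank query per day.
_demo_groups = pl.DataFrame(
    {"trade_date": ["20240101"] * 3 + ["20240102"] * 2, "x": [1, 2, 3, 4, 5]}
)
print(compute_group_array(_demo_groups))  # -> [3 2]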
def evaluate_ndcg_at_k(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    group: np.ndarray,
    k_list: List[int] = [1, 5, 10, 20],
) -> dict:
    """Compute NDCG@k metrics.

    Args:
        y_true: Ground-truth labels
        y_pred: Predicted scores
        group: Group (query) sizes
        k_list: Values of k to evaluate

    Returns:
        Dict of NDCG metrics
    """
    results = {}
    # Split the arrays by group
    start_idx = 0
    y_true_groups = []
    y_pred_groups = []
    for group_size in group:
        end_idx = start_idx + group_size
        y_true_groups.append(y_true[start_idx:end_idx])
        y_pred_groups.append(y_pred[start_idx:end_idx])
        start_idx = end_idx
    # Average NDCG for each k
    for k in k_list:
        ndcg_scores = []
        for yt, yp in zip(y_true_groups, y_pred_groups):
            if len(yt) > 1:
                try:
                    score = ndcg_score([yt], [yp], k=k)
                    ndcg_scores.append(score)
                except ValueError:
                    # All labels identical; NDCG is undefined
                    pass
        results[f"ndcg@{k}"] = np.mean(ndcg_scores) if ndcg_scores else 0.0
    return results
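# %%
# Quick illustrative check (editor's addition): when the predicted order matches
# the true order within a single group, NDCG is exactly 1.0.
_yt = np.array([3.0, 2.0, 1.0, 0.0])
_yp = np.array([0.9, 0.6, 0.3, 0.1])
print(evaluate_ndcg_at_k(_yt, _yp, group=np.array([4]), k_list=[2]))  # {'ndcg@2': 1.0}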
# %% md
# ## 3. Configuration
#
# ### 3.1 Factor Definitions
# %%
# Feature factor definitions (reusing the factor set from regression.ipynb)
LABEL_NAME = "future_return_5_rank"
FACTOR_DEFINITIONS = {
    # ================= 1. Price, Trend & Path Dependency =================
    "ma_5": "ts_mean(close, 5)",
    "ma_20": "ts_mean(close, 20)",
    "ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1",
    "bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1",
    "high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)",
    "bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)",
    "return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1",
    "return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1",
    "kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)",
    "mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)",
    "drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1",
    "up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20",
    # ================= 2. Volatility, Risk-Adjusted Returns & Higher Moments =================
    "volatility_5": "ts_std(close, 5)",
    "volatility_20": "ts_std(close, 20)",
    "volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)",
    "std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)",
    "sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)",
    "min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
    "volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)",
    # ================= 3. Intraday Microstructure & Anomalies =================
    "overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)",
    "upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)",
    "capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)",
    "max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
    # ================= 4. Volume, Liquidity & Price-Volume Divergence =================
    "volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)",
    "turnover_rate_mean_5": "ts_mean(turnover_rate, 5)",
    "turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)",
    "amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)",
    "turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)",
    "pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)",
    "close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1",
    # ================= 5. Fundamental Quality & Structure =================
    "roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)",
    "roa": "n_income / (total_assets + 1e-8)",
    "profit_margin": "n_income / (revenue + 1e-8)",
    "debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)",
    "current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)",
    "net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1",
    "revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1",
    "healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)",
    # ================= 6. Valuation & Cross-Sectional Momentum =================
    "EP": "n_income / (total_mv * 10000 + 1e-8)",
    "BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)",
    "CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)",
    "market_cap_rank": "cs_rank(total_mv)",
    "turnover_rank": "cs_rank(turnover_rate)",
    "return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)",
    "EP_rank": "cs_rank(n_income / (total_mv + 1e-8))",
    "pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1",
    "value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))",
    "active_market_cap": "total_mv * ts_mean(turnover_rate, 20)",
    "ebit_rank": "cs_rank(ebit)",
}
# Label factor (not a training feature; used to build the target)
LABEL_FACTOR = {
    LABEL_NAME: "(ts_delay(close, -5) / ts_delay(open, -1)) - 1",
}
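# Reading the label (editor's note): assuming ts_delay with a negative lag looks
# forward (as the regression script's "forward 5-day return" comment indicates),
# this is the return from the next day's open to the close five days ahead —
# i.e. a tradeable "buy at tomorrow's open, hold five days" target.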
# %% md
# ### 3.2 Training Configuration
# %%
# Date ranges (proper train/val/test three-way split)
TRAIN_START = "20200101"
TRAIN_END = "20231231"
VAL_START = "20240101"
VAL_END = "20241231"
TEST_START = "20250101"
TEST_END = "20251231"
# LambdaRank model parameters
MODEL_PARAMS = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_at": [1, 5, 10, 20],  # evaluate NDCG@k
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": 6,
    "min_data_in_leaf": 20,
    "n_estimators": 1000,
    "early_stopping_rounds": 50,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "verbose": -1,
    "random_state": 42,
}
# Quantile setting
N_QUANTILES = 20  # bucket the label into 20 groups
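# Note (editor's addition): LightGBM's lambdarank objective expects non-negative
# integer relevance labels, and its default label_gain table only covers labels
# 0..30, so N_QUANTILES = 20 (labels 0..19) stays safely within that limit.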
# Feature columns (used by the data processors)
FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())
# Data processor configuration
PROCESSORS = [
    NullFiller(feature_cols=FEATURE_COLS, strategy="mean"),
    Winsorizer(feature_cols=FEATURE_COLS, lower=0.01, upper=0.99),
    StandardScaler(feature_cols=FEATURE_COLS),
]
# Stock pool filter function
def stock_pool_filter(df: pl.DataFrame) -> pl.Series:
    """Stock pool filter (applied to one day of data).

    Criteria:
    1. Exclude ChiNext (codes starting with 30)
    2. Exclude STAR Market (codes starting with 68)
    3. Exclude BSE (codes starting with 8, 9, or 4)
    4. Keep the (up to) 1,000 smallest stocks by market cap on the day
    """
    code_filter = (
        ~df["ts_code"].str.starts_with("30")
        & ~df["ts_code"].str.starts_with("68")
        & ~df["ts_code"].str.starts_with("8")
        & ~df["ts_code"].str.starts_with("9")
        & ~df["ts_code"].str.starts_with("4")
    )
    valid_df = df.filter(code_filter)
    n = min(1000, len(valid_df))
    small_cap_codes = valid_df.sort("total_mv").head(n)["ts_code"]
    return df["ts_code"].is_in(small_cap_codes)
STOCK_FILTER_REQUIRED_COLUMNS = ["total_mv"]
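# %%
# Quick illustrative check (editor's addition): on a toy single-day frame, the
# board-code exclusion plus the market-cap cut keeps only the main-board stock.
_demo_day = pl.DataFrame(
    {
        "ts_code": ["000001.SZ", "300001.SZ", "688001.SH"],
        "total_mv": [100.0, 50.0, 10.0],
    }
)
print(stock_pool_filter(_demo_day).to_list())  # [True, False, False]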
# Output configuration
OUTPUT_DIR = "output"
SAVE_PREDICTIONS = True
PERSIST_MODEL = False
# Top-N: number of stocks recommended per day
TOP_N = 5  # adjustable, e.g. 10 or 20
# %% md
# ## 4. Training Pipeline
# %%
print("\n" + "=" * 80)
print("LightGBM LambdaRank training")
print("=" * 80)
# 1. Create the FactorEngine (metadata enabled)
print("\n[1] Creating FactorEngine")
engine = FactorEngine(metadata_path="data/factors.jsonl")
# 2. Define factors from metadata
print("\n[2] Defining factors (registered from metadata)")
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
# 3. Prepare the data
print("\n[3] Preparing data")
data = prepare_data(
    engine=engine,
    feature_cols=feature_cols,
    start_date=TRAIN_START,
    end_date=TEST_END,
)
# 4. Convert to the ranking format (quantile labels)
print("\n[4] Converting to ranking format")
data, target_col = prepare_ranking_data(
    df=data,
    label_col=LABEL_NAME,
    n_quantiles=N_QUANTILES,
)
# 5. Print the configuration
print(f"\n[Config] Train period: {TRAIN_START} - {TRAIN_END}")
print(f"[Config] Validation period: {VAL_START} - {VAL_END}")
print(f"[Config] Test period: {TEST_START} - {TEST_END}")
print(f"[Config] Features: {len(feature_cols)}")
print(f"[Config] Target: {target_col} ({N_QUANTILES} quantiles)")
# 6. Create the ranking model
model = LightGBMLambdaRankModel(params=MODEL_PARAMS)
# 7. Data processors
processors = PROCESSORS
# 8. Data splitter
splitter = DateSplitter(
    train_start=TRAIN_START,
    train_end=TRAIN_END,
    val_start=VAL_START,
    val_end=VAL_END,
    test_start=TEST_START,
    test_end=TEST_END,
)
# 9. Stock pool manager
pool_manager = StockPoolManager(
    filter_func=stock_pool_filter,
    required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
    data_router=engine.router,
)
# 10. ST filter
st_filter = STFilter(data_router=engine.router)
# 11. Trainer
trainer = Trainer(
    model=model,
    pool_manager=pool_manager,
    processors=processors,
    filters=[st_filter],
    splitter=splitter,
    target_col=target_col,
    feature_cols=feature_cols,
    persist_model=PERSIST_MODEL,
)
# %% md
# ### 4.1 Stock Pool Filtering
# %%
print("\n" + "=" * 80)
print("Stock pool filtering")
print("=" * 80)
# Apply the ST filter first (before pool selection, matching Trainer.train())
if st_filter:
    print("\n[Filter] Applying ST filter...")
    data = st_filter.filter(data)
    print(f" Data shape after ST filter: {data.shape}")
if pool_manager:
    print("\nRunning per-day stock pool selection...")
    filtered_data = pool_manager.filter_and_select_daily(data)
    print(f" Shape before filtering: {data.shape}")
    print(f" Shape after filtering: {filtered_data.shape}")
    print(f" Stocks before filtering: {data['ts_code'].n_unique()}")
    print(f" Stocks after filtering: {filtered_data['ts_code'].n_unique()}")
    print(f" Records removed: {len(data) - len(filtered_data)}")
else:
    filtered_data = data
    print(" No pool manager configured; skipping")
# %% md
# ### 4.2 Data Split
# %%
print("\n" + "=" * 80)
print("Data split")
print("=" * 80)
if splitter:
    train_data, val_data, test_data = splitter.split(filtered_data)
    print(f"\nTrain shape: {train_data.shape}")
    print(f"Validation shape: {val_data.shape}")
    print(f"Test shape: {test_data.shape}")
    # Group arrays for each split
    train_group = compute_group_array(train_data)
    val_group = compute_group_array(val_data)
    test_group = compute_group_array(test_data)
    print(f"\nTrain groups: {len(train_group)}")
    print(f"Validation groups: {len(val_group)}")
    print(f"Test groups: {len(test_group)}")
    print(f"Train mean samples/day: {np.mean(train_group):.1f}")
    print(f"Validation mean samples/day: {np.mean(val_group):.1f}")
    print(f"Test mean samples/day: {np.mean(test_group):.1f}")
else:
    raise ValueError("A data splitter must be configured")
# %% md
# ### 4.3 Preprocessing
# %%
print("\n" + "=" * 80)
print("Preprocessing")
print("=" * 80)
fitted_processors = []
if processors:
    print("\nProcessing train set...")
    for i, processor in enumerate(processors, 1):
        print(f" [{i}/{len(processors)}] {processor.__class__.__name__}")
        train_data = processor.fit_transform(train_data)
        fitted_processors.append(processor)
    print("\nProcessing validation set...")
    for processor in fitted_processors:
        val_data = processor.transform(val_data)
    print("\nProcessing test set...")
    for processor in fitted_processors:
        test_data = processor.transform(test_data)
print(f"\nTrain shape after processing: {train_data.shape}")
print(f"Validation shape after processing: {val_data.shape}")
print(f"Test shape after processing: {test_data.shape}")
# %% md
# ### 4.4 Train the LambdaRank Model
# %%
print("\n" + "=" * 80)
print("Training the LambdaRank model")
print("=" * 80)
# Prepare the arrays
X_train = train_data.select(feature_cols)
y_train = train_data.select(target_col).to_series()
X_val = val_data.select(feature_cols)
y_val = val_data.select(target_col).to_series()
print(f"\nTrain samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Features: {len(feature_cols)}")
print(f"Target: {target_col}")
print("\nTarget stats (train set):")
print(y_train.describe())
print("\nTraining...")
model.fit(
    X=X_train,
    y=y_train,
    group=train_group,
    eval_set=(X_val, y_val, val_group),
)
print("Training done!")
# %% md
# ### 4.5 Training Metric Curves
# %%
print("\n" + "=" * 80)
print("Training metric curves")
print("=" * 80)
# Retrain to collect metrics (the previous fit did not record eval results)
print("\nRetraining to collect training metrics...")
import lightgbm as lgb

# Prepare arrays (val is used for validation; test never touches training)
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
X_val_np = val_data.select(feature_cols).to_numpy()
y_val_np = val_data.select(target_col).to_series().to_numpy()
# Build the datasets
train_dataset = lgb.Dataset(X_train_np, label=y_train_np, group=train_group)
val_dataset = lgb.Dataset(
    X_val_np, label=y_val_np, group=val_group, reference=train_dataset
)
# Holds the evaluation history
evals_result = {}
# Retrain with the same parameters as the original model.
# Proper three-way split: train fits, val validates, test stays out of training.
booster_with_eval = lgb.train(
    MODEL_PARAMS,
    train_dataset,
    num_boost_round=MODEL_PARAMS.get("n_estimators", 1000),
    valid_sets=[train_dataset, val_dataset],
    valid_names=["train", "val"],
    callbacks=[
        lgb.record_evaluation(evals_result),
        lgb.early_stopping(stopping_rounds=50, verbose=True),
    ],
)
print("Training finished; metrics collected")
# Collect the evaluated NDCG metrics
ndcg_metrics = [k for k in evals_result["train"].keys() if "ndcg" in k]
print(f"\nEvaluated NDCG metrics: {ndcg_metrics}")
# Early-stopping summary
actual_rounds = len(list(evals_result["train"].values())[0])
expected_rounds = MODEL_PARAMS.get("n_estimators", 1000)
print("\n[Early stopping]")
print(f" Configured max rounds: {expected_rounds}")
print(f" Actual rounds trained: {actual_rounds}")
if actual_rounds < expected_rounds:
    print(" Status: triggered (no validation improvement for 50 consecutive rounds)")
else:
    print(" Status: not triggered (reached max rounds)")
# Final value of each NDCG metric
print("\nFinal NDCG metrics:")
for metric in ndcg_metrics:
    train_ndcg = evals_result["train"][metric][-1]
    val_ndcg = evals_result["val"][metric][-1]
    print(f" {metric}: train={train_ndcg:.4f}, val={val_ndcg:.4f}")
# %%
# Plot the NDCG training curves
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for idx, metric in enumerate(ndcg_metrics[:4]):  # show at most 4 NDCG metrics
    ax = axes[idx]
    train_metric = evals_result["train"][metric]
    val_metric = evals_result["val"][metric]
    iterations = range(1, len(train_metric) + 1)
    ax.plot(
        iterations, train_metric, label=f"Train {metric}", linewidth=2, color="blue"
    )
    ax.plot(iterations, val_metric, label=f"Val {metric}", linewidth=2, color="red")
    ax.set_xlabel("Iteration", fontsize=10)
    ax.set_ylabel(metric.upper(), fontsize=10)
    ax.set_title(
        f"Training and Validation {metric.upper()}", fontsize=12, fontweight="bold"
    )
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    # Mark the best validation point
    best_iter = val_metric.index(max(val_metric))
    best_metric = max(val_metric)
    ax.axvline(x=best_iter + 1, color="green", linestyle="--", alpha=0.7)
    ax.scatter([best_iter + 1], [best_metric], color="green", s=80, zorder=5)
    ax.annotate(
        f"Best: {best_metric:.4f}",
        xy=(best_iter + 1, best_metric),
        xytext=(best_iter + 1 + len(iterations) * 0.05, best_metric),
        fontsize=8,
        arrowprops=dict(arrowstyle="->", color="green", alpha=0.7),
    )
plt.tight_layout()
plt.show()
print("\n[Metric analysis]")
print(" Best validation value of each NDCG metric:")
for metric in ndcg_metrics:
    val_metric_list = evals_result["val"][metric]
    best_iter = val_metric_list.index(max(val_metric_list))
    best_val = max(val_metric_list)
    print(f" {metric}: {best_val:.4f} (iteration {best_iter + 1})")
print("\n[Reminder] The validation set is only for early stopping/tuning; the test set stays fully independent of training!")
# %% md
# ### 4.6 Model Evaluation
# %%
print("\n" + "=" * 80)
print("Model evaluation")
print("=" * 80)
# Prepare the test set
X_test = test_data.select(feature_cols)
y_test = test_data.select(target_col).to_series()
# Predict
print("\nGenerating predictions...")
predictions = model.predict(X_test)
# Attach the prediction column
test_data = test_data.with_columns([pl.Series("prediction", predictions)])
# NDCG metrics
print("\nComputing NDCG metrics...")
ndcg_results = evaluate_ndcg_at_k(
    y_true=y_test.to_numpy(),
    y_pred=predictions,
    group=test_group,
    k_list=[1, 5, 10, 20],
)
print("\nNDCG results:")
print("-" * 40)
for metric, value in ndcg_results.items():
    print(f" {metric}: {value:.4f}")
# Feature importance
print("\nFeature importance (top 20):")
print("-" * 40)
importance = model.feature_importance()
if importance is not None:
    top_features = importance.sort_values(ascending=False).head(20)
    for i, (feature, score) in enumerate(top_features.items(), 1):
        print(f" {i:2d}. {feature:30s} {score:10.2f}")
# %%
# Make sure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Build a date-range stamp
start_dt = datetime.strptime(TEST_START, "%Y%m%d")
end_dt = datetime.strptime(TEST_END, "%Y%m%d")
date_str = f"{start_dt.strftime('%Y%m%d')}_{end_dt.strftime('%Y%m%d')}"
# Save the daily top N
print(f"\n[1/1] Saving daily top {TOP_N} stocks...")
topn_output_path = os.path.join(OUTPUT_DIR, "rank_output.csv")
# Group by date and take the daily top N
topn_by_date = []
unique_dates = test_data["trade_date"].unique().sort()
for date in unique_dates:
    day_data = test_data.filter(test_data["trade_date"] == date)
    # Sort by prediction descending and keep the top N
    topn = day_data.sort("prediction", descending=True).head(TOP_N)
    topn_by_date.append(topn)
# Concatenate all dates
topn_results = pl.concat(topn_by_date)
# Format the date and order the columns: date, score, stock.
# Note: alias the whole concatenation, not just the last slice, so the output
# column is actually named "date".
topn_to_save = topn_results.select(
    [
        (
            pl.col("trade_date").str.slice(0, 4)
            + "-"
            + pl.col("trade_date").str.slice(4, 2)
            + "-"
            + pl.col("trade_date").str.slice(6, 2)
        ).alias("date"),
        pl.col("prediction").alias("score"),
        pl.col("ts_code"),
    ]
)
topn_to_save.write_csv(topn_output_path, include_header=True)
print(f" Saved to: {topn_output_path}")
print(
    f" Rows saved: {len(topn_to_save)} ({len(unique_dates)} trading days × top {TOP_N} per day)"
)
print("\n First 15 rows:")
print(topn_to_save.head(15))
print("\nTraining pipeline complete!")
# %% md
# ## 5. Summary
#
# This notebook implements a complete learn-to-rank training pipeline:
#
# ### Core Steps
#
# 1. **Data preparation**: compute 49 feature factors; bucket `future_return_5` into 20 quantile labels
# 2. **Model training**: learn the daily stock ordering with LightGBM LambdaRank
# 3. **Model evaluation**: assess ranking quality with NDCG@1/5/10/20
# 4. **Strategy analysis**: build a top-k stock-selection strategy from the ranking scores
#
# ### Key Parameters
#
# - **Objective**: lambdarank
# - **Metric**: ndcg
# - **Learning Rate**: 0.05
# - **Num Leaves**: 31
# - **N Quantiles**: 20
#
# ### Outputs
#
# - rank_output.csv: daily top-N stock picks (columns: date, score, ts_code)
# - Feature importance ranking
# - Top-k strategy statistics and charts
# - NDCG training curves
#
# ### Next Steps
#
# 1. **Feature engineering**: try more factor combinations
# 2. **Hyperparameter tuning**: grid-search the LambdaRank parameters
# 3. **Model ensembling**: combine predictions from several rankers
# 4. **Richer grouping**: consider ranking within industry groups


@@ -0,0 +1,751 @@
# %% md
# ## 1. Imports
# %%
import os
from datetime import datetime
from typing import List
import polars as pl
from src.factors import FactorEngine
from src.training import (
    DateSplitter,
    LightGBMModel,
    STFilter,
    StandardScaler,
    # StockFilterConfig,  # removed; replaced by StockPoolManager + filter_func
    StockPoolManager,
    Trainer,
    Winsorizer,
    NullFiller,
)
from src.training.config import TrainingConfig
# %% md
# ## 2. Helper Functions
# %%
def create_factors_with_metadata(
    engine: FactorEngine, factor_definitions: dict, label_factor: dict
) -> List[str]:
    """Register factors via metadata (features by name, the label by expression)."""
    print("=" * 80)
    print("Registering factors via metadata")
    print("=" * 80)
    # Register every feature factor by its metadata name
    print("\nRegistering feature factors (from metadata):")
    for name in factor_definitions.keys():
        engine.add_factor_by_name(name)
        print(f" - {name}")
    # Register the label factor by expression (labels are not kept in metadata)
    print("\nRegistering label factor (by expression):")
    for name, expr in label_factor.items():
        engine.add_factor(name, expr)
        print(f" - {name}: {expr}")
    # Derive the feature column list from the definitions dict
    feature_cols = list(factor_definitions.keys())
    print(f"\nNumber of feature factors: {len(feature_cols)}")
    print(f"Label: {list(label_factor.keys())[0]}")
    print(f"Total factors registered: {len(engine.list_registered())}")
    return feature_cols


def prepare_data(
    engine: FactorEngine,
    feature_cols: List[str],
    start_date: str,
    end_date: str,
) -> pl.DataFrame:
    print("\n" + "=" * 80)
    print("Preparing data")
    print("=" * 80)
    # Compute factors over the full market
    print(f"\nComputing factors: {start_date} - {end_date}")
    factor_names = feature_cols + [LABEL_NAME]  # include the label
    data = engine.compute(
        factor_names=factor_names,
        start_date=start_date,
        end_date=end_date,
    )
    print(f"Data shape: {data.shape}")
    print(f"Columns: {data.columns}")
    print("\nFirst 5 rows:")
    print(data.head())
    return data
# %% md
# ## 3. Configuration
#
# ### 3.1 Factor Definitions
# %%
# Feature factor definitions: adding a new factor only takes one new line here
LABEL_NAME = "future_return_5"
FACTOR_DEFINITIONS = {
    # ================= 1. Price, Trend & Path Dependency =================
    "ma_5": "ts_mean(close, 5)",
    "ma_20": "ts_mean(close, 20)",
    "ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1",  # MA divergence
    "bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1",  # 10-day bias
    "high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)",  # Williams %R variant
    "bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)",  # bull-bear index ratio
    "return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1",  # 5-day momentum
    "return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1",  # 20-day momentum
    # [Advanced] Kaufman efficiency ratio (very high value) - measures trend smoothness, filtering out choppy noise
    "kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)",
    # [Advanced] Momentum acceleration - finds stocks whose momentum is accelerating (positive second derivative)
    "mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)",
    # [Advanced] Distance from the 60-day high - proxies overhead supply pressure
    "drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1",
    # [Advanced] Trend consistency - share of up days over the past 20 days
    "up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20",
    # ================= 2. Volatility & Risk-Adjusted Returns =================
    "volatility_5": "ts_std(close, 5)",
    "volatility_20": "ts_std(close, 20)",
    "volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)",  # volatility term structure
    "std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)",  # realized return volatility
    # [Advanced] Sharpe-style trend ratio - penalizes spikes and crashes, rewards steady climbs
    "sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)",
    # [Advanced] Tail crash risk - worst single-day return over the past month
    "min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
    # [Advanced] Volatility squeeze ratio - finds tightly consolidating names about to break out (Bollinger squeeze)
    "volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)",
    # ================= 3. Intraday Microstructure & Anomalies =================
    # [Advanced] Overnight vs. intraday divergence - a smaller gap suggests accumulation during the session
    "overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)",
    # [Advanced] Upper-shadow selling pressure - probability of a failed rally trapping buyers
    "upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)",
    # [Advanced] Capital retention - how violently intraday ranges get washed out by high selling and low buying
    "capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)",
    # [Advanced] MAX lottery effect - reversal factor screening out recent runaway movers
    "max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
    # ================= 4. Volume, Liquidity & Price-Volume Divergence =================
    "volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)",  # relative volume surge
    "turnover_rate_mean_5": "ts_mean(turnover_rate, 5)",  # activity level
    "turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)",  # turnover deviation
    # [Advanced] Amihud illiquidity (core anomaly) - friction cost of pushing the price up or down
    "amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)",
    # [Advanced] Turnover penalty - erratic turnover implies hot-money rotation and an unstable rally
    "turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)",
    # [Advanced] Pure price-volume correlation - tests for healthy "up on volume, down on thin volume" action
    "pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)",
    # [Advanced] Close vs. VWAP divergence - flags late-session markup fakeouts
    "close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1",
    # ================= 5. Fundamental Quality & Structure =================
    "roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)",  # return on equity
    "roa": "n_income / (total_assets + 1e-8)",  # return on assets
    "profit_margin": "n_income / (revenue + 1e-8)",  # net profit margin
    "debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)",  # leverage
    "current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)",  # short-term solvency cushion
    # [Advanced] Net profit YoY growth (daily frequency; a 252-day lag approximates the same period last year)
    "net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1",
    # [Advanced] Revenue YoY growth
    "revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1",
    # [Advanced] Balance-sheet expansion slope - screens out companies growing purely on debt
    "healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)",
    # ================= 6. Valuation & Cross-Sectional Momentum =================
    # Absolute valuation levels (Tushare market cap is in units of 10,000 CNY, hence * 10000)
    "EP": "n_income / (total_mv * 10000 + 1e-8)",  # earnings yield (1/PE)
    "BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)",  # book-to-market (1/PB)
    "CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)",  # operating cash flow yield (1/PCF)
    # Full-market cross-sectional rank factors
    "market_cap_rank": "cs_rank(total_mv)",  # size factor
    "turnover_rank": "cs_rank(turnover_rate)",
    "return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)",
    "EP_rank": "cs_rank(n_income / (total_mv + 1e-8))",  # cheapest names
    # [Advanced] Davis double-play momentum - is the valuation expanding versus 60 days ago
    "pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1",
    # [Advanced] Earnings-price divergence - cross-sectional spread between profit-growth rank and 20-day price rank, to catch unfairly punished names
    "value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))",
    # [Advanced] Liquidity-adjusted market cap - separates zombie large caps from hyperactive micro caps
    "active_market_cap": "total_mv * ts_mean(turnover_rate, 20)",
    "ebit_rank": "cs_rank(ebit)",
}
# Label factor (not a training feature; defines the target)
LABEL_FACTOR = {
    LABEL_NAME: "(ts_delay(close, -5) / ts_delay(open, -1)) - 1",  # forward 5-day return
}
# %% md
# ### 3.2 Training Configuration
# %%
# Date ranges (proper train/val/test three-way split)
# Train: fits the model parameters
# Val: validation/early stopping/tuning (after train, before test)
# Test: final evaluation only, fully independent of training
TRAIN_START = "20200101"
TRAIN_END = "20231231"
VAL_START = "20240101"
VAL_END = "20241231"
TEST_START = "20250101"
TEST_END = "20261231"
# Model parameters
MODEL_PARAMS = {
    "objective": "regression",
    "metric": "mae",  # MAE is more robust to outliers
    # Tree-structure controls (the core overfitting defenses)
    # "num_leaves": 20,  # lowered from 31 to reduce model complexity
    # "max_depth": 16,  # explicit depth cap to avoid fitting noise
    # "min_child_samples": 50,  # minimum leaf size, avoids learning extreme samples
    # "min_child_weight": 0.001,
    # Learning parameters
    "learning_rate": 0.01,  # lower learning rate, paired with more trees
    "n_estimators": 1000,  # more trees, paired with early stopping
    # Sampling strategy (key overfitting defense)
    "subsample": 0.8,  # row sampling: 80% of the data per tree
    "subsample_freq": 5,  # resample every 5 iterations
    "colsample_bytree": 0.8,  # column sampling: 80% of features per tree
    # Regularization
    "reg_alpha": 0.1,  # L1, encourages sparsity
    "reg_lambda": 1.0,  # L2, smooths weights
    # Numerical stability
    "verbose": -1,
    "random_state": 42,
}
# Data processor configuration (new API: feature_cols must be passed in)
# Note: processors now need the feature columns spelled out explicitly.
# Stock pool filter function:
# the new StockPoolManager API takes a custom filter function plus the columns/factors it needs.
# The filter receives one day's DataFrame and returns a boolean Series.
#
# Filtering logic (per day):
# 1. Exclude ChiNext, STAR Market, and BSE first (ST filtering is handled by STFilter)
# 2. Then keep the smallest stocks by market cap
def stock_pool_filter(df: pl.DataFrame) -> pl.Series:
    """Stock pool filter (applied to one day of data).

    Criteria:
    1. Exclude ChiNext (codes starting with 30)
    2. Exclude STAR Market (codes starting with 68)
    3. Exclude BSE (codes starting with 8, 9, or 4)
    4. Keep the (up to) 1,000 smallest stocks by market cap on the day
    """
    # Board filter (exclude ChiNext, STAR Market, BSE)
    code_filter = (
        ~df["ts_code"].str.starts_with("30")  # exclude ChiNext
        & ~df["ts_code"].str.starts_with("68")  # exclude STAR Market
        & ~df["ts_code"].str.starts_with("8")  # exclude BSE
        & ~df["ts_code"].str.starts_with("9")  # exclude BSE
        & ~df["ts_code"].str.starts_with("4")  # exclude BSE
    )
    # Among the surviving stocks, keep the smallest by market cap
    # (sort ascending by total_mv and take the head)
    valid_df = df.filter(code_filter)
    n = min(1000, len(valid_df))
    small_cap_codes = valid_df.sort("total_mv").head(n)["ts_code"]
    # Boolean Series: is each stock among the selected codes?
    return df["ts_code"].is_in(small_cap_codes)


# Base columns required by the filter
STOCK_FILTER_REQUIRED_COLUMNS = ["total_mv"]  # ST filtering is handled by STFilter
# Optional: factors required by the filter (if factor-based filtering is needed)
# STOCK_FILTER_REQUIRED_FACTORS = {
#     "market_cap_rank": "cs_rank(total_mv)",
# }
# Output configuration (relative to this file's directory)
OUTPUT_DIR = "output"
SAVE_PREDICTIONS = True
PERSIST_MODEL = False
# Top-N: number of stocks recommended per day
TOP_N = 5  # adjustable, e.g. 10 or 20
# %% md
# ## 4. Training Pipeline
#
# ### 4.1 Initialize Components
# %%
print("\n" + "=" * 80)
print("LightGBM regression training")
print("=" * 80)
# 1. Create the FactorEngine (metadata enabled)
print("\n[1] Creating FactorEngine")
engine = FactorEngine(metadata_path="data/factors.jsonl")
# 2. Define factors from metadata
print("\n[2] Defining factors (registered from metadata)")
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
target_col = LABEL_NAME
# 3. Prepare the data (using the module-level date configuration)
print("\n[3] Preparing data")
data = prepare_data(
    engine=engine,
    feature_cols=feature_cols,
    start_date=TRAIN_START,
    end_date=TEST_END,
)
# 4. Print the configuration
print(f"\n[Config] Train period: {TRAIN_START} - {TRAIN_END}")
print(f"[Config] Validation period: {VAL_START} - {VAL_END}")
print(f"[Config] Test period: {TEST_START} - {TEST_END}")
print(f"[Config] Features: {len(feature_cols)}")
print(f"[Config] Target: {target_col}")
# 5. Create the model
model = LightGBMModel(params=MODEL_PARAMS)
# 6. Data processors (new API: feature_cols must be passed in)
processors = [
    NullFiller(feature_cols=feature_cols, strategy="mean"),
    Winsorizer(feature_cols=feature_cols, lower=0.01, upper=0.99),
    StandardScaler(feature_cols=feature_cols),
]
# 7. Data splitter (proper train/val/test three-way split)
# Train: fit parameters | Val: validation/early stopping | Test: final evaluation
splitter = DateSplitter(
    train_start=TRAIN_START,
    train_end=TRAIN_END,
    val_start=VAL_START,
    val_end=VAL_END,
    test_start=TEST_START,
    test_end=TEST_END,
)
# 8. Stock pool manager
# New API: pass the custom filter function and the columns it needs
pool_manager = StockPoolManager(
    filter_func=stock_pool_filter,
    required_columns=STOCK_FILTER_REQUIRED_COLUMNS,  # extra columns the filter needs
    # required_factors=STOCK_FILTER_REQUIRED_FACTORS,  # optional: factors the filter needs
    data_router=engine.router,
)
print("[Stock pool] Using a custom filter function")
print(f"[Stock pool] Required base columns: {STOCK_FILTER_REQUIRED_COLUMNS}")
print("[Stock pool] Logic: exclude ChiNext/STAR/BSE, then keep the 1,000 smallest caps per day")
# print(f"[Stock pool] Required factors: {list(STOCK_FILTER_REQUIRED_FACTORS.keys())}")
# 9. ST stock filter
st_filter = STFilter(
    data_router=engine.router,
)
# 10. Trainer
trainer = Trainer(
    model=model,
    pool_manager=pool_manager,
    processors=processors,
    filters=[st_filter],  # filter out ST stocks via STFilter
    splitter=splitter,
    target_col=target_col,
    feature_cols=feature_cols,
    persist_model=PERSIST_MODEL,
)
# %% md
# ### 4.2 Run Training
# %%
print("\n" + "=" * 80)
print("Starting training")
print("=" * 80)
# Step 1: stock pool filtering
print("\n[Step 1/6] Stock pool filtering")
print("-" * 60)
if pool_manager:
    print(" Running per-day stock pool selection...")
    filtered_data = pool_manager.filter_and_select_daily(data)
    print(f" Shape before filtering: {data.shape}")
    print(f" Shape after filtering: {filtered_data.shape}")
    print(f" Stocks before filtering: {data['ts_code'].n_unique()}")
    print(f" Stocks after filtering: {filtered_data['ts_code'].n_unique()}")
    print(f" Records removed: {len(data) - len(filtered_data)}")
else:
    filtered_data = data
    print(" No pool manager configured; skipping")
# %%
# Step 2: split into train/validation/test (proper three-way split)
print("\n[Step 2/6] Splitting train, validation, and test sets")
print("-" * 60)
if splitter:
    # Proper three-way split: train fits, val validates/early-stops, test only evaluates
    train_data, val_data, test_data = splitter.split(filtered_data)
    print(f" Train shape: {train_data.shape}")
    print(f" Validation shape: {val_data.shape}")
    print(f" Test shape: {test_data.shape}")
    print(f" Train stocks: {train_data['ts_code'].n_unique()}")
    print(f" Validation stocks: {val_data['ts_code'].n_unique()}")
    print(f" Test stocks: {test_data['ts_code'].n_unique()}")
    print(
        f" Train date range: {train_data['trade_date'].min()} - {train_data['trade_date'].max()}"
    )
    print(
        f" Validation date range: {val_data['trade_date'].min()} - {val_data['trade_date'].max()}"
    )
    print(
        f" Test date range: {test_data['trade_date'].min()} - {test_data['trade_date'].max()}"
    )
    print("\n Train head (5 rows):")
    print(train_data.head())
    print("\n Validation head (5 rows):")
    print(val_data.head())
    print("\n Test head (5 rows):")
    print(test_data.head())
else:
    train_data = filtered_data
    test_data = filtered_data
    print(" No splitter configured; using everything as the train set")
# %%
# Step 3: process the train set
print("\n[Step 3/6] Processing the train set")
print("-" * 60)
fitted_processors = []
if processors:
    for i, processor in enumerate(processors, 1):
        print(f" [{i}/{len(processors)}] Applying processor: {processor.__class__.__name__}")
        train_data_before = len(train_data)
        train_data = processor.fit_transform(train_data)
        train_data_after = len(train_data)
        fitted_processors.append(processor)
        print(f" Records before: {train_data_before}")
        print(f" Records after: {train_data_after}")
        if train_data_before != train_data_after:
            print(f" Records removed: {train_data_before - train_data_after}")
print("\n Train head after processing (5 rows):")
print(train_data.head())
print("\n Train feature stats:")
print(f" Features: {len(feature_cols)}")
print(f" Samples: {len(train_data)}")
print(" Null counts:")
for col in feature_cols[:5]:  # only show nulls for the first 5 features
    null_count = train_data[col].null_count()
    if null_count > 0:
        print(f" {col}: {null_count} ({null_count / len(train_data) * 100:.2f}%)")
# %%
# Step 4: train the model
print("\n[Step 4/6] Training the model")
print("-" * 60)
print(" Model type: LightGBM")
print(f" Train samples: {len(train_data)}")
print(f" Features: {len(feature_cols)}")
print(f" Target: {target_col}")
X_train = train_data.select(feature_cols)
y_train = train_data.select(target_col).to_series()
print("\n Target stats:")
print(f" Mean: {y_train.mean():.6f}")
print(f" Std: {y_train.std():.6f}")
print(f" Min: {y_train.min():.6f}")
print(f" Max: {y_train.max():.6f}")
print(f" Nulls: {y_train.null_count()}")
print("\n Training...")
model.fit(X_train, y_train)
print(" Done!")
# %%
# Step 5: process the test set
print("\n[Step 5/6] Processing the test set")
print("-" * 60)
if processors and test_data is not train_data:
    for i, processor in enumerate(fitted_processors, 1):
        print(
            f" [{i}/{len(fitted_processors)}] Applying processor: {processor.__class__.__name__}"
        )
        test_data_before = len(test_data)
        test_data = processor.transform(test_data)
        test_data_after = len(test_data)
        print(f" Records before: {test_data_before}")
        print(f" Records after: {test_data_after}")
else:
    print(" Skipping test-set processing")
# %%
# Step 6: generate predictions
print("\n[Step 6/6] Generating predictions")
print("-" * 60)
X_test = test_data.select(feature_cols)
print(f" Test samples: {len(X_test)}")
print(" Predicting...")
predictions = model.predict(X_test)
print(" Done!")
print("\n Prediction stats:")
print(f" Mean: {predictions.mean():.6f}")
print(f" Std: {predictions.std():.6f}")
print(f" Min: {predictions.min():.6f}")
print(f" Max: {predictions.max():.6f}")
# Stash the results on the trainer
trainer.results = test_data.with_columns([pl.Series("prediction", predictions)])
# %% md
# ### 4.3 Training Metric Curves
# %%
print("\n" + "=" * 80)
print("Training metric curves")
print("=" * 80)
# Retrain to collect metrics (the previous fit did not record eval results)
print("\nRetraining to collect training metrics...")
import lightgbm as lgb

# Prepare arrays (val is used for validation; test never touches training)
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
X_val_np = val_data.select(feature_cols).to_numpy()
y_val_np = val_data.select(target_col).to_series().to_numpy()
# Build the datasets
train_dataset = lgb.Dataset(X_train_np, label=y_train_np)
val_dataset = lgb.Dataset(X_val_np, label=y_val_np, reference=train_dataset)
# Holds the evaluation history
evals_result = {}
# Retrain with the same parameters as the original model.
# Proper three-way split: train fits, val validates, test stays out of training.
# Early stopping: halt if the validation metric fails to improve for 100 rounds.
booster_with_eval = lgb.train(
    MODEL_PARAMS,
    train_dataset,
    num_boost_round=MODEL_PARAMS.get("n_estimators", 100),
    valid_sets=[train_dataset, val_dataset],
    valid_names=["train", "val"],
    callbacks=[
        lgb.record_evaluation(evals_result),
        lgb.early_stopping(stopping_rounds=100, verbose=True),
    ],
)
print("Training finished; metrics collected")
# Metric name
metric_name = list(evals_result["train"].keys())[0]
print(f"\nEvaluation metric: {metric_name}")
# Extract the train and validation curves
train_metric = evals_result["train"][metric_name]
val_metric = evals_result["val"][metric_name]
# Early-stopping summary
actual_rounds = len(train_metric)
expected_rounds = MODEL_PARAMS.get("n_estimators", 100)
print("\n[Early stopping]")
print(f" Configured max rounds: {expected_rounds}")
print(f" Actual rounds trained: {actual_rounds}")
if actual_rounds < expected_rounds:
    print(" Status: triggered (no validation improvement for 100 consecutive rounds)")
else:
    print(" Status: not triggered (reached max rounds)")
print("\nFinal metrics:")
print(f" Train {metric_name}: {train_metric[-1]:.6f}")
print(f" Validation {metric_name}: {val_metric[-1]:.6f}")
# %%
# Plot the training metric curve
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))
# Train vs. validation curves (val is for validation; test stays out of training)
iterations = range(1, len(train_metric) + 1)
ax.plot(
    iterations, train_metric, label=f"Train {metric_name}", linewidth=2, color="blue"
)
ax.plot(
    iterations, val_metric, label=f"Validation {metric_name}", linewidth=2, color="red"
)
ax.set_xlabel("Iteration", fontsize=12)
ax.set_ylabel(metric_name.upper(), fontsize=12)
ax.set_title(
    f"Training and Validation {metric_name.upper()} Curve",
    fontsize=14,
    fontweight="bold",
)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
# Mark the best validation point (the early-stopping target)
best_iter = val_metric.index(min(val_metric))
best_metric = min(val_metric)
ax.axvline(
    x=best_iter + 1,
    color="green",
    linestyle="--",
    alpha=0.7,
    label=f"Best Iteration ({best_iter + 1})",
)
ax.scatter([best_iter + 1], [best_metric], color="green", s=100, zorder=5)
ax.annotate(
    f"Best: {best_metric:.6f}\nIter: {best_iter + 1}",
    xy=(best_iter + 1, best_metric),
    xytext=(best_iter + 1 + len(iterations) * 0.1, best_metric),
    fontsize=9,
    arrowprops=dict(arrowstyle="->", color="green", alpha=0.7),
)
plt.tight_layout()
plt.show()
print("\n[Metric analysis]")
print(f" Best validation {metric_name}: {best_metric:.6f}")
print(f" Best iteration: {best_iter + 1}")
print(f" Early-stopping suggestion: the validation metric bottoms out at iteration {best_iter + 1}, a sensible place to stop training")
print("\n[Reminder] The validation set is only for early stopping/tuning; the test set stays fully independent of training!")
# %% md
# ### 4.4 Inspect Results
# %%
print("\n" + "=" * 80)
print("Training results")
print("=" * 80)
results = trainer.results
print(f"\nResult shape: {results.shape}")
print(f"Result columns: {results.columns}")
print("\nFirst 10 rows:")
print(results.head(10))
print("\nLast 5 rows:")
print(results.tail())
print("\nDaily prediction count stats:")
daily_counts = results.group_by("trade_date").agg(pl.len()).sort("trade_date")
print(f" Min: {daily_counts['len'].min()}")
print(f" Max: {daily_counts['len'].max()}")
print(f" Mean: {daily_counts['len'].mean():.2f}")
# Show the first 10 predictions for one sample date
sample_date = results["trade_date"][0]
sample_data = results.filter(results["trade_date"] == sample_date).head(10)
print(f"\nFirst 10 predictions for sample date {sample_date}:")
print(sample_data.select(["ts_code", "trade_date", target_col, "prediction"]))
# %% md
# ### 4.5 Save Results
# %%
print("\n" + "=" * 80)
print("Saving predictions")
print("=" * 80)
# Make sure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Build a date-range stamp
start_dt = datetime.strptime(TEST_START, "%Y%m%d")
end_dt = datetime.strptime(TEST_END, "%Y%m%d")
date_str = f"{start_dt.strftime('%Y%m%d')}_{end_dt.strftime('%Y%m%d')}"
# Save the daily top N
print(f"\n[1/1] Saving daily top {TOP_N} stocks...")
topn_output_path = os.path.join(OUTPUT_DIR, "regression_output.csv")
# Group by date and take the daily top N
topn_by_date = []
unique_dates = results["trade_date"].unique().sort()
for date in unique_dates:
    day_data = results.filter(results["trade_date"] == date)
    # Sort by prediction descending and keep the top N
    topn = day_data.sort("prediction", descending=True).head(TOP_N)
    topn_by_date.append(topn)
# Concatenate all dates
topn_results = pl.concat(topn_by_date)
# Format the date and order the columns: date, score, stock.
# Note: alias the whole concatenation, not just the last slice, so the output
# column is actually named "date".
topn_to_save = topn_results.select(
    [
        (
            pl.col("trade_date").str.slice(0, 4)
            + "-"
            + pl.col("trade_date").str.slice(4, 2)
            + "-"
            + pl.col("trade_date").str.slice(6, 2)
        ).alias("date"),
        pl.col("prediction").alias("score"),
        pl.col("ts_code"),
    ]
)
topn_to_save.write_csv(topn_output_path, include_header=True)
print(f" Saved to: {topn_output_path}")
print(
    f" Rows saved: {len(topn_to_save)} ({len(unique_dates)} trading days × top {TOP_N} per day)"
)
print("\n First 15 rows:")
print(topn_to_save.head(15))
# %% md
# ### 4.6 Feature Importance
# %%
importance = model.feature_importance()
if importance is not None:
    print("\nFeature importance:")
    print(importance.sort_values(ascending=False))
print("\n" + "=" * 80)
print("Training complete!")
print("=" * 80)
# %% md
# ## 5. Visualization
#
# Plot directly from the trained model.
# - **Feature importance plot**: helps with feature selection
# - **Decision tree plot**: helps understand the decision logic
# %%
# Plotting libraries
import matplotlib.pyplot as plt
import lightgbm as lgb
import pandas as pd

# Pull the underlying Booster out of the wrapped model
booster = model.model
print(f"Model type: {type(booster)}")
print(f"Feature count: {len(feature_cols)}")
# %% md
# ### 5.1 Feature Importance Plot (for feature selection)
#
# **How to read it**
# - High-importance features contribute the most to the model
# - Zero-importance features are candidates for removal
# - It also shows which factors are the most effective
# %%
print("Plotting feature importance...")
fig, ax = plt.subplots(figsize=(10, 8))
lgb.plot_importance(
    booster,
    max_num_features=20,
    importance_type="gain",
    title="Feature Importance (Gain)",
    ax=ax,
)
ax.set_xlabel("Importance (Gain)")
plt.tight_layout()
plt.show()
# Print the ranked importances
importance_gain = pd.Series(
    booster.feature_importance(importance_type="gain"), index=feature_cols
).sort_values(ascending=False)
print("\n[Feature importance ranking - Gain]")
print(importance_gain)
# Flag low-importance features
zero_importance = importance_gain[importance_gain == 0].index.tolist()
if zero_importance:
    print(f"\n[Low importance] The following {len(zero_importance)} features have zero importance and could be removed:")
    for feat in zero_importance:
        print(f" - {feat}")
else:
    print("\nEvery feature carries some importance")


@@ -16,6 +16,7 @@ import polars as pl
if TYPE_CHECKING:
    from src.factors.registry import FunctionRegistry
    from src.factors.metadata import FactorManager
from src.factors.dsl import (
    Node,
@@ -57,6 +58,7 @@ class FactorEngine:
        data_source: Optional[Dict[str, pl.DataFrame]] = None,
        max_workers: int = 4,
        registry: Optional["FunctionRegistry"] = None,
        metadata_path: Optional[str] = None,
    ) -> None:
        """Initialize the factor engine.
@@ -64,6 +66,7 @@ class FactorEngine:
            data_source: In-memory data source; falls back to the database connection when None
            max_workers: Maximum worker threads for parallel computation
            registry: Function registry; a standalone instance is created when None
            metadata_path: Path to the factor metadata file; metadata support is disabled when None
        """
        from src.factors.registry import FunctionRegistry
        from src.factors.parser import FormulaParser
@@ -78,6 +81,13 @@ class FactorEngine:
        self._registry = registry if registry is not None else FunctionRegistry()
        self._parser = FormulaParser(self._registry)

        # Initialize the optional metadata manager
        self._metadata: Optional["FactorManager"] = None
        if metadata_path is not None:
            from src.factors.metadata import FactorManager

            self._metadata = FactorManager(metadata_path)

    def register(
        self,
        name: str,
@@ -175,6 +185,76 @@ class FactorEngine:
        # Delegate to the existing register method
        return self.register(name, node, data_specs)

    def add_factor_by_name(
        self,
        name: str,
        factor_name_in_metadata: Optional[str] = None,
        data_specs: Optional[List[DataSpec]] = None,
    ) -> "FactorEngine":
        """Register a factor by its name in metadata.

        Looks up the DSL expression for the given factor name in the
        metadata manager, then parses and registers it with the engine.

        Args:
            name: Name to register the factor under (the name used inside the engine)
            factor_name_in_metadata: Factor name in metadata;
                defaults to the name argument when None
            data_specs: Optional data specs

        Returns:
            self (supports chaining)

        Raises:
            RuntimeError: If the engine was created without a metadata path
            ValueError: If the factor is not found in metadata
            FormulaParseError: If the DSL expression fails to parse

        Example:
            >>> # Enable metadata at construction time
            >>> engine = FactorEngine(metadata_path="data/factors.jsonl")
            >>>
            >>> # Register a metadata factor under the same name
            >>> engine.add_factor_by_name("return_5")
            >>>
            >>> # Register under a different name
            >>> engine.add_factor_by_name("my_mom", "momentum_5d")
            >>>
            >>> # Chained calls
            >>> (engine
            ...     .add_factor_by_name("ma20")
            ...     .add_factor_by_name("rsi14")
            ...     .compute(["ma20", "rsi14"], "20240101", "20240131"))
        """
        if self._metadata is None:
            raise RuntimeError(
                "The engine has no metadata path configured. Pass metadata_path "
                "at construction time, e.g. FactorEngine(metadata_path='data/factors.jsonl')."
            )
        # Use the explicit metadata name if given, otherwise fall back to name
        query_name = (
            factor_name_in_metadata if factor_name_in_metadata is not None else name
        )
        # Look the factor up in metadata
        df = self._metadata.get_factors_by_name(query_name)
        if len(df) == 0:
            raise ValueError(
                f"Factor '{query_name}' not found in metadata. "
                "Check the factor name, or add it via FactorManager first."
            )
        # Fetch the DSL expression
        dsl_expr = df["dsl"][0]
        # Parse the expression into a Node
        node = self._parser.parse(dsl_expr)
        # Delegate to register
        return self.register(name, node, data_specs)

    def compute(
        self,
        factor_names: Union[str, List[str]],

@@ -101,7 +101,8 @@ class STFilter(BaseFilter):
            # Per-date filter logging (disabled)
            n_removed = len(daily_codes) - len(daily_filtered)
            if n_removed > 0:
                pass
                # print(f" [{date}] Filtered {n_removed} ST stocks")
        return pl.concat(result_frames)