feat(training): 添加数据质量检查工具并重构实验脚本

- 新增 check_data_quality 函数用于检测全空/全零/全NaN数据质量问题
- 重构 register_factors 函数,消除 FEATURE_COLS 和 PROCESSORS 冗余定义
- 修复实验脚本中特征列表不一致的问题,确保处理器覆盖所有特征
- 优化 LambdaRank 模型参数配置
This commit is contained in:
2026-03-13 22:24:12 +08:00
parent 5b4db7a2c2
commit 3f8ca2cebf
6 changed files with 1135 additions and 305 deletions

File diff suppressed because one or more lines are too long

View File

@@ -31,6 +31,7 @@ from src.training import (
Winsorizer,
NullFiller,
StandardScaler,
check_data_quality,
)
from src.training.components.models import LightGBMLambdaRankModel
from src.training.config import TrainingConfig
@@ -39,13 +40,13 @@ from src.training.config import TrainingConfig
# %% md
# ## 2. 辅助函数
# %%
def create_factors_with_metadata(
def register_factors(
engine: FactorEngine,
selected_factors: List[str],
factor_definitions: dict,
label_factor: dict,
) -> List[str]:
"""注册因子(SELECTED_FACTORS 从 metadata 查询FACTOR_DEFINITIONS 用表达式注册)"""
"""注册因子(selected_factors 从 metadata 查询factor_definitions 用 DSL 表达式注册)"""
print("=" * 80)
print("注册因子")
print("=" * 80)
@@ -326,14 +327,18 @@ VAL_END = "20241231"
TEST_START = "20250101"
TEST_END = "20251231"
# 分位数配置
N_QUANTILES = 20 # 将 label 分为 20 组
# LambdaRank 模型参数配置
MODEL_PARAMS = {
"objective": "lambdarank",
"metric": "ndcg",
"ndcg_at": 2, # 评估 NDCG@k
"ndcg_at": 10, # 评估 NDCG@k
"learning_rate": 0.01,
"num_leaves": 31,
"max_depth": 6,
"max_depth": 4,
"min_data_in_leaf": 20,
"n_estimators": 2000,
"early_stopping_round": 300,
@@ -343,21 +348,10 @@ MODEL_PARAMS = {
"reg_lambda": 1.0,
"verbose": -1,
"random_state": 42,
"lambdarank_truncation_level": 10,
"label_gain": [i for i in range(1, N_QUANTILES + 1)],
}
# 分位数配置
N_QUANTILES = 20 # 将 label 分为 20 组
# 特征列(用于数据处理器)
FEATURE_COLS = SELECTED_FACTORS
# 数据处理器配置
PROCESSORS = [
NullFiller(feature_cols=FEATURE_COLS, strategy="mean"),
Winsorizer(feature_cols=FEATURE_COLS, lower=0.01, upper=0.99),
StandardScaler(feature_cols=FEATURE_COLS),
]
# 股票池筛选函数
def stock_pool_filter(df: pl.DataFrame) -> pl.Series:
@@ -406,7 +400,7 @@ engine = FactorEngine()
# 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = create_factors_with_metadata(
feature_cols = register_factors(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
)
@@ -435,10 +429,14 @@ print(f"[配置] 特征数: {len(feature_cols)}")
print(f"[配置] 目标变量: {target_col}{N_QUANTILES}分位数)")
# 6. 创建排序学习模型
model = LightGBMLambdaRankModel(params=MODEL_PARAMS)
model: LightGBMLambdaRankModel = LightGBMLambdaRankModel(params=MODEL_PARAMS)
# 7. 创建数据处理器
processors = PROCESSORS
# 7. 创建数据处理器(使用函数返回的完整特征列表)
processors = [
NullFiller(feature_cols=feature_cols, strategy="mean"),
Winsorizer(feature_cols=feature_cols, lower=0.01, upper=0.99),
StandardScaler(feature_cols=feature_cols),
]
# 8. 创建数据划分器
splitter = DateSplitter(
@@ -522,7 +520,25 @@ if splitter:
else:
raise ValueError("必须配置数据划分器")
# %% md
# ### 4.3 数据预处理
# ### 4.3 数据质量检查
# %%
print("\n" + "=" * 80)
print("数据质量检查(必须在预处理之前)")
print("=" * 80)
print("\n检查训练集...")
check_data_quality(train_data, feature_cols, raise_on_error=True)
print("\n检查验证集...")
check_data_quality(val_data, feature_cols, raise_on_error=True)
print("\n检查测试集...")
check_data_quality(test_data, feature_cols, raise_on_error=True)
print("[成功] 数据质量检查通过,未发现异常")
# %% md
# ### 4.4 数据预处理
# %%
print("\n" + "=" * 80)
print("数据预处理")
@@ -584,112 +600,51 @@ print("\n" + "=" * 80)
print("训练指标曲线")
print("=" * 80)
# 重新训练以收集指标(因为之前的训练没有保存评估结果)
print("\n重新训练模型以收集训练指标...")
# 从模型获取训练评估结果
evals_result = model.get_evals_result()
import lightgbm as lgb
# 准备数据(使用 val 做验证,test 不参与训练过程)
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
X_val_np = val_data.select(feature_cols).to_numpy()
y_val_np = val_data.select(target_col).to_series().to_numpy()
# 创建数据集
train_dataset = lgb.Dataset(X_train_np, label=y_train_np, group=train_group)
val_dataset = lgb.Dataset(
X_val_np, label=y_val_np, group=val_group, reference=train_dataset
)
# 用于存储评估结果
evals_result = {}
# 使用与原模型相同的参数重新训练
# 正确的三分法: train 用于训练,val 用于验证,test 不参与训练过程
booster_with_eval = lgb.train(
MODEL_PARAMS,
train_dataset,
num_boost_round=MODEL_PARAMS.get("n_estimators", 1000),
valid_sets=[train_dataset, val_dataset],
valid_names=["train", "val"],
callbacks=[
lgb.record_evaluation(evals_result),
lgb.early_stopping(stopping_rounds=50, verbose=True),
],
)
print("训练完成,指标已收集")
# 获取评估的 NDCG 指标
ndcg_metrics = [k for k in evals_result["train"].keys() if "ndcg" in k]
print(f"\n评估的 NDCG 指标: {ndcg_metrics}")
# 显示早停信息
actual_rounds = len(list(evals_result["train"].values())[0])
expected_rounds = MODEL_PARAMS.get("n_estimators", 1000)
print(f"\n[早停信息]")
print(f" 配置的最大轮数: {expected_rounds}")
print(f" 实际训练轮数: {actual_rounds}")
if actual_rounds < expected_rounds:
print(f" 早停状态: 已触发连续50轮验证指标未改善")
if evals_result is None or not evals_result:
print("[警告] 没有可用的训练指标,请确保训练时使用了 eval_set 参数")
else:
print(f" 早停状态: 未触发(达到最大轮数)")
print("[成功] 已从模型获取训练评估结果")
# 显示各 NDCG 指标的最终值
print(f"\n最终 NDCG 指标:")
for metric in ndcg_metrics:
train_ndcg = evals_result["train"][metric][-1]
val_ndcg = evals_result["val"][metric][-1]
print(f" {metric}: 训练集={train_ndcg:.4f}, 验证集={val_ndcg:.4f}")
# %%
# 绘制 NDCG 训练指标曲线
import matplotlib.pyplot as plt
# 获取评估的 NDCG 指标
ndcg_metrics = [k for k in evals_result["train"].keys() if "ndcg" in k]
print(f"\n评估的 NDCG 指标: {ndcg_metrics}")
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
# 显示早停信息
actual_rounds = len(list(evals_result["train"].values())[0])
expected_rounds = MODEL_PARAMS.get("n_estimators", 1000)
print(f"\n[早停信息]")
print(f" 配置的最大轮数: {expected_rounds}")
print(f" 实际训练轮数: {actual_rounds}")
for idx, metric in enumerate(ndcg_metrics[:4]): # 最多显示4个NDCG指标
ax = axes[idx]
train_metric = evals_result["train"][metric]
val_metric = evals_result["val"][metric]
iterations = range(1, len(train_metric) + 1)
best_iter = model.get_best_iteration()
if best_iter is not None and best_iter < actual_rounds:
print(f" 早停状态: 已触发(最佳迭代: {best_iter}")
else:
print(f" 早停状态: 未触发(达到最大轮数)")
ax.plot(
iterations, train_metric, label=f"Train {metric}", linewidth=2, color="blue"
)
ax.plot(iterations, val_metric, label=f"Val {metric}", linewidth=2, color="red")
ax.set_xlabel("Iteration", fontsize=10)
ax.set_ylabel(metric.upper(), fontsize=10)
ax.set_title(
f"Training and Validation {metric.upper()}", fontsize=12, fontweight="bold"
)
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)
# 显示各 NDCG 指标的最终值
print(f"\n最终 NDCG 指标:")
for metric in ndcg_metrics:
train_ndcg = evals_result["train"][metric][-1]
val_ndcg = evals_result["val"][metric][-1]
print(f" {metric}: 训练集={train_ndcg:.4f}, 验证集={val_ndcg:.4f}")
# 标记最佳验证指标
best_iter = val_metric.index(max(val_metric))
best_metric = max(val_metric)
ax.axvline(x=best_iter + 1, color="green", linestyle="--", alpha=0.7)
ax.scatter([best_iter + 1], [best_metric], color="green", s=80, zorder=5)
ax.annotate(
f"Best: {best_metric:.4f}",
xy=(best_iter + 1, best_metric),
xytext=(best_iter + 1 + len(iterations) * 0.05, best_metric),
fontsize=8,
arrowprops=dict(arrowstyle="->", color="green", alpha=0.7),
)
# 使用封装好的方法绘制所有指标
print("\n[绘图] 使用 LightGBM 原生接口绘制训练曲线...")
fig = model.plot_all_metrics(metrics=ndcg_metrics[:4], figsize=(14, 10))
plt.show()
plt.tight_layout()
plt.show()
print(f"\n[指标分析]")
print(f" 各NDCG指标在验证集上的最佳值:")
for metric in ndcg_metrics:
val_metric_list = evals_result["val"][metric]
best_iter = val_metric_list.index(max(val_metric_list))
best_val = max(val_metric_list)
print(f" {metric}: {best_val:.4f} (迭代 {best_iter + 1})")
print(f"\n[重要提醒] 验证集仅用于早停/调参,测试集完全独立于训练过程!")
print(f"\n[指标分析]")
print(f" 各NDCG指标在验证集上的最佳值:")
for metric in ndcg_metrics:
val_metric_list = evals_result["val"][metric]
best_iter_metric = val_metric_list.index(max(val_metric_list))
best_val = max(val_metric_list)
print(f" {metric}: {best_val:.4f} (迭代 {best_iter_metric + 1})")
print(f"\n[重要提醒] 验证集仅用于早停/调参,测试集完全独立于训练过程!")
# %% md
# ### 4.6 模型评估
# %%

View File

@@ -18,6 +18,7 @@ from src.training import (
Trainer,
Winsorizer,
NullFiller,
check_data_quality,
)
from src.training.config import TrainingConfig
@@ -25,13 +26,13 @@ from src.training.config import TrainingConfig
# %% md
# ## 2. 定义辅助函数
# %%
def create_factors_with_metadata(
def register_factors(
engine: FactorEngine,
selected_factors: List[str],
factor_definitions: dict,
label_factor: dict,
) -> List[str]:
"""注册因子(SELECTED_FACTORS 从 metadata 查询FACTOR_DEFINITIONS 用表达式注册)"""
"""注册因子(selected_factors 从 metadata 查询factor_definitions 用 DSL 表达式注册)"""
print("=" * 80)
print("注册因子")
print("=" * 80)
@@ -285,9 +286,6 @@ MODEL_PARAMS = {
"random_state": 42,
}
# 数据处理器配置(新 API,需要传入 feature_cols)
# 注意: processor 现在需要显式指定要处理的特征列
# 股票池筛选函数
# 使用新的 StockPoolManager API,传入自定义筛选函数和所需列/因子
@@ -355,7 +353,7 @@ engine = FactorEngine(metadata_path="data/factors.jsonl")
# 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = create_factors_with_metadata(
feature_cols = register_factors(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
)
target_col = LABEL_NAME
@@ -380,7 +378,7 @@ print(f"[配置] 目标变量: {target_col}")
# 5. 创建模型
model = LightGBMModel(params=MODEL_PARAMS)
# 6. 创建数据处理器(新 API,需要传入 feature_cols)
# 6. 创建数据处理器(使用函数返回的完整特征列表)
processors = [
NullFiller(feature_cols=feature_cols, strategy="mean"),
Winsorizer(feature_cols=feature_cols, lower=0.01, upper=0.99),
@@ -482,8 +480,26 @@ else:
test_data = filtered_data
print(" 未配置划分器,全部作为训练集")
# %%
# 步骤 3: 训练集数据处理
print("\n[步骤 3/6] 训练集数据处理")
# 步骤 3: 数据质量检查(必须在预处理之前)
print("\n[步骤 3/7] 数据质量检查")
print("-" * 60)
print(" [说明] 此检查在 fillna 等处理之前执行,用于发现数据问题")
print("\n 检查训练集...")
check_data_quality(train_data, feature_cols, raise_on_error=True)
if "val_data" in locals() and val_data is not None:
print("\n 检查验证集...")
check_data_quality(val_data, feature_cols, raise_on_error=True)
print("\n 检查测试集...")
check_data_quality(test_data, feature_cols, raise_on_error=True)
print(" [成功] 数据质量检查通过,未发现异常")
# %%
# 步骤 4: 训练集数据处理
print("\n[步骤 4/7] 训练集数据处理")
print("-" * 60)
fitted_processors = []
if processors:
@@ -510,7 +526,7 @@ for col in feature_cols[:5]: # 只显示前5个特征的缺失值
print(f" {col}: {null_count} ({null_count / len(train_data) * 100:.2f}%)")
# %%
# 步骤 4: 训练模型
print("\n[步骤 4/6] 训练模型")
print("\n[步骤 5/7] 训练模型")
print("-" * 60)
print(f" 模型类型: LightGBM")
print(f" 训练样本数: {len(train_data)}")
@@ -532,7 +548,7 @@ model.fit(X_train, y_train)
print(" 训练完成!")
# %%
# 步骤 5: 测试集数据处理
print("\n[步骤 5/6] 测试集数据处理")
print("\n[步骤 6/7] 测试集数据处理")
print("-" * 60)
if processors and test_data is not train_data:
for i, processor in enumerate(fitted_processors, 1):
@@ -548,7 +564,7 @@ else:
print(" 跳过测试集处理")
# %%
# 步骤 6: 生成预测
print("\n[步骤 6/6] 生成预测")
print("\n[步骤 7/7] 生成预测")
print("-" * 60)
X_test = test_data.select(feature_cols)
print(f" 测试样本数: {len(X_test)}")