feat(training): 新增 TabM 模型支持及数据质量优化
- 添加 TabMModel、TabPFNModel 深度学习模型实现 - 新增 DataQualityAnalyzer 进行训练前数据质量诊断 - 改进数据处理器 NaN/null 双重处理,增强数据鲁棒性 - 支持 train_skip_days 参数跳过训练初期数据不足期 - Pipeline 自动清理标签为 NaN 的样本
This commit is contained in:
85
tests/check_gtja.py
Normal file
85
tests/check_gtja.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""检查 GTJA_alpha 因子"""
|
||||
|
||||
import polars as pl
|
||||
|
||||
from src.factors import FactorEngine
|
||||
from src.training import FactorManager
|
||||
from src.experiment.common import (
|
||||
SELECTED_FACTORS,
|
||||
FACTOR_DEFINITIONS,
|
||||
LABEL_FACTOR,
|
||||
)
|
||||
|
||||
# Factors excluded from this check: GTJA_alpha001 .. GTJA_alpha015.
# Built with a comprehension instead of 15 hand-written literals, matching the
# f"GTJA_alpha{i:03d}" naming scheme used in tests/test_nan_step_by_step.py.
EXCLUDED_FACTORS = [f"GTJA_alpha{i:03d}" for i in range(1, 16)]
|
||||
|
||||
|
||||
def main():
    """Inspect the GTJA_alpha factors: compute a small sample window and
    report the null ratio of each factor versus the `close` baseline column.

    Fix: the original divided by `total` / `len(data)` unconditionally, which
    raises ZeroDivisionError when the computed frame is empty; guarded below.
    """
    print("=" * 80)
    print("检查 GTJA_alpha 因子")
    print("=" * 80)

    engine = FactorEngine()
    factor_manager = FactorManager(
        selected_factors=SELECTED_FACTORS,
        factor_definitions=FACTOR_DEFINITIONS,
        label_factor=LABEL_FACTOR,
        excluded_factors=EXCLUDED_FACTORS,
    )

    # Register factors with the engine and collect the resulting feature names.
    feature_cols = factor_manager.register_to_engine(engine, verbose=False)

    # Keep only the GTJA_alpha family of factors.
    gtja_factors = [f for f in feature_cols if f.startswith("GTJA_alpha")]
    print(f"\nGTJA_alpha 因子数量: {len(gtja_factors)}")
    print(f"前10个: {gtja_factors[:10]}")

    # Compute a short date range: first 10 GTJA_alpha factors plus `close`
    # (close is expected to be nearly null-free, so it serves as a baseline).
    print("\n计算因子数据...")
    data = engine.compute(
        factor_names=gtja_factors[:10] + ["close"],
        start_date="20200101",
        end_date="20200110",
    )

    print(f"\n数据形状: {data.shape}")
    print(f"列: {data.columns}")

    # Per-factor null statistics; guard the percentage against an empty frame.
    total = len(data)
    print("\nGTJA_alpha 因子 NaN 统计:")
    for col in gtja_factors[:10]:
        if col in data.columns:
            nan_count = data[col].null_count()
            pct = (nan_count / total * 100) if total else 0.0
            print(f"  {col}: {nan_count}/{total} ({pct:.1f}%)")
        else:
            print(f"  {col}: 列不存在!")

    # `close` as a comparison point.
    close_nulls = data["close"].null_count()
    close_pct = (close_nulls / total * 100) if total else 0.0
    print(f"\n  close: {close_nulls}/{total} ({close_pct:.1f}%)")

    # Preview a few rows of the first three factors.
    print("\n实际数据预览:")
    print(data.select(["trade_date", "ts_code"] + gtja_factors[:3]).head(10))


if __name__ == "__main__":
    main()
|
||||
208
tests/diagnose_nan.py
Normal file
208
tests/diagnose_nan.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""诊断 NaN 来源"""
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from src.factors import FactorEngine
|
||||
from src.training import (
|
||||
FactorManager,
|
||||
DataPipeline,
|
||||
NullFiller,
|
||||
Winsorizer,
|
||||
StandardScaler,
|
||||
)
|
||||
from src.training.components.filters import STFilter
|
||||
from src.experiment.common import (
|
||||
SELECTED_FACTORS,
|
||||
FACTOR_DEFINITIONS,
|
||||
LABEL_NAME,
|
||||
LABEL_FACTOR,
|
||||
stock_pool_filter,
|
||||
STOCK_FILTER_REQUIRED_COLUMNS,
|
||||
)
|
||||
|
||||
# 只使用少量因子加速测试
|
||||
# Exclude GTJA_alpha001 .. GTJA_alpha015 so only a small factor set is
# computed (keeps the diagnostic run fast). Comprehension form keeps the
# list consistent with tests/test_nan_step_by_step.py.
EXCLUDED_FACTORS = [f"GTJA_alpha{i:03d}" for i in range(1, 16)]

# Narrow date windows to keep the diagnostic run fast.
TEST_DATE_RANGE = {
    "train": ("20200101", "20200331"),
    "val": ("20200401", "20200430"),
    "test": ("20200501", "20200531"),
}
|
||||
|
||||
|
||||
def main():
    """Step-by-step NaN diagnosis: trace null counts through each pipeline stage
    (raw compute -> ST filter -> stock-pool selection -> train split ->
    NullFiller -> Winsorizer -> StandardScaler -> numpy extraction).

    Fixes: removed the unused `has_mean`/`has_std` locals (the print already
    uses the looked-up values) and guarded the percentage against an empty frame.
    """
    print("=" * 80)
    print("NaN 来源诊断")
    print("=" * 80)

    engine = FactorEngine()
    factor_manager = FactorManager(
        selected_factors=SELECTED_FACTORS,
        factor_definitions=FACTOR_DEFINITIONS,
        label_factor=LABEL_FACTOR,
        excluded_factors=EXCLUDED_FACTORS,
    )

    # Step 1: register factors and compute the raw factor frame over the
    # union of train/val/test date windows.
    print("\n[Step 1] 注册因子并计算原始数据...")
    feature_cols = factor_manager.register_to_engine(engine, verbose=False)
    print(f"  特征数: {len(feature_cols)}")

    all_start = min(
        TEST_DATE_RANGE["train"][0],
        TEST_DATE_RANGE["val"][0],
        TEST_DATE_RANGE["test"][0],
    )
    all_end = max(
        TEST_DATE_RANGE["train"][1],
        TEST_DATE_RANGE["val"][1],
        TEST_DATE_RANGE["test"][1],
    )

    raw_data = engine.compute(
        factor_names=feature_cols + [LABEL_NAME],
        start_date=all_start,
        end_date=all_end,
    )
    print(f"  原始数据形状: {raw_data.shape}")

    # Step 2: null statistics on the raw data (first 20 features, for speed).
    print("\n[Step 2] 原始数据 NaN 统计...")
    nan_counts = {}
    for col in feature_cols[:20]:
        nan_count = raw_data[col].null_count()
        if nan_count > 0:
            nan_counts[col] = nan_count

    print(f"  含 NaN 的特征数 (前20个): {len(nan_counts)}")
    total_rows = len(raw_data)
    for col, count in list(nan_counts.items())[:10]:
        # Guard against an empty frame to avoid ZeroDivisionError.
        pct = (count / total_rows * 100) if total_rows else 0.0
        print(f"    {col}: {count} ({pct:.1f}%)")

    # Step 3: ST filter.
    print("\n[Step 3] 应用过滤器...")
    st_filter = STFilter(data_router=engine.router)
    filtered_data = st_filter.filter(raw_data)
    print(f"  过滤后数据形状: {filtered_data.shape}")

    nan_after_filter = sum(filtered_data[col].null_count() for col in feature_cols[:20])
    print(f"  前20个特征总 NaN 数: {nan_after_filter}")

    # Step 4: stock-pool selection.
    print("\n[Step 4] 应用股票池筛选...")
    from src.training.core.stock_pool_manager import StockPoolManager

    pool_manager = StockPoolManager(
        filter_func=stock_pool_filter,
        required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
        data_router=engine.router,
    )
    pool_data = pool_manager.filter_and_select_daily(filtered_data)
    print(f"  筛选后数据形状: {pool_data.shape}")

    nan_after_pool = sum(pool_data[col].null_count() for col in feature_cols[:20])
    print(f"  前20个特征总 NaN 数: {nan_after_pool}")

    # Step 5: slice out the training window.
    print("\n[Step 5] 划分训练集...")
    train_mask = (pool_data["trade_date"] >= TEST_DATE_RANGE["train"][0]) & (
        pool_data["trade_date"] <= TEST_DATE_RANGE["train"][1]
    )
    train_df = pool_data.filter(train_mask)
    print(f"  训练集形状: {train_df.shape}")

    nan_train_before = sum(train_df[col].null_count() for col in feature_cols[:20])
    print(f"  前20个特征总 NaN 数: {nan_train_before}")

    # Step 6: apply each processor in turn, re-counting nulls after each one.
    print("\n[Step 6] 依次应用 processors...")

    # 6.1 NullFiller — per-date mean imputation.
    print("\n  [6.1] NullFiller (by_date=True, strategy=mean)...")
    null_filler = NullFiller(feature_cols=feature_cols, strategy="mean", by_date=True)
    after_null = null_filler.fit_transform(train_df)
    nan_after_null = sum(after_null[col].null_count() for col in feature_cols[:20])
    print(f"    处理后前20个特征总 NaN 数: {nan_after_null}")

    # Report which of the first 20 columns still carry nulls.
    if nan_after_null > 0:
        print("    仍有 NaN 的列:")
        for col in feature_cols[:20]:
            count = after_null[col].null_count()
            if count > 0:
                print(f"      {col}: {count}")

    # 6.2 Winsorizer — global 1% / 99% clipping.
    print("\n  [6.2] Winsorizer (by_date=False)...")
    winsorizer = Winsorizer(
        feature_cols=feature_cols, lower=0.01, upper=0.99, by_date=False
    )
    after_winsor = winsorizer.fit_transform(after_null)
    nan_after_winsor = sum(after_winsor[col].null_count() for col in feature_cols[:20])
    print(f"    处理后前20个特征总 NaN 数: {nan_after_winsor}")

    # 6.3 StandardScaler.
    print("\n  [6.3] StandardScaler...")
    scaler = StandardScaler(feature_cols=feature_cols)
    after_scaler = scaler.fit_transform(after_winsor)
    nan_after_scaler = sum(after_scaler[col].null_count() for col in feature_cols[:20])
    print(f"    处理后前20个特征总 NaN 数: {nan_after_scaler}")

    # For any column still carrying nulls, report the fitted statistics
    # (unused has_mean/has_std locals from the original were dropped).
    if nan_after_scaler > 0:
        print("    仍有 NaN 的列:")
        for col in feature_cols[:20]:
            count = after_scaler[col].null_count()
            if count > 0:
                mean_val = scaler.mean_.get(col, "N/A")
                std_val = scaler.std_.get(col, "N/A")
                print(f"      {col}: {count}, mean={mean_val}, std={std_val}")

    # Step 7: extract the feature matrix and count NaN after numpy conversion.
    print("\n[Step 7] 提取特征矩阵 X...")
    X = after_scaler.select(feature_cols)
    X_np = X.to_numpy()
    print(f"  X 形状: {X_np.shape}")
    print(f"  X 中 NaN 总数: {np.isnan(X_np).sum()}")

    # Which feature columns contribute NaN to the numpy matrix?
    nan_by_col = []
    for i, col in enumerate(feature_cols):
        col_nan = np.isnan(X_np[:, i]).sum()
        if col_nan > 0:
            nan_by_col.append((col, col_nan))

    print(f"  含 NaN 的特征列数: {len(nan_by_col)}")
    for col, count in nan_by_col[:10]:
        print(f"    {col}: {count}")

    print("\n" + "=" * 80)
    print("诊断完成")
    print("=" * 80)


if __name__ == "__main__":
    main()
|
||||
278
tests/test_diagnose_nan.py
Normal file
278
tests/test_diagnose_nan.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""诊断 NaN 来源 - pytest 版本"""
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from src.factors import FactorEngine
|
||||
from src.training import (
|
||||
FactorManager,
|
||||
NullFiller,
|
||||
Winsorizer,
|
||||
StandardScaler,
|
||||
)
|
||||
from src.training.components.filters import STFilter
|
||||
from src.training.core.stock_pool_manager import StockPoolManager
|
||||
from src.experiment.common import (
|
||||
SELECTED_FACTORS,
|
||||
FACTOR_DEFINITIONS,
|
||||
LABEL_NAME,
|
||||
LABEL_FACTOR,
|
||||
stock_pool_filter,
|
||||
STOCK_FILTER_REQUIRED_COLUMNS,
|
||||
)
|
||||
|
||||
# 只使用少量因子加速测试
|
||||
# Exclude GTJA_alpha001 .. GTJA_alpha015 so only a small factor set is
# computed (keeps the test fast). Comprehension form keeps the list
# consistent with tests/test_nan_step_by_step.py.
EXCLUDED_FACTORS = [f"GTJA_alpha{i:03d}" for i in range(1, 16)]

# Narrow date windows to keep the test fast.
TEST_DATE_RANGE = {
    "train": ("20200101", "20200331"),
    "val": ("20200401", "20200430"),
    "test": ("20200501", "20200531"),
}
|
||||
|
||||
|
||||
def test_diagnose_nan_source():
    """Trace null counts through every pipeline stage to locate where NaN appears.

    Fixes vs. original:
    - In step 6.3 the original printed ``mean={has_mean}, std={has_std}``
      (booleans) while the looked-up ``mean_val``/``std_val`` went unused;
      it now prints the actual statistics, matching tests/diagnose_nan.py.
    - The trailing vacuous ``assert True`` is replaced with a real invariant
      on the extracted feature matrix.
    """
    print("\n" + "=" * 80)
    print("NaN 来源诊断")
    print("=" * 80)

    engine = FactorEngine()
    factor_manager = FactorManager(
        selected_factors=SELECTED_FACTORS,
        factor_definitions=FACTOR_DEFINITIONS,
        label_factor=LABEL_FACTOR,
        excluded_factors=EXCLUDED_FACTORS,
    )

    # Step 1: register factors and compute raw data over the union of windows.
    print("\n[Step 1] 注册因子并计算原始数据...")
    feature_cols = factor_manager.register_to_engine(engine, verbose=False)
    print(f"  特征数: {len(feature_cols)}")

    all_start = min(
        TEST_DATE_RANGE["train"][0],
        TEST_DATE_RANGE["val"][0],
        TEST_DATE_RANGE["test"][0],
    )
    all_end = max(
        TEST_DATE_RANGE["train"][1],
        TEST_DATE_RANGE["val"][1],
        TEST_DATE_RANGE["test"][1],
    )

    raw_data = engine.compute(
        factor_names=feature_cols + [LABEL_NAME],
        start_date=all_start,
        end_date=all_end,
    )
    print(f"  原始数据形状: {raw_data.shape}")

    # Step 2: null statistics on the raw data (first 20 features, for speed).
    print("\n[Step 2] 原始数据 NaN 统计...")
    nan_counts = {}
    for col in feature_cols[:20]:
        nan_count = raw_data[col].null_count()
        if nan_count > 0:
            nan_counts[col] = nan_count

    print(f"  含 NaN 的特征数 (前20个): {len(nan_counts)}")
    total_rows = len(raw_data)
    for col, count in list(nan_counts.items())[:10]:
        # Guard against an empty frame to avoid ZeroDivisionError.
        pct = (count / total_rows * 100) if total_rows else 0.0
        print(f"    {col}: {count} ({pct:.1f}%)")

    # Step 3: ST filter.
    print("\n[Step 3] 应用过滤器...")
    st_filter = STFilter(data_router=engine.router)
    filtered_data = st_filter.filter(raw_data)
    print(f"  过滤后数据形状: {filtered_data.shape}")

    nan_after_filter = sum(filtered_data[col].null_count() for col in feature_cols[:20])
    print(f"  前20个特征总 NaN 数: {nan_after_filter}")

    # Step 4: stock-pool selection.
    print("\n[Step 4] 应用股票池筛选...")
    pool_manager = StockPoolManager(
        filter_func=stock_pool_filter,
        required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
        data_router=engine.router,
    )
    pool_data = pool_manager.filter_and_select_daily(filtered_data)
    print(f"  筛选后数据形状: {pool_data.shape}")

    nan_after_pool = sum(pool_data[col].null_count() for col in feature_cols[:20])
    print(f"  前20个特征总 NaN 数: {nan_after_pool}")

    # Step 5: slice out the training window.
    print("\n[Step 5] 划分训练集...")
    train_mask = (pool_data["trade_date"] >= TEST_DATE_RANGE["train"][0]) & (
        pool_data["trade_date"] <= TEST_DATE_RANGE["train"][1]
    )
    train_df = pool_data.filter(train_mask)
    print(f"  训练集形状: {train_df.shape}")

    nan_train_before = sum(train_df[col].null_count() for col in feature_cols[:20])
    print(f"  前20个特征总 NaN 数: {nan_train_before}")

    # Step 6: apply each processor in turn.
    print("\n[Step 6] 依次应用 processors...")

    # 6.1 NullFiller — per-date mean imputation.
    print("\n  [6.1] NullFiller (by_date=True, strategy=mean)...")
    null_filler = NullFiller(feature_cols=feature_cols, strategy="mean", by_date=True)
    after_null = null_filler.fit_transform(train_df)
    nan_after_null = sum(after_null[col].null_count() for col in feature_cols[:20])
    print(f"    处理后前20个特征总 NaN 数: {nan_after_null}")

    if nan_after_null > 0:
        print("    仍有 NaN 的列:")
        for col in feature_cols[:20]:
            count = after_null[col].null_count()
            if count > 0:
                print(f"      {col}: {count}")

    # 6.2 Winsorizer — global 1% / 99% clipping.
    print("\n  [6.2] Winsorizer (by_date=False)...")
    winsorizer = Winsorizer(
        feature_cols=feature_cols, lower=0.01, upper=0.99, by_date=False
    )
    after_winsor = winsorizer.fit_transform(after_null)
    nan_after_winsor = sum(after_winsor[col].null_count() for col in feature_cols[:20])
    print(f"    处理后前20个特征总 NaN 数: {nan_after_winsor}")

    # 6.3 StandardScaler.
    print("\n  [6.3] StandardScaler...")
    scaler = StandardScaler(feature_cols=feature_cols)
    after_scaler = scaler.fit_transform(after_winsor)
    nan_after_scaler = sum(after_scaler[col].null_count() for col in feature_cols[:20])
    print(f"    处理后前20个特征总 NaN 数: {nan_after_scaler}")

    # For any column still carrying nulls, report the fitted statistics.
    # FIX: print the actual mean/std values, not the has_mean/has_std booleans.
    if nan_after_scaler > 0:
        print("    仍有 NaN 的列:")
        for col in feature_cols[:20]:
            count = after_scaler[col].null_count()
            if count > 0:
                mean_val = scaler.mean_.get(col, "N/A")
                std_val = scaler.std_.get(col, "N/A")
                print(f"      {col}: {count}, mean={mean_val}, std={std_val}")

    # 6.4: check ALL feature columns after StandardScaler, not just the first 20.
    print("\n  [6.4] 检查 StandardScaler 后的所有列...")
    all_nan_counts = {}
    for col in feature_cols:
        count = after_scaler[col].null_count()
        if count > 0:
            all_nan_counts[col] = count
    print(f"  所有特征列中含 NaN 的列数: {len(all_nan_counts)}")

    # Are the NaN columns inside or outside the first-20 window we sampled?
    nan_cols_in_first_20 = [c for c in all_nan_counts.keys() if c in feature_cols[:20]]
    nan_cols_not_in_first_20 = [
        c for c in all_nan_counts.keys() if c not in feature_cols[:20]
    ]
    print(f"  在前20个中的: {len(nan_cols_in_first_20)}")
    print(f"  不在前20个中的: {len(nan_cols_not_in_first_20)}")
    if nan_cols_not_in_first_20:
        print(f"  例如: {nan_cols_not_in_first_20[:10]}")

    # 6.5: did the scaler learn statistics for the NaN columns at all?
    print("\n  [6.5] 检查 StandardScaler 学到的统计量...")
    missing_stats_cols = [c for c in all_nan_counts.keys() if c not in scaler.mean_]
    print(f"  未学到 mean 的列数: {len(missing_stats_cols)}")
    if missing_stats_cols:
        print(f"  例如: {missing_stats_cols[:10]}")
        # Inspect dtypes of a few columns the scaler skipped.
        for col in missing_stats_cols[:3]:
            dtype = after_scaler[col].dtype
            print(f"    {col}: dtype={dtype}")

    # Step 7: extract X and check.
    print("\n[Step 7] 提取特征矩阵 X...")
    X = after_scaler.select(feature_cols)

    # 7.1: selecting must not change null counts; flag any mismatch.
    print("\n  [7.1] 对比 after_scaler 和 X 中的列...")
    for col in feature_cols[:20]:
        null_in_raw = after_scaler[col].null_count()
        null_in_x = X[col].null_count()
        if null_in_raw != null_in_x:
            print(f"    {col}: after_scaler={null_in_raw}, X={null_in_x}")

    X_np = X.to_numpy()
    print(f"  X 形状: {X_np.shape}")
    print(f"  X 中 NaN 总数: {np.isnan(X_np).sum()}")

    # Which feature columns contribute NaN to the numpy matrix?
    nan_by_col = []
    for i, col in enumerate(feature_cols):
        col_nan = np.isnan(X_np[:, i]).sum()
        if col_nan > 0:
            nan_by_col.append((col, col_nan))

    print(f"  含 NaN 的特征列数: {len(nan_by_col)}")
    for col, count in nan_by_col[:10]:
        print(f"    {col}: {count}")

    # Step 8: dtypes of the NaN columns.
    print("\n  [Step 8] 检查含 NaN 列的数据类型...")
    for col, count in nan_by_col[:5]:
        dtype = after_scaler[col].dtype
        null_count = after_scaler[col].null_count()
        print(f"    {col}: dtype={dtype}, null_count={null_count}")

    # Boolean-typed feature columns (a likely NaN source after scaling).
    boolean_cols = [
        col for col in feature_cols if after_scaler[col].dtype == pl.Boolean
    ]
    print(f"\n  Boolean 类型的特征列数: {len(boolean_cols)}")
    print(f"  例如: {boolean_cols[:10]}")

    boolean_with_null = [
        col for col in boolean_cols if after_scaler[col].null_count() > 0
    ]
    print(f"\n  含 null 的 Boolean 列数: {len(boolean_with_null)}")

    # Step 9: nulls in non-feature columns.
    print("\n  [Step 9] 检查非特征列的 NaN...")
    non_feature_cols = [c for c in after_scaler.columns if c not in feature_cols]
    non_feature_nan = {}
    for col in non_feature_cols[:10]:
        count = after_scaler[col].null_count()
        if count > 0:
            non_feature_nan[col] = count
    print(f"  非特征列中含 NaN 的列数: {len(non_feature_nan)}")
    for col, count in list(non_feature_nan.items())[:5]:
        print(f"    {col}: {count}")

    print("\n" + "=" * 80)
    print("诊断完成")
    print("=" * 80)

    # Real invariant (replaces the vacuous `assert True`): the extracted
    # matrix must line up with the selected feature columns.
    assert X_np.shape == (len(X), len(feature_cols))
|
||||
492
tests/test_nan_step_by_step.py
Normal file
492
tests/test_nan_step_by_step.py
Normal file
@@ -0,0 +1,492 @@
|
||||
"""NaN 问题逐步诊断测试 - 精确定位问题环节
|
||||
|
||||
此测试会逐步检查 DataPipeline 的每个处理步骤,精确定位 NaN 产生的位置。
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from src.factors import FactorEngine
|
||||
from src.training import (
|
||||
FactorManager,
|
||||
DataPipeline,
|
||||
NullFiller,
|
||||
Winsorizer,
|
||||
StandardScaler,
|
||||
)
|
||||
from src.training.components.filters import STFilter
|
||||
from src.experiment.common import (
|
||||
SELECTED_FACTORS,
|
||||
FACTOR_DEFINITIONS,
|
||||
LABEL_NAME,
|
||||
LABEL_FACTOR,
|
||||
stock_pool_filter,
|
||||
STOCK_FILTER_REQUIRED_COLUMNS,
|
||||
)
|
||||
|
||||
# 测试配置
|
||||
# NOTE: range(1, 50) stops at 049, so this excludes the first 49 factors
# (the original comment claimed "first 50", which was off by one).
EXCLUDED_FACTORS = [f"GTJA_alpha{i:03d}" for i in range(1, 50)]  # exclude the first 49 to speed up tests
TEST_DATE_RANGE = {
    "train": ("20200101", "20201231"),  # one full year of data
    "val": ("20210101", "20210331"),
    "test": ("20210401", "20210630"),
}
|
||||
|
||||
|
||||
class TestNaNStepByStep:
|
||||
"""逐步诊断 NaN 问题的测试类"""
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def base_data(self):
|
||||
"""准备基础数据(未经过任何处理)"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[Fixture] 准备基础数据...")
|
||||
|
||||
engine = FactorEngine()
|
||||
factor_manager = FactorManager(
|
||||
selected_factors=SELECTED_FACTORS,
|
||||
factor_definitions=FACTOR_DEFINITIONS,
|
||||
label_factor=LABEL_FACTOR,
|
||||
excluded_factors=EXCLUDED_FACTORS,
|
||||
)
|
||||
|
||||
# 注册因子
|
||||
feature_cols = factor_manager.register_to_engine(engine, verbose=False)
|
||||
print(f" 特征数: {len(feature_cols)}")
|
||||
|
||||
# 计算完整日期范围
|
||||
all_start = min(
|
||||
TEST_DATE_RANGE["train"][0],
|
||||
TEST_DATE_RANGE["val"][0],
|
||||
TEST_DATE_RANGE["test"][0],
|
||||
)
|
||||
all_end = max(
|
||||
TEST_DATE_RANGE["train"][1],
|
||||
TEST_DATE_RANGE["val"][1],
|
||||
TEST_DATE_RANGE["test"][1],
|
||||
)
|
||||
|
||||
# 计算因子
|
||||
raw_data = engine.compute(
|
||||
factor_names=feature_cols + [LABEL_NAME],
|
||||
start_date=all_start,
|
||||
end_date=all_end,
|
||||
)
|
||||
print(f" 原始数据形状: {raw_data.shape}")
|
||||
|
||||
return {
|
||||
"engine": engine,
|
||||
"factor_manager": factor_manager,
|
||||
"feature_cols": feature_cols,
|
||||
"raw_data": raw_data,
|
||||
}
|
||||
|
||||
def test_step_0_raw_data(self, base_data):
|
||||
"""步骤0: 检查原始数据中的 NaN"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤0] 检查原始数据中的 NaN")
|
||||
|
||||
raw_data = base_data["raw_data"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
|
||||
nan_stats = self._check_nan_in_df(raw_data, feature_cols, "原始数据")
|
||||
|
||||
# 记录有 NaN 的列
|
||||
print(f" 含 NaN 的特征列数: {len(nan_stats['cols_with_nan'])}")
|
||||
if nan_stats["cols_with_nan"]:
|
||||
print(f" 示例: {nan_stats['cols_with_nan'][:5]}")
|
||||
|
||||
return nan_stats
|
||||
|
||||
def test_step_1_after_st_filter(self, base_data):
|
||||
"""步骤1: 检查 STFilter 后的 NaN"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤1] 检查 STFilter 后的 NaN")
|
||||
|
||||
raw_data = base_data["raw_data"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
engine = base_data["engine"]
|
||||
|
||||
st_filter = STFilter(data_router=engine.router)
|
||||
filtered_data = st_filter.filter(raw_data)
|
||||
|
||||
print(f" 过滤后数据形状: {filtered_data.shape}")
|
||||
print(f" 删除记录数: {len(raw_data) - len(filtered_data)}")
|
||||
|
||||
nan_stats = self._check_nan_in_df(filtered_data, feature_cols, "STFilter后")
|
||||
|
||||
# 对比步骤0,看是否有新增 NaN
|
||||
step0_nan = self.test_step_0_raw_data(base_data)
|
||||
if nan_stats["total_nan"] != step0_nan["total_nan"]:
|
||||
print(
|
||||
f" [警告] NaN 数量变化: {step0_nan['total_nan']} -> {nan_stats['total_nan']}"
|
||||
)
|
||||
|
||||
return nan_stats
|
||||
|
||||
def test_step_2_after_stock_pool(self, base_data):
|
||||
"""步骤2: 检查股票池筛选后的 NaN"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤2] 检查股票池筛选后的 NaN")
|
||||
|
||||
raw_data = base_data["raw_data"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
engine = base_data["engine"]
|
||||
|
||||
# 先应用 STFilter
|
||||
st_filter = STFilter(data_router=engine.router)
|
||||
filtered_data = st_filter.filter(raw_data)
|
||||
|
||||
# 再应用股票池筛选
|
||||
from src.training.core.stock_pool_manager import StockPoolManager
|
||||
|
||||
pool_manager = StockPoolManager(
|
||||
filter_func=stock_pool_filter,
|
||||
required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
|
||||
data_router=engine.router,
|
||||
)
|
||||
pool_data = pool_manager.filter_and_select_daily(filtered_data)
|
||||
|
||||
print(f" 筛选后数据形状: {pool_data.shape}")
|
||||
print(f" 删除记录数: {len(filtered_data) - len(pool_data)}")
|
||||
|
||||
nan_stats = self._check_nan_in_df(pool_data, feature_cols, "股票池筛选后")
|
||||
return nan_stats
|
||||
|
||||
def test_step_3_train_split_without_skip(self, base_data):
|
||||
"""步骤3: 检查训练集划分后的 NaN(不跳过天数)"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤3] 检查训练集划分后的 NaN(不跳过天数)")
|
||||
|
||||
raw_data = base_data["raw_data"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
engine = base_data["engine"]
|
||||
|
||||
# 应用过滤器
|
||||
st_filter = STFilter(data_router=engine.router)
|
||||
filtered_data = st_filter.filter(raw_data)
|
||||
|
||||
from src.training.core.stock_pool_manager import StockPoolManager
|
||||
|
||||
pool_manager = StockPoolManager(
|
||||
filter_func=stock_pool_filter,
|
||||
required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
|
||||
data_router=engine.router,
|
||||
)
|
||||
pool_data = pool_manager.filter_and_select_daily(filtered_data)
|
||||
|
||||
# 划分训练集
|
||||
train_start, train_end = TEST_DATE_RANGE["train"]
|
||||
train_mask = (pool_data["trade_date"] >= train_start) & (
|
||||
pool_data["trade_date"] <= train_end
|
||||
)
|
||||
train_df = pool_data.filter(train_mask)
|
||||
|
||||
print(f" 训练集形状: {train_df.shape}")
|
||||
|
||||
# 统计交易日数量
|
||||
unique_dates = train_df["trade_date"].unique().sort()
|
||||
print(f" 训练集交易日数量: {len(unique_dates)}")
|
||||
print(f" 日期范围: {unique_dates[0]} ~ {unique_dates[-1]}")
|
||||
|
||||
nan_stats = self._check_nan_in_df(train_df, feature_cols, "训练集(不跳过)")
|
||||
|
||||
# 返回训练集供后续测试使用
|
||||
return {
|
||||
"nan_stats": nan_stats,
|
||||
"train_df": train_df,
|
||||
"unique_dates": unique_dates,
|
||||
}
|
||||
|
||||
def test_step_4_train_split_with_skip(self, base_data):
|
||||
"""步骤4: 检查训练集划分后的 NaN(跳过前252天)"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤4] 检查训练集划分后的 NaN(跳过前252天)")
|
||||
|
||||
step3_result = self.test_step_3_train_split_without_skip(base_data)
|
||||
train_df = step3_result["train_df"]
|
||||
unique_dates = step3_result["unique_dates"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
|
||||
# 跳过前252天
|
||||
skip_days = 252
|
||||
if len(unique_dates) > skip_days:
|
||||
start_date = unique_dates[skip_days]
|
||||
train_df_skipped = train_df.filter(pl.col("trade_date") >= start_date)
|
||||
print(f" 跳过前{skip_days}天后,从 {start_date} 开始")
|
||||
print(f" 跳过后训练集形状: {train_df_skipped.shape}")
|
||||
print(f" 跳过记录数: {len(train_df) - len(train_df_skipped)}")
|
||||
else:
|
||||
train_df_skipped = train_df
|
||||
print(
|
||||
f" [警告] 训练集交易日数({len(unique_dates)})少于跳过天数({skip_days}),未跳过"
|
||||
)
|
||||
|
||||
nan_stats = self._check_nan_in_df(
|
||||
train_df_skipped, feature_cols, "训练集(跳过252天)"
|
||||
)
|
||||
|
||||
return {
|
||||
"nan_stats": nan_stats,
|
||||
"train_df": train_df_skipped,
|
||||
}
|
||||
|
||||
def test_step_5_after_null_filler(self, base_data):
|
||||
"""步骤5: 检查 NullFiller 后的 NaN"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤5] 检查 NullFiller 后的 NaN")
|
||||
|
||||
step4_result = self.test_step_4_train_split_with_skip(base_data)
|
||||
train_df = step4_result["train_df"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
|
||||
print(f" 处理前数据形状: {train_df.shape}")
|
||||
|
||||
# 应用 NullFiller
|
||||
null_filler = NullFiller(
|
||||
feature_cols=feature_cols, strategy="mean", by_date=True
|
||||
)
|
||||
after_null = null_filler.fit_transform(train_df)
|
||||
|
||||
print(f" 处理后数据形状: {after_null.shape}")
|
||||
|
||||
nan_stats = self._check_nan_in_df(after_null, feature_cols, "NullFiller后")
|
||||
|
||||
# 检查哪些列还有 NaN
|
||||
if nan_stats["cols_with_nan"]:
|
||||
print(
|
||||
f" [错误] NullFiller 后仍有 {len(nan_stats['cols_with_nan'])} 列含 NaN:"
|
||||
)
|
||||
for col in nan_stats["cols_with_nan"][:10]:
|
||||
count = after_null[col].null_count()
|
||||
dtype = after_null[col].dtype
|
||||
print(f" {col}: {count} 个 NaN, dtype={dtype}")
|
||||
|
||||
return {
|
||||
"nan_stats": nan_stats,
|
||||
"after_null": after_null,
|
||||
}
|
||||
|
||||
def test_step_6_after_winsorizer(self, base_data):
|
||||
"""步骤6: 检查 Winsorizer 后的 NaN"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤6] 检查 Winsorizer 后的 NaN")
|
||||
|
||||
step5_result = self.test_step_5_after_null_filler(base_data)
|
||||
after_null = step5_result["after_null"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
|
||||
# 应用 Winsorizer
|
||||
winsorizer = Winsorizer(
|
||||
feature_cols=feature_cols, lower=0.01, upper=0.99, by_date=False
|
||||
)
|
||||
after_winsor = winsorizer.fit_transform(after_null)
|
||||
|
||||
nan_stats = self._check_nan_in_df(after_winsor, feature_cols, "Winsorizer后")
|
||||
|
||||
# 检查哪些列还有 NaN
|
||||
if nan_stats["cols_with_nan"]:
|
||||
print(
|
||||
f" [错误] Winsorizer 后仍有 {len(nan_stats['cols_with_nan'])} 列含 NaN:"
|
||||
)
|
||||
for col in nan_stats["cols_with_nan"][:10]:
|
||||
count = after_winsor[col].null_count()
|
||||
dtype = after_winsor[col].dtype
|
||||
print(f" {col}: {count} 个 NaN, dtype={dtype}")
|
||||
|
||||
return {
|
||||
"nan_stats": nan_stats,
|
||||
"after_winsor": after_winsor,
|
||||
}
|
||||
|
||||
def test_step_7_after_standard_scaler(self, base_data):
|
||||
"""步骤7: 检查 StandardScaler 后的 NaN"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤7] 检查 StandardScaler 后的 NaN")
|
||||
|
||||
step6_result = self.test_step_6_after_winsorizer(base_data)
|
||||
after_winsor = step6_result["after_winsor"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
|
||||
# 在应用 StandardScaler 之前,检查那些后来出问题的列
|
||||
print("\n [预检查] StandardScaler 前,检查关键列...")
|
||||
problematic_cols = [
|
||||
"GTJA_alpha062",
|
||||
"GTJA_alpha073",
|
||||
"GTJA_alpha085",
|
||||
"GTJA_alpha087",
|
||||
"GTJA_alpha092",
|
||||
"GTJA_alpha103",
|
||||
"GTJA_alpha104",
|
||||
"GTJA_alpha117",
|
||||
"GTJA_alpha124",
|
||||
"GTJA_alpha131",
|
||||
]
|
||||
for col in problematic_cols:
|
||||
if col in after_winsor.columns:
|
||||
null_count = after_winsor[col].null_count()
|
||||
dtype = after_winsor[col].dtype
|
||||
min_val = after_winsor[col].min()
|
||||
max_val = after_winsor[col].max()
|
||||
print(
|
||||
f" {col}: null={null_count}, dtype={dtype}, min={min_val}, max={max_val}"
|
||||
)
|
||||
|
||||
# 应用 StandardScaler
|
||||
scaler = StandardScaler(feature_cols=feature_cols)
|
||||
after_scaler = scaler.fit_transform(after_winsor)
|
||||
|
||||
# 检查 StandardScaler 学到的统计量
|
||||
print("\n [统计量检查] StandardScaler 学到的统计量...")
|
||||
for col in problematic_cols:
|
||||
if col in scaler.mean_:
|
||||
print(f" {col}: mean={scaler.mean_[col]}, std={scaler.std_[col]}")
|
||||
else:
|
||||
print(f" {col}: [未学到统计量]")
|
||||
|
||||
nan_stats = self._check_nan_in_df(
|
||||
after_scaler, feature_cols, "StandardScaler后"
|
||||
)
|
||||
|
||||
# 检查哪些列还有 NaN
|
||||
if nan_stats["cols_with_nan"]:
|
||||
print(
|
||||
f" [错误] StandardScaler 后仍有 {len(nan_stats['cols_with_nan'])} 列含 NaN:"
|
||||
)
|
||||
for col in nan_stats["cols_with_nan"][:10]:
|
||||
count = after_scaler[col].null_count()
|
||||
dtype = after_scaler[col].dtype
|
||||
print(f" {col}: {count} 个 NaN, dtype={dtype}")
|
||||
|
||||
# 检查这列是否学到了统计量
|
||||
if col in scaler.mean_:
|
||||
print(
|
||||
f" mean={scaler.mean_[col]:.4f}, std={scaler.std_[col]:.4f}"
|
||||
)
|
||||
else:
|
||||
print(f" [警告] 未学到统计量!")
|
||||
|
||||
return {
|
||||
"nan_stats": nan_stats,
|
||||
"after_scaler": after_scaler,
|
||||
"scaler": scaler,
|
||||
}
|
||||
|
||||
def test_step_8_extract_X(self, base_data):
|
||||
"""步骤8: 检查提取 X 后的 NaN(转换为 numpy)"""
|
||||
print("\n" + "=" * 80)
|
||||
print("[步骤8] 检查提取 X 后的 NaN")
|
||||
|
||||
step7_result = self.test_step_7_after_standard_scaler(base_data)
|
||||
after_scaler = step7_result["after_scaler"]
|
||||
feature_cols = base_data["feature_cols"]
|
||||
|
||||
# 提取 X
|
||||
X_df = after_scaler.select(feature_cols)
|
||||
print(f" X DataFrame 形状: {X_df.shape}")
|
||||
|
||||
# 对比 DataFrame 和 select 后的 null 数量
|
||||
print("\n [对比] DataFrame vs select 后的 null 数量:")
|
||||
mismatched = []
|
||||
for col in feature_cols[:20]: # 只检查前20个
|
||||
null_in_df = after_scaler[col].null_count()
|
||||
null_in_x = X_df[col].null_count()
|
||||
if null_in_df != null_in_x:
|
||||
mismatched.append((col, null_in_df, null_in_x))
|
||||
|
||||
if mismatched:
|
||||
print(f" [警告] 发现 {len(mismatched)} 列不匹配:")
|
||||
for col, df_null, x_null in mismatched[:10]:
|
||||
print(f" {col}: DataFrame={df_null}, X={x_null}")
|
||||
else:
|
||||
print(f" [通过] 所有列的 null 数量一致")
|
||||
|
||||
# 转换为 numpy
|
||||
X_np = X_df.to_numpy()
|
||||
print(f"\n X numpy 形状: {X_np.shape}")
|
||||
|
||||
nan_count = np.isnan(X_np).sum()
|
||||
print(f" X 中 NaN 总数: {nan_count}")
|
||||
|
||||
if nan_count > 0:
|
||||
# 找出哪些列有 NaN
|
||||
nan_by_col = []
|
||||
for i, col in enumerate(feature_cols):
|
||||
col_nan = np.isnan(X_np[:, i]).sum()
|
||||
if col_nan > 0:
|
||||
nan_by_col.append((col, col_nan))
|
||||
|
||||
print(f"\n [错误] 含 NaN 的特征列数: {len(nan_by_col)}")
|
||||
for col, count in nan_by_col[:10]:
|
||||
# 检查原始 DataFrame 中的情况
|
||||
df_null = after_scaler[col].null_count()
|
||||
dtype = after_scaler[col].dtype
|
||||
|
||||
# 检查是否有 Infinity
|
||||
inf_count_pos = (after_scaler[col] == float("inf")).sum()
|
||||
inf_count_neg = (after_scaler[col] == float("-inf")).sum()
|
||||
|
||||
# 检查 min/max
|
||||
col_min = after_scaler[col].min()
|
||||
col_max = after_scaler[col].max()
|
||||
|
||||
print(
|
||||
f" {col}: numpy中{count}个NaN, DataFrame中{df_null}个null, dtype={dtype}"
|
||||
)
|
||||
print(f" min={col_min}, max={col_max}")
|
||||
print(f" +inf={inf_count_pos}, -inf={inf_count_neg}")
|
||||
|
||||
# 如果有 inf,显示一些样本值
|
||||
if inf_count_pos > 0 or inf_count_neg > 0:
|
||||
sample_vals = after_scaler[col].drop_nulls().tail(5).to_list()
|
||||
print(f" 样本值: {sample_vals}")
|
||||
|
||||
# 断言失败,显示详细信息
|
||||
assert False, f"X 中含 {nan_count} 个 NaN,涉及 {len(nan_by_col)} 个特征列"
|
||||
else:
|
||||
print("\n [通过] X 中无 NaN!")
|
||||
|
||||
def _check_nan_in_df(
|
||||
self, df: pl.DataFrame, feature_cols: list, step_name: str
|
||||
) -> dict:
|
||||
"""检查 DataFrame 中的 NaN 统计信息
|
||||
|
||||
Returns:
|
||||
dict: {
|
||||
'total_nan': 总NaN数,
|
||||
'cols_with_nan': 含NaN的列名列表,
|
||||
'nan_by_col': {列名: NaN数} 的字典
|
||||
}
|
||||
"""
|
||||
nan_by_col = {}
|
||||
total_nan = 0
|
||||
|
||||
for col in feature_cols:
|
||||
null_count = df[col].null_count()
|
||||
if null_count > 0:
|
||||
nan_by_col[col] = null_count
|
||||
total_nan += null_count
|
||||
|
||||
cols_with_nan = list(nan_by_col.keys())
|
||||
|
||||
print(f" {step_name}:")
|
||||
print(f" 总记录数: {len(df)}")
|
||||
print(f" 特征列数: {len(feature_cols)}")
|
||||
print(f" 总NaN数: {total_nan}")
|
||||
print(f" 含NaN的列数: {len(cols_with_nan)}")
|
||||
|
||||
if cols_with_nan and len(cols_with_nan) <= 5:
|
||||
print(f" 含NaN的列: {cols_with_nan}")
|
||||
elif cols_with_nan:
|
||||
print(f" 含NaN的列(前5): {cols_with_nan[:5]}...")
|
||||
|
||||
return {
|
||||
"total_nan": total_nan,
|
||||
"cols_with_nan": cols_with_nan,
|
||||
"nan_by_col": nan_by_col,
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this diagnostic module directly; -s disables pytest's output
    # capture so the print-based diagnostics above remain visible.
    pytest.main([__file__, "-v", "-s"])
310
tests/test_tabm_integration.py
Normal file
310
tests/test_tabm_integration.py
Normal file
@@ -0,0 +1,310 @@
|
||||
"""TabM 集成测试
|
||||
|
||||
测试 TabMModel 和 TabMRegressionTask 的完整训练流程。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
# 确保 src 在路径中
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from src.training.components.models import TabMModel
|
||||
from src.training.tasks import TabMRegressionTask
|
||||
|
||||
|
||||
# ==========================================
|
||||
# 测试数据准备
|
||||
# ==========================================
|
||||
|
||||
|
||||
def create_sample_data(n_samples: int = 1000, n_features: int = 20, seed: int = 42):
    """Create random sample data for the tests.

    Args:
        n_samples: number of training rows (val/test each get half as many)
        n_features: number of feature columns
        seed: random seed for reproducibility

    Returns:
        (X_train, y_train, X_val, y_val, X_test, y_test)
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    column_names = [f"feature_{i}" for i in range(n_features)]

    def _make_split(rows: int):
        # One (features, target) pair; draws X first, then y, matching the
        # original call order so the generated values are identical.
        features = pl.DataFrame(
            np.random.randn(rows, n_features).astype(np.float32),
            schema=column_names,
        )
        target = pl.Series("target", np.random.randn(rows).astype(np.float32))
        return features, target

    X_train, y_train = _make_split(n_samples)
    X_val, y_val = _make_split(n_samples // 2)
    X_test, y_test = _make_split(n_samples // 2)

    return X_train, y_train, X_val, y_val, X_test, y_test
||||
|
||||
# ==========================================
|
||||
# TabMModel 测试
|
||||
# ==========================================
|
||||
|
||||
|
||||
class TestTabMModel:
    """Unit tests for TabMModel: init, fit/predict, save/load round-trip."""

    def test_initialization(self):
        """Construction must store params and choose a device, without training."""
        params = {
            "n_blocks": 2,
            "d_block": 128,
            "ensemble_size": 8,  # small ensemble keeps the test fast
            "batch_size": 64,
            "epochs": 2,
        }

        model = TabMModel(params)

        assert model.name == "tabm"
        assert model.params == params
        assert model.device.type in ["cuda", "cpu"]
        assert model.model is None  # the network is only built by fit()

    def test_fit_and_predict(self):
        """Train on a tiny dataset, then predict on held-out rows."""
        # Small dataset so the test completes quickly.
        X_train, y_train, X_val, y_val, X_test, _ = create_sample_data(
            n_samples=200, n_features=10, seed=42
        )

        params = {
            "n_blocks": 1,
            "d_block": 64,
            "ensemble_size": 4,
            "batch_size": 32,
            "epochs": 2,
            "early_stopping_patience": 10,
        }

        model = TabMModel(params)

        # Train with an eval set so validation-loss tracking runs.
        model.fit(X_train, y_train, eval_set=(X_val, y_val))

        # Model must be built and at least one epoch recorded.
        assert model.model is not None
        assert len(model.training_history_["train_loss"]) > 0

        # Predict.
        predictions = model.predict(X_test)

        # Predictions are a flat numpy array with one value per test row.
        assert isinstance(predictions, np.ndarray)
        assert len(predictions) == len(X_test)
        assert predictions.shape == (len(X_test),)

    def test_save_and_load(self, tmp_path):
        """Saving then loading must restore params, features and predictions."""
        # Create data.
        X_train, y_train, X_val, y_val, _, _ = create_sample_data(
            n_samples=200, n_features=10, seed=42
        )

        params = {
            "n_blocks": 1,
            "d_block": 64,
            "ensemble_size": 4,
            "batch_size": 32,
            "epochs": 2,
        }

        # Train a model.
        model = TabMModel(params)
        model.fit(X_train, y_train, eval_set=(X_val, y_val))

        # Save to a pytest-managed temporary directory.
        save_path = tmp_path / "test_tabm_model"
        model.save(str(save_path))

        # Load.
        loaded_model = TabMModel.load(str(save_path))

        # Validate the loaded model's state.
        assert loaded_model.params == params
        assert loaded_model.feature_names_ == model.feature_names_
        assert loaded_model.model is not None

        # The round-tripped model must produce numerically identical output.
        pred1 = model.predict(X_val)
        pred2 = loaded_model.predict(X_val)

        np.testing.assert_allclose(pred1, pred2, rtol=1e-5)
||||
|
||||
# ==========================================
|
||||
# TabMRegressionTask 测试
|
||||
# ==========================================
|
||||
|
||||
|
||||
class TestTabMRegressionTask:
    """Unit tests for TabMRegressionTask: init, label passthrough, fit/predict."""

    def test_initialization(self):
        """Task construction must store params and label name, model unbuilt."""
        params = {
            "n_blocks": 2,
            "d_block": 128,
            "ensemble_size": 8,
            "batch_size": 64,
            "epochs": 2,
        }

        task = TabMRegressionTask(model_params=params, label_name="target")

        assert task.model_params == params
        assert task.label_name == "target"
        assert task.model is None

    def test_prepare_labels(self):
        """Label preparation (regression tasks must not transform labels)."""
        params = {
            "ensemble_size": 4,
            "epochs": 2,
        }

        task = TabMRegressionTask(model_params=params, label_name="target")

        # Minimal test payload.
        data = {
            "train": {
                "X": pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}),
                "y": pl.Series("target", [0.1, 0.2, 0.3]),
            }
        }

        result = task.prepare_labels(data)

        # For regression no transformation happens — the data is unchanged.
        assert result == data

    def test_fit_train_and_predict(self):
        """Full train-then-predict flow through the task interface."""
        # Create data.
        X_train, y_train, X_val, y_val, X_test, y_test = create_sample_data(
            n_samples=300, n_features=10, seed=42
        )

        params = {
            "n_blocks": 1,
            "d_block": 64,
            "ensemble_size": 4,
            "batch_size": 32,
            "epochs": 3,
        }

        task = TabMRegressionTask(model_params=params, label_name="target")

        # Package the splits in the dict format the task expects.
        train_data = {"X": X_train, "y": y_train}
        val_data = {"X": X_val, "y": y_val}

        # Train.
        task.fit(train_data, val_data)

        # The underlying model must have been built.
        assert task.get_model() is not None

        # Predict.
        predictions = task.predict({"X": X_test})

        # One prediction per test row.
        assert len(predictions) == len(X_test)
||||
|
||||
# ==========================================
|
||||
# 集成测试
|
||||
# ==========================================
|
||||
|
||||
|
||||
class TestTabMIntegration:
    """End-to-end integration tests for the TabM task + model stack."""

    def test_full_workflow(self):
        """Full workflow: data -> task -> fit -> history -> predict -> checks."""
        # Create data.
        X_train, y_train, X_val, y_val, X_test, y_test = create_sample_data(
            n_samples=500, n_features=15, seed=42
        )

        params = {
            "n_blocks": 2,
            "d_block": 128,
            "ensemble_size": 8,
            "batch_size": 64,
            "epochs": 5,
        }

        # 1. Create the task.
        task = TabMRegressionTask(model_params=params, label_name="target")

        # 2. Prepare data in the task's expected dict format.
        train_data = {"X": X_train, "y": y_train}
        val_data = {"X": X_val, "y": y_val}

        # 3. Train.
        task.fit(train_data, val_data)

        # 4. Training history must record both train and validation losses.
        model = task.get_model()
        assert len(model.training_history_["train_loss"]) > 0
        assert len(model.training_history_["val_loss"]) > 0

        # 5. Predict.
        predictions = task.predict({"X": X_test})

        # 6. Quality checks.
        # Predictions must not collapse to a constant.
        assert np.std(predictions) > 1e-6, "预测值全为常数,可能是模型未正常训练"

        # Report correlation with the targets — informational only, since
        # the targets are random and near-zero correlation is expected.
        correlation = np.corrcoef(predictions, y_test.to_numpy())[0, 1]
        print(f"预测与真实值相关系数: {correlation:.4f}")

    def test_gpu_availability(self):
        """The model must select CUDA when available, CPU otherwise."""
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        params = {
            "ensemble_size": 2,
            "epochs": 1,
        }

        model = TabMModel(params)

        assert model.device == device
        expected_type = "cuda" if torch.cuda.is_available() else "cpu"
        assert model.device.type == expected_type
||||
|
||||
# ==========================================
|
||||
# 运行测试
|
||||
# ==========================================
|
||||
|
||||
if __name__ == "__main__":
    # Run the integration tests directly with short tracebacks.
    pytest.main([__file__, "-v", "--tb=short"])
|
||||
519
tests/test_tabm_nan_debug.py
Normal file
519
tests/test_tabm_nan_debug.py
Normal file
@@ -0,0 +1,519 @@
|
||||
"""TabM NaN 问题诊断测试 + train_skip_days 功能验证
|
||||
|
||||
诊断 loss 为 nan 的根因:
|
||||
1. 标签中是否有 NaN 或极端值
|
||||
2. 标准化后是否有 NaN
|
||||
3. 是否有方差为0或接近0的列
|
||||
4. train_skip_days 功能是否正常工作
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from src.factors import FactorEngine
|
||||
from src.training import (
|
||||
FactorManager,
|
||||
DataPipeline,
|
||||
TabMRegressionTask,
|
||||
NullFiller,
|
||||
Winsorizer,
|
||||
StandardScaler,
|
||||
)
|
||||
from src.training.components.filters import STFilter
|
||||
from src.experiment.common import (
|
||||
SELECTED_FACTORS,
|
||||
FACTOR_DEFINITIONS,
|
||||
LABEL_NAME,
|
||||
LABEL_FACTOR,
|
||||
stock_pool_filter,
|
||||
STOCK_FILTER_REQUIRED_COLUMNS,
|
||||
)
|
||||
|
||||
|
||||
# TabM 模型参数(简化用于快速测试)
|
||||
# TabM model parameters, deliberately reduced for fast diagnostic runs.
MODEL_PARAMS = {
    "n_blocks": 2,
    "d_block": 128,
    "dropout": 0.1,
    "ensemble_size": 4,  # reduced ensemble for speed
    "batch_size": 256,
    "learning_rate": 1e-4,  # lowered learning rate to reduce NaN-loss risk
    "weight_decay": 1e-5,
    "epochs": 3,
    "early_stopping_patience": 5,
}

# Date windows (YYYYMMDD) shared by every test in this module.
TEST_DATE_RANGE = {
    "train": ("20200101", "20200630"),
    "val": ("20200701", "20200731"),
    "test": ("20200801", "20200831"),
}

# Factors excluded so the small-range diagnostic runs stay light-weight.
EXCLUDED_FACTORS = [
    "GTJA_alpha001",
    "GTJA_alpha002",
    "GTJA_alpha003",
    "GTJA_alpha004",
    "GTJA_alpha005",
    "GTJA_alpha006",
    "GTJA_alpha007",
    "GTJA_alpha008",
    "GTJA_alpha009",
    "GTJA_alpha010",
]
|
||||
|
||||
class TestTabMNanDebug:
    """Diagnostics for the TabM NaN-loss problem, driven through DataPipeline.

    Covers: pipeline output structure, label quality, processed feature
    quality, the train_skip_days option, and a short real training run.
    """

    @pytest.fixture(scope="class")
    def engine_and_factor_manager(self):
        """Build the FactorEngine and FactorManager shared by this class."""
        engine = FactorEngine()

        factor_manager = FactorManager(
            selected_factors=SELECTED_FACTORS,
            factor_definitions=FACTOR_DEFINITIONS,
            label_factor=LABEL_FACTOR,
            excluded_factors=EXCLUDED_FACTORS,
        )

        return {
            "engine": engine,
            "factor_manager": factor_manager,
        }

    @pytest.fixture(scope="class")
    def pipeline_data(self, engine_and_factor_manager):
        """Prepare data via DataPipeline and drop all-NaN columns / NaN labels.

        NOTE(review): mutates the dicts returned by prepare_data in place;
        later tests in this class see the cleaned data.
        """
        engine = engine_and_factor_manager["engine"]
        factor_manager = engine_and_factor_manager["factor_manager"]

        # Pipeline with train_skip_days=0 for the baseline tests.
        pipeline = DataPipeline(
            factor_manager=factor_manager,
            processor_configs=[
                (NullFiller, {"strategy": "mean"}),
                (Winsorizer, {"lower": 0.01, "upper": 0.99}),
                (StandardScaler, {}),
            ],
            filters=[STFilter(data_router=engine.router)],
            stock_pool_filter_func=stock_pool_filter,
            stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
            train_skip_days=0,  # do not skip any days in the baseline run
        )

        # Prepare the data splits.
        data = pipeline.prepare_data(
            engine=engine,
            date_range=TEST_DATE_RANGE,
            label_name=LABEL_NAME,
            verbose=True,
        )

        # Feature column names as registered on the engine.
        feature_cols = factor_manager.register_to_engine(engine, verbose=False)

        # Drop columns that are entirely NaN (factors that failed to compute
        # — nothing to diagnose there) and rows whose label is NaN.
        print("\n[DataPipeline] 检查并过滤全为NaN的特征列...")
        for split_name in ["train", "val", "test"]:
            X_df = data[split_name]["X"]
            y_series = data[split_name]["y"]
            raw_data = data[split_name]["raw_data"]

            # Remove rows with NaN labels.
            y_nan_count = y_series.null_count()
            if y_nan_count > 0:
                print(f" {split_name}: 发现 {y_nan_count} 个标签为NaN的行,将被删除")
                # Mask of rows whose label is present.
                valid_mask = y_series.is_not_null()
                # Filter every related table with the same mask.
                X_df = X_df.filter(valid_mask)
                y_series = y_series.filter(valid_mask)
                raw_data = raw_data.filter(valid_mask)
                # Write the filtered tables back.
                data[split_name]["X"] = X_df
                data[split_name]["y"] = y_series
                data[split_name]["raw_data"] = raw_data

            # Per-column null counts.
            nan_counts = {col: X_df[col].null_count() for col in X_df.columns}
            total_rows = len(X_df)

            # Columns where every value is null.
            all_nan_cols = [
                col for col, count in nan_counts.items() if count == total_rows
            ]

            if all_nan_cols:
                print(
                    f" {split_name}: 发现 {len(all_nan_cols)} 个全为NaN的列,将被删除"
                )
                print(
                    f" 列名: {all_nan_cols[:5]}{'...' if len(all_nan_cols) > 5 else ''}"
                )

                # Keep feature_cols consistent with the dropped columns.
                feature_cols = [c for c in feature_cols if c not in all_nan_cols]

                # Drop the columns from X.
                X_df = X_df.select(feature_cols)

                # Drop them from raw_data too, but keep non-feature columns
                # such as trade_date / ts_code.
                raw_cols_to_keep = [
                    c for c in raw_data.columns if c not in all_nan_cols
                ]
                raw_data = raw_data.select(raw_cols_to_keep)

                # Write the cleaned tables back.
                data[split_name]["X"] = X_df
                data[split_name]["raw_data"] = raw_data
                data[split_name]["feature_cols"] = feature_cols

            # Verify no NaN survives the cleaning.
            X_np = X_df.to_numpy()
            nan_count = np.isnan(X_np).sum()
            assert nan_count == 0, f"{split_name} 中仍有 {nan_count} 个NaN"

            print(f" 过滤后特征数: {len(feature_cols)}")
            print(" [通过] 所有特征列均无NaN")

        return {
            "pipeline": pipeline,
            "data": data,
            "feature_cols": feature_cols,
            "engine": engine,
        }

    def test_pipeline_data_structure(self, pipeline_data):
        """Diagnostic 0: shape/key checks on what DataPipeline returned."""
        data = pipeline_data["data"]
        feature_cols = pipeline_data["feature_cols"]

        print("\n[诊断0] DataPipeline 数据结构检查:")

        # All three splits must be present.
        assert "train" in data, "缺少 train 数据"
        assert "val" in data, "缺少 val 数据"
        assert "test" in data, "缺少 test 数据"

        for split_name in ["train", "val", "test"]:
            split_data = data[split_name]
            assert "X" in split_data, f"{split_name} 缺少 X"
            assert "y" in split_data, f"{split_name} 缺少 y"
            assert "raw_data" in split_data, f"{split_name} 缺少 raw_data"
            assert "feature_cols" in split_data, f"{split_name} 缺少 feature_cols"

            X = split_data["X"]
            y = split_data["y"]

            print(f" {split_name}:")
            print(f" X 形状: {X.shape}")
            print(f" y 形状: {len(y)}")
            print(f" 特征数: {len(split_data['feature_cols'])}")

            # Dimension consistency between X, y and the feature list.
            assert X.shape[0] == len(y), f"{split_name} X 和 y 行数不匹配"
            assert X.shape[1] == len(feature_cols), f"{split_name} 特征数不匹配"

        print(" [通过] 数据结构正确")

    def test_label_quality_with_pipeline(self, pipeline_data):
        """Diagnostic 1: labels must be finite (no NaN/inf) in every split."""
        data = pipeline_data["data"]

        # Check label quality across all splits.
        for split_name in ["train", "val", "test"]:
            y = data[split_name]["y"]
            y_np = y.to_numpy()

            print(f"\n[诊断1-{split_name}] 标签数据质量:")
            print(f" 总数: {len(y)}")
            print(f" NaN数量: {y.null_count()}")
            print(f" 均值: {y.mean():.6f}")
            print(f" 标准差: {y.std():.6f}")
            print(f" 最小值: {y.min():.6f}")
            print(f" 最大值: {y.max():.6f}")

            inf_count = np.isinf(y_np).sum()
            nan_count = np.isnan(y_np).sum()
            print(f" inf数量: {inf_count}")
            print(f" nan数量: {nan_count}")

            assert inf_count == 0, f"{split_name} 标签含 inf: {inf_count}"
            assert nan_count == 0, f"{split_name} 标签含 nan: {nan_count}"

    def test_processed_data_quality_with_pipeline(self, pipeline_data):
        """Diagnostic 2: processed features must be finite after float32 cast."""
        data = pipeline_data["data"]
        feature_cols = pipeline_data["feature_cols"]

        for split_name in ["train", "val", "test"]:
            X = data[split_name]["X"]
            y = data[split_name]["y"]

            # Cast to float32 — the dtype the model trains in.
            X_np = X.to_numpy().astype(np.float32)
            y_np = y.to_numpy().astype(np.float32)

            print(f"\n[诊断2-{split_name}] 处理后数据质量:")
            print(f" X 形状: {X_np.shape}, dtype: {X_np.dtype}")
            print(f" y 形状: {y_np.shape}, dtype: {y_np.dtype}")
            print(f" X中NaN: {np.isnan(X_np).sum()}")
            print(f" X中Inf: {np.isinf(X_np).sum()}")
            print(f" y中NaN: {np.isnan(y_np).sum()}")
            print(f" y中Inf: {np.isinf(y_np).sum()}")

            assert np.isnan(X_np).sum() == 0, f"{split_name} X含NaN"
            assert np.isnan(y_np).sum() == 0, f"{split_name} y含NaN"
            assert np.isinf(X_np).sum() == 0, f"{split_name} X含Inf"
            assert np.isinf(y_np).sum() == 0, f"{split_name} y含Inf"

    def test_train_skip_days_functionality(self, engine_and_factor_manager):
        """Diagnostic 3: train_skip_days must trim only the train split's head."""
        engine = engine_and_factor_manager["engine"]
        factor_manager = engine_and_factor_manager["factor_manager"]

        print("\n[诊断3] train_skip_days 功能验证:")

        # Pipeline that skips the first 50 trading days of training data.
        skip_days = 50
        pipeline_with_skip = DataPipeline(
            factor_manager=factor_manager,
            processor_configs=[
                (NullFiller, {"strategy": "mean"}),
                (Winsorizer, {"lower": 0.01, "upper": 0.99}),
                (StandardScaler, {}),
            ],
            filters=[STFilter(data_router=engine.router)],
            stock_pool_filter_func=stock_pool_filter,
            stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
            train_skip_days=skip_days,
        )

        # Prepare the skipped variant.
        data_with_skip = pipeline_with_skip.prepare_data(
            engine=engine,
            date_range=TEST_DATE_RANGE,
            label_name=LABEL_NAME,
            verbose=True,
        )

        # Baseline pipeline (no skipping) for comparison.
        pipeline_no_skip = DataPipeline(
            factor_manager=factor_manager,
            processor_configs=[
                (NullFiller, {"strategy": "mean"}),
                (Winsorizer, {"lower": 0.01, "upper": 0.99}),
                (StandardScaler, {}),
            ],
            filters=[STFilter(data_router=engine.router)],
            stock_pool_filter_func=stock_pool_filter,
            stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
            train_skip_days=0,
        )

        data_no_skip = pipeline_no_skip.prepare_data(
            engine=engine,
            date_range=TEST_DATE_RANGE,
            label_name=LABEL_NAME,
            verbose=False,
        )

        # The skipped variant must contain less (or equal) training data.
        train_with_skip = data_with_skip["train"]["raw_data"]
        train_no_skip = data_no_skip["train"]["raw_data"]

        print(f"\n 对比结果:")
        print(f" 不跳过时训练数据: {len(train_no_skip)} 条")
        print(f" 跳过{skip_days}天后: {len(train_with_skip)} 条")
        print(f" 减少: {len(train_no_skip) - len(train_with_skip)} 条")

        # val and test must be unaffected by train_skip_days.
        assert len(data_with_skip["val"]["raw_data"]) == len(
            data_no_skip["val"]["raw_data"]
        ), "val 数据不应该受 train_skip_days 影响"
        assert len(data_with_skip["test"]["raw_data"]) == len(
            data_no_skip["test"]["raw_data"]
        ), "test 数据不应该受 train_skip_days 影响"

        # Verify the leading trading days were actually removed.
        if len(train_no_skip) > 0:
            dates_no_skip = sorted(train_no_skip["trade_date"].unique())
            dates_with_skip = sorted(train_with_skip["trade_date"].unique())

            print(f"\n 日期对比:")
            print(
                f" 不跳过 - 最早日期: {dates_no_skip[0]}, 共 {len(dates_no_skip)} 个交易日"
            )
            print(
                f" 跳过 - 最早日期: {dates_with_skip[0]}, 共 {len(dates_with_skip)} 个交易日"
            )

            # The skipped data must start exactly skip_days trading days later.
            if len(dates_no_skip) > skip_days:
                expected_start_date = dates_no_skip[skip_days]
                assert dates_with_skip[0] == expected_start_date, (
                    f"预期从 {expected_start_date} 开始,实际从 {dates_with_skip[0]} 开始"
                )
                print(f" [通过] 正确跳过前 {skip_days} 个交易日")

    def test_training_with_pipeline(self, pipeline_data):
        """Diagnostic 4: a short real TabM training run must not produce NaN loss."""
        import torch
        from torch.utils.data import DataLoader, TensorDataset
        from tabm import TabM

        data = pipeline_data["data"]
        feature_cols = pipeline_data["feature_cols"]

        # Training split.
        X_train_df = data["train"]["X"]
        y_train_series = data["train"]["y"]

        # Drop rows whose label is NaN (defensive; the fixture already does this).
        valid_mask = y_train_series.is_not_null()
        y_nan_count = y_train_series.null_count()
        if y_nan_count > 0:
            print(f" 发现 {y_nan_count} 个标签为NaN的行,将被删除")
            X_train_df = X_train_df.filter(valid_mask)
            y_train_series = y_train_series.filter(valid_mask)

        X_train = X_train_df.to_numpy().astype(np.float32)
        y_train = y_train_series.to_numpy().astype(np.float32)

        # Only the first 1000 rows, to keep the test fast.
        X_train = X_train[:1000]
        y_train = y_train[:1000]

        print(f"\n[诊断4] DataPipeline 数据训练测试:")
        print(f" X 形状: {X_train.shape}")
        print(f" y 形状: {y_train.shape}")
        print(f" X中NaN: {np.isnan(X_train).sum()}, Inf: {np.isinf(X_train).sum()}")
        print(f" y中NaN: {np.isnan(y_train).sum()}, Inf: {np.isinf(y_train).sum()}")

        # Build a small TabM model (k=4 ensemble members).
        n_features = X_train.shape[1]
        model = TabM.make(
            n_num_features=n_features,
            cat_cardinalities=[],
            d_out=1,
            n_blocks=2,
            d_block=128,
            dropout=0.1,
            k=4,
        )

        # One short epoch over a few batches.
        dataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
        loader = DataLoader(dataset, batch_size=256, shuffle=True)

        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
        criterion = torch.nn.MSELoss()

        model.train()
        losses = []

        for batch_idx, (bx, by) in enumerate(loader):
            optimizer.zero_grad()
            outputs = model(bx)  # [B, E, 1]
            outputs_squeezed = outputs.squeeze(-1)  # [B, E]
            # Targets broadcast to every ensemble member (E=4 matches k above).
            by_expanded = by.unsqueeze(-1).expand(-1, 4)  # [B, E]
            loss = criterion(outputs_squeezed, by_expanded)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
            print(f" 批次 {batch_idx + 1} loss: {loss.item():.6f}")

            # The core diagnostic: loss must never become NaN.
            assert not torch.isnan(loss), f"批次 {batch_idx + 1} loss 为 nan!"

            if batch_idx >= 2:  # only the first 3 batches
                break

        print(f" [通过] 所有批次 loss 正常,无 NaN")
||||
|
||||
class TestTrainSkipDaysEdgeCases:
    """Edge-case tests for DataPipeline's train_skip_days option."""

    @pytest.fixture(scope="class")
    def engine_and_factor_manager(self):
        """Build a fresh FactorEngine and FactorManager for this class."""
        engine = FactorEngine()

        factor_manager = FactorManager(
            selected_factors=SELECTED_FACTORS,
            factor_definitions=FACTOR_DEFINITIONS,
            label_factor=LABEL_FACTOR,
            excluded_factors=EXCLUDED_FACTORS,
        )

        return {
            "engine": engine,
            "factor_manager": factor_manager,
        }

    def test_skip_more_than_available_days(self, engine_and_factor_manager):
        """Skipping more days than exist must warn, not crash."""
        engine = engine_and_factor_manager["engine"]
        factor_manager = engine_and_factor_manager["factor_manager"]

        print("\n[边界测试] 跳过天数超过可用天数:")

        # Skip count far exceeding the trading days in TEST_DATE_RANGE.
        pipeline = DataPipeline(
            factor_manager=factor_manager,
            processor_configs=[(NullFiller, {"strategy": "mean"})],
            filters=[STFilter(data_router=engine.router)],
            stock_pool_filter_func=stock_pool_filter,
            stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
            train_skip_days=1000,  # more than the trading days in the test window
        )

        # prepare_data should run normally and only emit a warning.
        data = pipeline.prepare_data(
            engine=engine,
            date_range=TEST_DATE_RANGE,
            label_name=LABEL_NAME,
            verbose=True,
        )

        # Even with an oversized skip, data must still be returned: when the
        # window has fewer trading days than the skip count, everything is
        # kept and a warning is emitted instead of raising.
        train_data = data["train"]["raw_data"]
        print(f" 训练数据量: {len(train_data)} 条")
        print(f" [通过] 程序未崩溃")

    def test_skip_zero_days(self, engine_and_factor_manager):
        """train_skip_days=0 (i.e. no skipping) must behave normally."""
        engine = engine_and_factor_manager["engine"]
        factor_manager = engine_and_factor_manager["factor_manager"]

        print("\n[边界测试] 跳过0天:")

        pipeline = DataPipeline(
            factor_manager=factor_manager,
            processor_configs=[(NullFiller, {"strategy": "mean"})],
            filters=[STFilter(data_router=engine.router)],
            stock_pool_filter_func=stock_pool_filter,
            stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
            train_skip_days=0,
        )

        data = pipeline.prepare_data(
            engine=engine,
            date_range=TEST_DATE_RANGE,
            label_name=LABEL_NAME,
            verbose=False,
        )

        train_data = data["train"]["raw_data"]
        print(f" 训练数据量: {len(train_data)} 条")
        print(f" [通过] skip=0 时数据正常")
||||
|
||||
if __name__ == "__main__":
    # Run the diagnostics directly; -s keeps the print output visible.
    pytest.main([__file__, "-v", "-s"])
|
||||
Reference in New Issue
Block a user