Files
ProStock/tests/factors/test_factor_validation.py
liaozhaorun 0a16129548 feat(factors): 添加因子计算框架
- 新增因子基类 (BaseFactor, CrossSectionalFactor, TimeSeriesFactor)
- 新增数据规格和上下文类 (DataSpec, FactorContext, FactorData)
- 新增数据加载器 (DataLoader) 和执行引擎 (FactorEngine)
- 新增组合因子支持 (CompositeFactor, ScalarFactor)
- 添加因子模块完整测试用例
- 添加 Git 提交规范文档
2026-02-22 14:41:32 +08:00

398 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""因子真实数据测试 - 与 Polars 原生计算对比
测试目标:
1. 时序因子 - 移动平均线 (MA)
2. 截面因子 - PE_Rank市盈率排名
3. 结合因子 - 时序 * 截面组合
每个因子都与原始 Polars 计算进行对比验证。
"""
import pytest
import pandas as pd
import polars as pl
import numpy as np
from src.factors import DataSpec, FactorContext, FactorData
from src.factors.base import CrossSectionalFactor, TimeSeriesFactor
from src.factors.composite import CompositeFactor, ScalarFactor
# ========== 测试数据准备 ==========
@pytest.fixture(scope="module")
def daily_data():
    """Load the daily-bar test data set as a sorted Polars DataFrame.

    Reads the ``/daily`` table from ``data/daily.h5`` with pandas, keeps the
    rows whose ``trade_date`` lies between ``"20240101"`` and ``"20240430"``
    (both bounds inclusive), restricts them to the first 20 values returned
    by ``unique()`` on ``ts_code``, and converts the result to Polars.

    Returns:
        pl.DataFrame: the filtered rows, sorted by ``ts_code`` and then by
        ``trade_date``.
    """
    with pd.HDFStore("data/daily.h5", mode="r") as store:
        raw = store["/daily"]

    # Restrict to the test window; trade_date is compared against string bounds.
    on_or_after_start = raw["trade_date"] >= "20240101"
    on_or_before_end = raw["trade_date"] <= "20240430"
    windowed = raw[on_or_after_start & on_or_before_end]

    # Keep a small universe: the first 20 distinct stock codes in the window.
    picked_codes = windowed["ts_code"].unique()[:20]
    subset = windowed[windowed["ts_code"].isin(picked_codes)]

    # Hand the tests a Polars frame (no further pandas use downstream).
    return pl.from_pandas(subset).sort(["ts_code", "trade_date"])
# ========== 时序因子定义 ==========
class MAFactor(TimeSeriesFactor):
    """Moving-average factor (time-series factor).

    The window length is passed as ``period`` to the base-class constructor
    and read back from ``self.params`` when the factor is computed.
    """

    name = "ma_factor"

    # Columns of the "daily" data set that this factor declares it needs.
    # NOTE(review): lookback_days is fixed at 5 while ``period`` is
    # configurable — confirm whether the framework expects them to match.
    data_specs = [
        DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=5),
    ]

    def __init__(self, period: int = 5):
        """Create the factor with a rolling window of ``period`` observations."""
        super().__init__(period=period)

    def compute(self, data: FactorData) -> pl.Series:
        """Return the rolling mean of the ``close`` column.

        Args:
            data: factor input from which the ``close`` column is read.

        Returns:
            The result of ``rolling_mean`` over ``close`` with the configured
            ``period`` as window size.
        """
        return data.get_column("close").rolling_mean(self.params["period"])
class PERankFactor(CrossSectionalFactor):
    """Rank factor over one cross-section (cross-sectional factor).

    NOTE(review): despite the "PE" name, the visible code ranks the ``close``
    column rather than a price/earnings field — confirm this is intended.
    """

    name = "pe_rank_factor"

    # Columns of the "daily" data set that this factor declares it needs.
    data_specs = [
        DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=1),
    ]

    def compute(self, data: FactorData) -> pl.Series:
        """Return the rank of ``close`` divided by the cross-section length.

        Args:
            data: factor input whose cross-section provides the ``close`` column.

        Returns:
            ``close.rank()`` divided by ``close.len()``.
        """
        close_col = data.get_cross_section()["close"]
        ranks = close_col.rank()
        return ranks / close_col.len()
# ========== 测试用例 ==========
class TestTimeSeriesFactor:
    """Tests for time-series factors."""

    def test_ma_factor(self, daily_data):
        """Compare MAFactor against a native Polars rolling mean for one stock."""
        period = 5
        sample_stock = daily_data["ts_code"].to_list()[0]
        is_sample = pl.col("ts_code") == sample_stock
        stock_df = daily_data.filter(is_sample).sort("trade_date")

        # Baseline: rolling mean computed directly with Polars.
        ma_expr = pl.col("close").rolling_mean(window_size=period).over("ts_code")
        polars_result = stock_df.with_columns(ma_expr.alias("ma_polars"))

        # Same rows through the factor framework (trade_date cast to string).
        context = FactorContext(current_stock=sample_stock)
        framework_input = stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)])
        factor_data = FactorData(framework_input, context)
        factor_result = MAFactor(period=period).compute(factor_data).to_numpy()

        polars_values = polars_result["ma_polars"].to_numpy()

        # Compare only positions where the baseline is defined (not NaN).
        defined = ~np.isnan(polars_values)
        polars_valid = polars_values[defined]
        factor_valid = factor_result[defined]
        max_diff = np.max(np.abs(polars_valid - factor_valid))

        print(f"\n[时序因子 MA({period}) 对比]")
        print(f" 样本股票: {sample_stock}")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(" 样本数据 (前5个):")
        # Show at most the first five compared pairs.
        for expected, actual in zip(polars_valid[:5], factor_valid[:5]):
            print(
                f" Polars: {expected:.6f}, Factor: {actual:.6f}, Diff: {abs(expected - actual):.15f}"
            )

        assert max_diff < 1e-10, f"MA 因子计算差异过大: {max_diff}"
class TestCrossSectionalFactor:
    """Tests for cross-sectional factors."""

    def test_pe_rank_factor(self, daily_data):
        """Compare PERankFactor against a native Polars rank for one trade date.

        The sample date is picked from the *sorted* distinct trade dates so the
        same date is used on every run, and the index is clamped so a data
        sample with fewer than 51 trade dates does not raise ``IndexError``.
        """
        # unique() does not promise any ordering, so sort before indexing.
        trade_dates = daily_data["trade_date"].unique().sort().to_list()
        sample_date = trade_dates[min(50, len(trade_dates) - 1)]
        date_df = daily_data.filter(pl.col("trade_date") == sample_date)

        # Baseline: rank divided by the number of values, computed with Polars.
        # NOTE(review): the baseline divides by count() while PERankFactor
        # divides by len(); these may differ if ``close`` has nulls — confirm.
        polars_result = date_df.with_columns(
            (pl.col("close").rank() / pl.col("close").count()).alias("pe_rank_polars")
        )

        # Same rows through the factor framework (key columns cast to string).
        context = FactorContext(current_date=str(sample_date))
        factor_data = FactorData(
            date_df.with_columns(
                [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)]
            ),
            context,
        )
        factor_result = PERankFactor().compute(factor_data).to_numpy()

        polars_values = polars_result["pe_rank_polars"].to_numpy()
        max_diff = np.max(np.abs(polars_values - factor_result))

        print("\n[截面因子 PE_Rank 对比]")
        print(f" 样本日期: {sample_date}")
        print(f" 股票数量: {len(polars_values)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(" 样本数据 (前5个):")
        # Convert the code column once instead of on every loop iteration.
        sample_codes = polars_result["ts_code"].head(5).to_list()
        for ts_code, expected, actual in zip(sample_codes, polars_values, factor_result):
            print(f" {ts_code}: Polars: {expected:.6f}, Factor: {actual:.6f}")

        assert max_diff < 1e-10, f"PE_Rank 因子计算差异过大: {max_diff}"
class TestCompositeFactor:
    """Tests for composite (combined) factors."""

    def test_scalar_composite(self, daily_data):
        """Compare the scalar composite ``0.5 * MA`` with a Polars baseline."""
        period = 5
        # Index the column directly instead of materialising it as a list.
        sample_stock = daily_data["ts_code"][0]
        stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort(
            "trade_date"
        )

        # Baseline: half of the rolling mean computed directly with Polars.
        polars_ma = stock_df.with_columns(
            pl.col("close").rolling_mean(window_size=period).over("ts_code").alias("ma")
        )
        polars_combined = 0.5 * polars_ma["ma"].to_numpy()

        # Same rows through the factor framework (trade_date cast to string).
        context = FactorContext(current_stock=sample_stock)
        factor_data = FactorData(
            stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context
        )

        # Composite factor built with the multiplication operator: 0.5 * MA.
        scalar_factor = 0.5 * MAFactor(period=period)
        factor_result = scalar_factor.compute(factor_data).to_numpy()

        # Compare only positions where the baseline is defined (not NaN).
        valid_idx = ~np.isnan(polars_combined)
        polars_valid = polars_combined[valid_idx]
        factor_valid = factor_result[valid_idx]
        max_diff = np.max(np.abs(polars_valid - factor_valid))

        print(f"\n[结合因子 0.5*MA({period}) 对比]")
        print(f" 公式: 0.5 * MA({period})")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(" 样本数据 (前5个):")
        # Show at most the first five compared pairs.
        for expected, actual in zip(polars_valid[:5], factor_valid[:5]):
            print(
                f" Polars: {expected:.6f}, Factor: {actual:.6f}, Diff: {abs(expected - actual):.15f}"
            )

        assert max_diff < 1e-10, f"组合因子计算差异过大: {max_diff}"

    def test_factor_addition(self, daily_data):
        """Compare the additive composite ``MA(5) + MA(10)`` with a Polars baseline."""
        sample_stock = daily_data["ts_code"][0]
        stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort(
            "trade_date"
        )
        context = FactorContext(current_stock=sample_stock)

        # Baseline: both rolling means computed with Polars, then summed.
        polars_ma = stock_df.with_columns(
            pl.col("close").rolling_mean(window_size=5).over("ts_code").alias("ma5"),
            pl.col("close").rolling_mean(window_size=10).over("ts_code").alias("ma10"),
        )
        polars_combined = polars_ma["ma5"].to_numpy() + polars_ma["ma10"].to_numpy()

        # Same rows through the factor framework (trade_date cast to string).
        factor_data = FactorData(
            stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context
        )
        combined = MAFactor(period=5) + MAFactor(period=10)
        factor_result = combined.compute(factor_data).to_numpy()

        # Mask by the baseline only, as test_scalar_composite does. Masking out
        # NaNs of the framework result as well would hide a framework that
        # returns NaN where the baseline has a value; with this mask such a NaN
        # propagates into max_diff and the assertion below fails.
        valid_idx = ~np.isnan(polars_combined)
        polars_valid = polars_combined[valid_idx]
        factor_valid = factor_result[valid_idx]
        max_diff = np.max(np.abs(polars_valid - factor_valid))

        print("\n[结合因子 MA(5) + MA(10) 对比]")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")

        assert max_diff < 1e-10, f"因子加法组合差异过大: {max_diff}"
class TestFactorComparison:
    """Summary comparison across all factors."""

    def test_all_factors_summary(self, daily_data):
        """Print a pass/fail summary for every factor and fail if any mismatches.

        Covers MA with periods 5, 10 and 20 for one sample stock, and PE_Rank
        for one sample trade date. Each factor is compared with the equivalent
        native Polars computation. The whole summary is printed first; the
        test then fails if any factor exceeded the tolerance, so a reported
        failure can no longer pass silently.
        """
        tolerance = 1e-10
        failed = []  # labels of the factors whose max difference is too large

        print("\n" + "=" * 60)
        print("因子测试汇总")
        print("=" * 60)

        # Time-series factors: the sample stock and its rows do not depend on
        # the period, so they are prepared once outside the loop.
        sample_stock = daily_data["ts_code"][0]
        stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort(
            "trade_date"
        )
        framework_input = stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)])

        for period in [5, 10, 20]:
            polars_values = stock_df.with_columns(
                pl.col("close")
                .rolling_mean(window_size=period)
                .over("ts_code")
                .alias("ma")
            )["ma"].to_numpy()

            context = FactorContext(current_stock=sample_stock)
            factor_data = FactorData(framework_input, context)
            factor_result = MAFactor(period=period).compute(factor_data).to_numpy()

            # Compare only positions where the baseline is defined (not NaN).
            valid_idx = ~np.isnan(polars_values)
            max_diff = np.max(
                np.abs(polars_values[valid_idx] - factor_result[valid_idx])
            )
            passed = max_diff < tolerance
            if not passed:
                failed.append(f"MA({period})")
            status = "通过" if passed else "失败"
            print(f" MA({period}): 最大差异 = {max_diff:.2e} {status}")

        # Cross-sectional factor: unique() does not promise any ordering, so
        # sort before indexing to pick the same date on every run; the index
        # is clamped for samples with fewer than 51 trade dates.
        trade_dates = daily_data["trade_date"].unique().sort().to_list()
        sample_date = trade_dates[min(50, len(trade_dates) - 1)]
        date_df = daily_data.filter(pl.col("trade_date") == sample_date)

        polars_values = date_df.with_columns(
            (pl.col("close").rank() / pl.col("close").count()).alias("rank")
        )["rank"].to_numpy()

        context = FactorContext(current_date=str(sample_date))
        factor_data = FactorData(
            date_df.with_columns(
                [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)]
            ),
            context,
        )
        factor_result = PERankFactor().compute(factor_data).to_numpy()

        max_diff = np.max(np.abs(polars_values - factor_result))
        passed = max_diff < tolerance
        if not passed:
            failed.append("PE_Rank")
        status = "通过" if passed else "失败"
        print(f" PE_Rank: 最大差异 = {max_diff:.2e} {status}")
        print("=" * 60)

        # Assert only after the full summary has been printed.
        assert not failed, f"以下因子与 Polars 基准差异过大: {failed}"