- 新增因子基类 (BaseFactor, CrossSectionalFactor, TimeSeriesFactor) - 新增数据规格和上下文类 (DataSpec, FactorContext, FactorData) - 新增数据加载器 (DataLoader) 和执行引擎 (FactorEngine) - 新增组合因子支持 (CompositeFactor, ScalarFactor) - 添加因子模块完整测试用例 - 添加 Git 提交规范文档
398 lines
13 KiB
Python
398 lines
13 KiB
Python
"""因子真实数据测试 - 与 Polars 原生计算对比
|
||
|
||
测试目标:
|
||
1. 时序因子 - 移动平均线 (MA)
|
||
2. 截面因子 - PE_Rank(市盈率排名)
|
||
3. 结合因子 - 时序 * 截面组合
|
||
|
||
每个因子都与原始 Polars 计算进行对比验证。
|
||
"""
|
||
|
||
import pytest
|
||
import pandas as pd
|
||
import polars as pl
|
||
import numpy as np
|
||
from src.factors import DataSpec, FactorContext, FactorData
|
||
from src.factors.base import CrossSectionalFactor, TimeSeriesFactor
|
||
from src.factors.composite import CompositeFactor, ScalarFactor
|
||
|
||
|
||
# ========== 测试数据准备 ==========
|
||
|
||
|
||
@pytest.fixture(scope="module")
def daily_data():
    """Load a small slice of the daily-bar table as a Polars DataFrame.

    The table is read through a pandas HDF5 store, narrowed to trade dates
    between "20240101" and "20240430" (inclusive) and to the first 20
    distinct ``ts_code`` values found in that window, then converted to
    Polars and sorted by ``ts_code`` and ``trade_date``.

    Returns:
        A ``pl.DataFrame`` shared by every test in this module.
    """
    with pd.HDFStore("data/daily.h5", mode="r") as store:
        frame = store["/daily"]

    # Keep only rows inside the test date window.
    # NOTE(review): the bounds are strings, so this presumably relies on
    # trade_date being stored as YYYYMMDD text -- confirm against the data.
    in_window = (frame["trade_date"] >= "20240101") & (
        frame["trade_date"] <= "20240430"
    )
    frame = frame[in_window]

    # Restrict the sample to a handful of stocks (first 20 distinct codes).
    selected_codes = frame["ts_code"].unique()[:20]
    frame = frame[frame["ts_code"].isin(selected_codes)]

    # Hand back a Polars frame; the tests work on Polars, not pandas.
    return pl.from_pandas(frame).sort(["ts_code", "trade_date"])
|
||
|
||
|
||
# ========== 时序因子定义 ==========
|
||
|
||
|
||
class MAFactor(TimeSeriesFactor):
    """Time-series factor: rolling mean of the ``close`` column."""

    name = "ma_factor"
    # NOTE(review): lookback_days is fixed at 5 while ``period`` is
    # configurable -- confirm whether the loader needs lookback >= period.
    data_specs = [
        DataSpec(
            "daily",
            ["ts_code", "trade_date", "close"],
            lookback_days=5,
        )
    ]

    def __init__(self, period: int = 5):
        """Forward the averaging window to the base-class constructor.

        Args:
            period: Rolling window size passed to ``rolling_mean``
                (default 5).
        """
        super().__init__(period=period)

    def compute(self, data: FactorData) -> pl.Series:
        """Return the rolling mean of ``close`` using the configured period.

        Args:
            data: Factor input exposing the ``close`` column.

        Returns:
            The result of ``rolling_mean`` on the ``close`` series.
        """
        prices = data.get_column("close")
        window = self.params["period"]
        return prices.rolling_mean(window)
|
||
|
||
|
||
class PERankFactor(CrossSectionalFactor):
    """Cross-sectional rank factor.

    NOTE(review): despite the "PE" name, the implementation ranks the
    ``close`` column -- confirm whether a PE column was intended.
    """

    name = "pe_rank_factor"
    data_specs = [
        DataSpec(
            "daily",
            ["ts_code", "trade_date", "close"],
            lookback_days=1,
        )
    ]

    def compute(self, data: FactorData) -> pl.Series:
        """Return each row's ``close`` rank divided by the series length.

        Args:
            data: Factor input whose cross-section exposes ``close``.

        Returns:
            ``close.rank() / close.len()`` for the current cross-section.
        """
        prices = data.get_cross_section()["close"]
        return prices.rank() / prices.len()
|
||
|
||
|
||
# ========== 测试用例 ==========
|
||
|
||
|
||
class TestTimeSeriesFactor:
    """Tests for time-series factors."""

    def test_ma_factor(self, daily_data):
        """Check ``MAFactor`` against Polars' native rolling mean.

        One stock (the first ``ts_code`` in the fixture) is used. Rows where
        the Polars reference is NaN are excluded from the comparison, and the
        largest absolute difference must stay below 1e-10.

        Args:
            daily_data: Module-scoped Polars DataFrame fixture.
        """
        period = 5
        # Index the column directly instead of materialising it as a list
        # just to read its first element.
        sample_stock = daily_data["ts_code"][0]
        stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort(
            "trade_date"
        )

        # Reference values computed directly with Polars.
        polars_result = stock_df.with_columns(
            pl.col("close")
            .rolling_mean(window_size=period)
            .over("ts_code")
            .alias("ma_polars")
        )

        # Same computation through the factor framework.
        context = FactorContext(current_stock=sample_stock)
        factor_data = FactorData(
            stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context
        )
        factor_result = MAFactor(period=period).compute(factor_data).to_numpy()

        polars_values = polars_result["ma_polars"].to_numpy()

        # Compare only positions where the reference has a value.
        valid_idx = ~np.isnan(polars_values)
        polars_valid = polars_values[valid_idx]
        factor_valid = factor_result[valid_idx]

        # np.max on an empty array raises an opaque ValueError; fail with a
        # clear message instead when there is nothing to compare.
        assert polars_valid.size > 0, (
            f"no valid MA({period}) reference values for {sample_stock}"
        )

        max_diff = np.max(np.abs(polars_valid - factor_valid))

        print(f"\n[时序因子 MA({period}) 对比]")
        print(f" 样本股票: {sample_stock}")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(" 样本数据 (前5个):")
        for expected, actual in zip(polars_valid[:5], factor_valid[:5]):
            print(
                f" Polars: {expected:.6f}, Factor: {actual:.6f}, Diff: {abs(expected - actual):.15f}"
            )

        assert max_diff < 1e-10, f"MA 因子计算差异过大: {max_diff}"
|
||
|
||
|
||
class TestCrossSectionalFactor:
    """Tests for cross-sectional factors."""

    def test_pe_rank_factor(self, daily_data):
        """Check ``PERankFactor`` against a native Polars rank on one date.

        The reference is ``close.rank() / close.count()`` over all rows of
        the sampled trade date; the largest absolute difference must stay
        below 1e-10.

        Args:
            daily_data: Module-scoped Polars DataFrame fixture.
        """
        # unique() gives no ordering guarantee, so sort to make the sampled
        # date the same on every run.
        trade_dates = daily_data["trade_date"].unique().sort().to_list()
        assert trade_dates, "daily_data fixture contains no trade dates"
        # Clamp the index so a short date range does not raise IndexError.
        sample_date = trade_dates[min(50, len(trade_dates) - 1)]
        date_df = daily_data.filter(pl.col("trade_date") == sample_date)

        # Reference values computed directly with Polars.
        polars_result = date_df.with_columns(
            (pl.col("close").rank() / pl.col("close").count()).alias("pe_rank_polars")
        )

        # Same computation through the factor framework.
        context = FactorContext(current_date=str(sample_date))
        factor_data = FactorData(
            date_df.with_columns(
                [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)]
            ),
            context,
        )
        factor_result = PERankFactor().compute(factor_data).to_numpy()

        polars_values = polars_result["pe_rank_polars"].to_numpy()
        max_diff = np.max(np.abs(polars_values - factor_result))

        print("\n[截面因子 PE_Rank 对比]")
        print(f" 样本日期: {sample_date}")
        print(f" 股票数量: {len(polars_values)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(" 样本数据 (前5个):")
        # Convert the code column once rather than on every loop iteration.
        ts_codes = polars_result["ts_code"].to_numpy()
        for ts_code, expected, actual in zip(
            ts_codes[:5], polars_values[:5], factor_result[:5]
        ):
            print(f" {ts_code}: Polars: {expected:.6f}, Factor: {actual:.6f}")

        assert max_diff < 1e-10, f"PE_Rank 因子计算差异过大: {max_diff}"
|
||
|
||
|
||
class TestCompositeFactor:
    """Tests for composite (combined) factors."""

    @staticmethod
    def _first_stock_frame(daily_data):
        """Return ``(ts_code, frame)`` for the first stock, sorted by date."""
        # Index the column directly instead of materialising it as a list.
        sample_stock = daily_data["ts_code"][0]
        stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort(
            "trade_date"
        )
        return sample_stock, stock_df

    def test_scalar_composite(self, daily_data):
        """Check the scalar composite ``0.5 * MA`` against Polars.

        Rows where the Polars reference is NaN are excluded; the largest
        absolute difference must stay below 1e-10.

        Args:
            daily_data: Module-scoped Polars DataFrame fixture.
        """
        period = 5
        sample_stock, stock_df = self._first_stock_frame(daily_data)

        # Reference values computed directly with Polars.
        polars_ma = stock_df.with_columns(
            pl.col("close").rolling_mean(window_size=period).over("ts_code").alias("ma")
        )
        polars_combined = 0.5 * polars_ma["ma"].to_numpy()

        # Same computation through the factor framework.
        context = FactorContext(current_stock=sample_stock)
        factor_data = FactorData(
            stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context
        )

        # Composite factor: 0.5 * MA
        scalar_factor = 0.5 * MAFactor(period=period)
        factor_result = scalar_factor.compute(factor_data).to_numpy()

        # Compare only positions where the reference has a value.
        valid_idx = ~np.isnan(polars_combined)
        polars_valid = polars_combined[valid_idx]
        factor_valid = factor_result[valid_idx]

        # np.max on an empty array raises an opaque ValueError; fail clearly.
        assert polars_valid.size > 0, (
            f"no valid 0.5*MA({period}) reference values for {sample_stock}"
        )

        max_diff = np.max(np.abs(polars_valid - factor_valid))

        print(f"\n[结合因子 0.5*MA({period}) 对比]")
        print(f" 公式: 0.5 * MA({period})")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(" 样本数据 (前5个):")
        for expected, actual in zip(polars_valid[:5], factor_valid[:5]):
            print(
                f" Polars: {expected:.6f}, Factor: {actual:.6f}, Diff: {abs(expected - actual):.15f}"
            )

        assert max_diff < 1e-10, f"组合因子计算差异过大: {max_diff}"

    def test_factor_addition(self, daily_data):
        """Check the additive composite ``MA(5) + MA(10)`` against Polars.

        Positions where either side is NaN are excluded; the largest
        absolute difference over the rest must stay below 1e-10.

        Args:
            daily_data: Module-scoped Polars DataFrame fixture.
        """
        sample_stock, stock_df = self._first_stock_frame(daily_data)
        context = FactorContext(current_stock=sample_stock)

        # Reference values computed directly with Polars.
        polars_ma5 = stock_df.with_columns(
            pl.col("close").rolling_mean(window_size=5).over("ts_code").alias("ma5")
        )
        polars_ma10 = stock_df.with_columns(
            pl.col("close").rolling_mean(window_size=10).over("ts_code").alias("ma10")
        )
        polars_combined = polars_ma5["ma5"].to_numpy() + polars_ma10["ma10"].to_numpy()

        # Same computation through the factor framework.
        factor_data = FactorData(
            stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context
        )
        combined = MAFactor(period=5) + MAFactor(period=10)
        factor_result = combined.compute(factor_data).to_numpy()

        # NOTE(review): this mask also drops rows where only the factor
        # output is NaN, which could hide missing values in the factor
        # result -- confirm whether masking on the reference alone is wanted.
        valid_idx = ~(np.isnan(polars_combined) | np.isnan(factor_result))
        polars_valid = polars_combined[valid_idx]
        factor_valid = factor_result[valid_idx]

        # np.max on an empty array raises an opaque ValueError; fail clearly.
        assert polars_valid.size > 0, (
            f"no comparable MA(5) + MA(10) values for {sample_stock}"
        )

        max_diff = np.max(np.abs(polars_valid - factor_valid))

        print("\n[结合因子 MA(5) + MA(10) 对比]")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")

        assert max_diff < 1e-10, f"因子加法组合差异过大: {max_diff}"
|
||
|
||
|
||
class TestFactorComparison:
    """Summary comparison across several factors."""

    def test_all_factors_summary(self, daily_data):
        """Print a pass/fail summary for MA(5/10/20) and PE_Rank, then assert.

        Each factor is compared with its native Polars equivalent. The
        summary table is printed once (the previous version repeated the
        whole body a second time), and the test fails after printing if any
        factor differs from the reference by 1e-10 or more.

        Args:
            daily_data: Module-scoped Polars DataFrame fixture.
        """
        tolerance = 1e-10
        failures = []

        print("\n" + "=" * 60)
        print("因子测试汇总")
        print("=" * 60)

        # The sample stock and its frame do not depend on the period, so
        # they are computed once outside the loop.
        sample_stock = daily_data["ts_code"][0]
        stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort(
            "trade_date"
        )

        # Several time-series periods.
        for period in [5, 10, 20]:
            polars_result = stock_df.with_columns(
                pl.col("close")
                .rolling_mean(window_size=period)
                .over("ts_code")
                .alias("ma")
            )

            context = FactorContext(current_stock=sample_stock)
            factor_data = FactorData(
                stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context
            )
            factor_result = MAFactor(period=period).compute(factor_data).to_numpy()

            polars_values = polars_result["ma"].to_numpy()
            valid_idx = ~np.isnan(polars_values)
            diff = np.abs(polars_values[valid_idx] - factor_result[valid_idx])
            # An empty comparison is reported as NaN (a failure) instead of
            # letting np.max raise on a zero-size array.
            max_diff = np.max(diff) if diff.size else float("nan")

            passed = max_diff < tolerance
            status = "通过" if passed else "失败"
            print(f" MA({period}): 最大差异 = {max_diff:.2e} {status}")
            if not passed:
                failures.append(f"MA({period})")

        # Cross-sectional factor.
        # unique() gives no ordering guarantee, so sort to make the sampled
        # date the same on every run; clamp the index for short date ranges.
        trade_dates = daily_data["trade_date"].unique().sort().to_list()
        assert trade_dates, "daily_data fixture contains no trade dates"
        sample_date = trade_dates[min(50, len(trade_dates) - 1)]
        date_df = daily_data.filter(pl.col("trade_date") == sample_date)

        polars_result = date_df.with_columns(
            (pl.col("close").rank() / pl.col("close").count()).alias("rank")
        )

        context = FactorContext(current_date=str(sample_date))
        factor_data = FactorData(
            date_df.with_columns(
                [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)]
            ),
            context,
        )
        factor_result = PERankFactor().compute(factor_data).to_numpy()

        polars_values = polars_result["rank"].to_numpy()
        max_diff = np.max(np.abs(polars_values - factor_result))

        passed = max_diff < tolerance
        status = "通过" if passed else "失败"
        print(f" PE_Rank: 最大差异 = {max_diff:.2e} {status}")
        if not passed:
            failures.append("PE_Rank")

        print("=" * 60)

        # Fail only after the full summary has been printed, so the report
        # is complete even when something is wrong.
        assert not failures, f"factors exceeding tolerance {tolerance}: {failures}"
|