feat(factors): 添加因子计算框架
- 新增因子基类 (BaseFactor, CrossSectionalFactor, TimeSeriesFactor) - 新增数据规格和上下文类 (DataSpec, FactorContext, FactorData) - 新增数据加载器 (DataLoader) 和执行引擎 (FactorEngine) - 新增组合因子支持 (CompositeFactor, ScalarFactor) - 添加因子模块完整测试用例 - 添加 Git 提交规范文档
This commit is contained in:
397
tests/factors/test_factor_validation.py
Normal file
397
tests/factors/test_factor_validation.py
Normal file
@@ -0,0 +1,397 @@
|
||||
"""因子真实数据测试 - 与 Polars 原生计算对比
|
||||
|
||||
测试目标:
|
||||
1. 时序因子 - 移动平均线 (MA)
|
||||
2. 截面因子 - PE_Rank(截面排名;当前实现以收盘价 close 代替市盈率)
|
||||
3. 结合因子 - 时序 * 截面组合
|
||||
|
||||
每个因子都与原始 Polars 计算进行对比验证。
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
from src.factors import DataSpec, FactorContext, FactorData
|
||||
from src.factors.base import CrossSectionalFactor, TimeSeriesFactor
|
||||
from src.factors.composite import CompositeFactor, ScalarFactor
|
||||
|
||||
|
||||
# ========== 测试数据准备 ==========
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def daily_data():
    """Load the daily-bar sample shared by the tests in this module.

    The table stored under the ``/daily`` key of ``data/daily.h5`` is read with
    pandas, restricted to rows whose ``trade_date`` lies between "20240101" and
    "20240430" (both bounds included), narrowed to the first 20 values returned
    by ``unique()`` on ``ts_code``, and finally converted to a Polars DataFrame
    sorted by ``ts_code`` and ``trade_date``.

    Returns:
        The filtered, sorted Polars DataFrame.
    """
    with pd.HDFStore("data/daily.h5", mode="r") as h5:
        raw = h5["/daily"]

    # Keep only the rows inside the test window.
    on_or_after_start = raw["trade_date"] >= "20240101"
    on_or_before_end = raw["trade_date"] <= "20240430"
    windowed = raw[on_or_after_start & on_or_before_end]

    # Work on a small universe: the first 20 distinct stock codes.
    universe = windowed["ts_code"].unique()[:20]
    subset = windowed[windowed["ts_code"].isin(universe)]

    # Convert to Polars and fix the row order.
    return pl.from_pandas(subset).sort(["ts_code", "trade_date"])
|
||||
|
||||
|
||||
# ========== 时序因子定义 ==========
|
||||
|
||||
|
||||
class MAFactor(TimeSeriesFactor):
    """Moving-average factor (time-series factor).

    ``compute`` returns the rolling mean of the ``close`` column over
    ``period`` rows.
    """

    name = "ma_factor"
    # NOTE(review): lookback_days is fixed at 5 while ``period`` is
    # configurable -- confirm whether the loader needs a longer lookback
    # when period > 5.
    data_specs = [
        DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=5),
    ]

    def __init__(self, period: int = 5):
        # The window length is handed to the base class as a keyword
        # parameter; compute() reads it back through self.params.
        super().__init__(period=period)

    def compute(self, data: FactorData) -> pl.Series:
        """Return the rolling mean of ``close`` over ``period`` rows."""
        prices = data.get_column("close")
        return prices.rolling_mean(self.params["period"])
|
||||
|
||||
|
||||
class PERankFactor(CrossSectionalFactor):
    """PE rank factor (cross-sectional factor).

    NOTE(review): despite the name, this implementation ranks the ``close``
    column of the cross-section -- confirm whether a P/E column was intended.
    """

    name = "pe_rank_factor"
    data_specs = [
        DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=1),
    ]

    def compute(self, data: FactorData) -> pl.Series:
        """Return the rank of ``close`` divided by the number of rows."""
        snapshot = data.get_cross_section()
        prices = snapshot["close"]
        ranks = prices.rank()
        return ranks / prices.len()
|
||||
|
||||
|
||||
# ========== 测试用例 ==========
|
||||
|
||||
|
||||
class TestTimeSeriesFactor:
    """Time-series factor tests."""

    def test_ma_factor(self, daily_data):
        """Check MAFactor against a rolling mean computed directly in Polars."""
        period = 5
        sample_stock = daily_data["ts_code"].to_list()[0]
        only_sample = pl.col("ts_code") == sample_stock
        stock_df = daily_data.filter(only_sample).sort("trade_date")

        # Reference values straight from Polars.
        ma_expr = pl.col("close").rolling_mean(window_size=period).over("ts_code")
        reference_df = stock_df.with_columns(ma_expr.alias("ma_polars"))

        # Same series through the factor framework (trade_date passed as text).
        frame = stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)])
        factor_data = FactorData(frame, FactorContext(current_stock=sample_stock))
        actual = MAFactor(period=period).compute(factor_data).to_numpy()

        expected = reference_df["ma_polars"].to_numpy()

        # Compare only the positions where the reference is not NaN.
        defined = ~np.isnan(expected)
        polars_valid = expected[defined]
        factor_valid = actual[defined]
        max_diff = np.abs(polars_valid - factor_valid).max()

        print(f"\n[时序因子 MA({period}) 对比]")
        print(f" 样本股票: {sample_stock}")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(" 样本数据 (前5个):")
        for ref, got in zip(polars_valid[:5], factor_valid[:5]):
            print(
                f" Polars: {ref:.6f}, Factor: {got:.6f}, Diff: {abs(ref - got):.15f}"
            )

        assert max_diff < 1e-10, f"MA 因子计算差异过大: {max_diff}"
|
||||
|
||||
|
||||
class TestCrossSectionalFactor:
    """Cross-sectional factor tests."""

    def test_pe_rank_factor(self, daily_data):
        """Check PERankFactor against a rank computed directly in Polars.

        One trade date is sampled from the fixture, the reference rank ratio
        is computed with Polars expressions, and the factor output for the
        same rows must agree to within 1e-10.
        """
        # unique() makes no ordering promise, so sort before indexing;
        # otherwise index 50 could select a different date on every run.
        trade_dates = daily_data["trade_date"].unique().sort().to_list()
        sample_date = trade_dates[50]
        date_df = daily_data.filter(pl.col("trade_date") == sample_date)

        # Reference values straight from Polars.
        polars_result = date_df.with_columns(
            (pl.col("close").rank() / pl.col("close").count()).alias("pe_rank_polars")
        )

        # Same cross-section through the factor framework (keys passed as text).
        context = FactorContext(current_date=str(sample_date))
        factor_data = FactorData(
            date_df.with_columns(
                [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)]
            ),
            context,
        )

        pe_factor = PERankFactor()
        factor_result = pe_factor.compute(factor_data).to_numpy()

        polars_values = polars_result["pe_rank_polars"].to_numpy()

        diff = np.abs(polars_values - factor_result)
        max_diff = np.max(diff)

        print(f"\n[截面因子 PE_Rank 对比]")
        print(f" 样本日期: {sample_date}")
        print(f" 股票数量: {len(polars_values)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(f" 样本数据 (前5个):")
        # Convert the code column once instead of once per printed row.
        ts_codes = polars_result["ts_code"].to_numpy()
        for ts_code, expected, actual in zip(
            ts_codes[:5], polars_values[:5], factor_result[:5]
        ):
            print(f" {ts_code}: Polars: {expected:.6f}, Factor: {actual:.6f}")

        assert max_diff < 1e-10, f"PE_Rank 因子计算差异过大: {max_diff}"
|
||||
|
||||
|
||||
class TestCompositeFactor:
    """Composite factor tests."""

    def test_scalar_composite(self, daily_data):
        """Check the scalar composite ``0.5 * MA`` against Polars."""
        period = 5
        sample_stock = daily_data["ts_code"].to_list()[0]
        only_sample = pl.col("ts_code") == sample_stock
        stock_df = daily_data.filter(only_sample).sort("trade_date")

        # Reference values straight from Polars.
        ma_expr = pl.col("close").rolling_mean(window_size=period).over("ts_code")
        reference_df = stock_df.with_columns(ma_expr.alias("ma"))
        expected = 0.5 * reference_df["ma"].to_numpy()

        # Same series through the factor framework.
        context = FactorContext(current_stock=sample_stock)
        frame = stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)])
        factor_data = FactorData(frame, context)

        # Composite under test: scalar times factor.
        half_ma = 0.5 * MAFactor(period=period)
        actual = half_ma.compute(factor_data).to_numpy()

        # Compare only the positions where the reference is not NaN.
        defined = ~np.isnan(expected)
        polars_valid = expected[defined]
        factor_valid = actual[defined]
        max_diff = np.abs(polars_valid - factor_valid).max()

        print(f"\n[结合因子 0.5*MA({period}) 对比]")
        print(f" 公式: 0.5 * MA({period})")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")
        print(" 样本数据 (前5个):")
        for ref, got in zip(polars_valid[:5], factor_valid[:5]):
            print(
                f" Polars: {ref:.6f}, Factor: {got:.6f}, Diff: {abs(ref - got):.15f}"
            )

        assert max_diff < 1e-10, f"组合因子计算差异过大: {max_diff}"

    def test_factor_addition(self, daily_data):
        """Check the additive composite ``MA(5) + MA(10)`` against Polars."""
        sample_stock = daily_data["ts_code"].to_list()[0]
        only_sample = pl.col("ts_code") == sample_stock
        stock_df = daily_data.filter(only_sample).sort("trade_date")

        context = FactorContext(current_stock=sample_stock)

        # Reference values straight from Polars, one frame per window.
        ma5_df = stock_df.with_columns(
            pl.col("close").rolling_mean(window_size=5).over("ts_code").alias("ma5")
        )
        ma10_df = stock_df.with_columns(
            pl.col("close").rolling_mean(window_size=10).over("ts_code").alias("ma10")
        )
        expected = ma5_df["ma5"].to_numpy() + ma10_df["ma10"].to_numpy()

        # Same series through the factor framework.
        frame = stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)])
        factor_data = FactorData(frame, context)

        summed = MAFactor(period=5) + MAFactor(period=10)
        actual = summed.compute(factor_data).to_numpy()

        # NOTE(review): unlike the other tests, NaNs in the factor output are
        # masked out here as well -- confirm this is intentional, since it
        # would also hide NaNs produced only by the framework.
        defined = ~np.isnan(expected) & ~np.isnan(actual)
        polars_valid = expected[defined]
        factor_valid = actual[defined]
        max_diff = np.abs(polars_valid - factor_valid).max()

        print("\n[结合因子 MA(5) + MA(10) 对比]")
        print(f" 有效数据点: {len(polars_valid)}")
        print(f" 最大差异: {max_diff:.15f}")

        assert max_diff < 1e-10, f"因子加法组合差异过大: {max_diff}"
|
||||
|
||||
|
||||
class TestFactorComparison:
    """Summary comparison across all factors."""

    def test_all_factors_summary(self, daily_data):
        """Print a pass/fail summary for every factor and fail on any mismatch.

        MA factors with windows 5, 10 and 20 are compared on one sample stock,
        and the rank factor is compared on one sample trade date. Each check
        prints its maximum absolute difference from the Polars reference; the
        test fails at the end if any difference is not below the tolerance,
        so the full summary is always printed first.
        """
        tolerance = 1e-10
        failures = []

        print("\n" + "=" * 60)
        print("因子测试汇总")
        print("=" * 60)

        # The sample stock and its rows do not depend on the window, so
        # select them once rather than once per period.
        sample_stock = daily_data["ts_code"].to_list()[0]
        stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort(
            "trade_date"
        )

        # Time-series factor over several windows.
        for period in [5, 10, 20]:
            polars_result = stock_df.with_columns(
                pl.col("close")
                .rolling_mean(window_size=period)
                .over("ts_code")
                .alias("ma")
            )

            context = FactorContext(current_stock=sample_stock)
            factor_data = FactorData(
                stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context
            )

            ma_factor = MAFactor(period=period)
            factor_result = ma_factor.compute(factor_data).to_numpy()

            polars_values = polars_result["ma"].to_numpy()
            # Compare only the positions where the reference is not NaN.
            valid_idx = ~np.isnan(polars_values)

            diff = np.abs(polars_values[valid_idx] - factor_result[valid_idx])
            max_diff = np.max(diff)

            # A NaN difference compares False here and is reported as a failure.
            passed = max_diff < tolerance
            status = "通过" if passed else "失败"
            print(f" MA({period}): 最大差异 = {max_diff:.2e} {status}")
            if not passed:
                failures.append(f"MA({period}): {max_diff}")

        # Cross-sectional factor.
        # unique() makes no ordering promise, so sort before indexing;
        # otherwise index 50 could select a different date on every run.
        trade_dates = daily_data["trade_date"].unique().sort().to_list()
        sample_date = trade_dates[50]
        date_df = daily_data.filter(pl.col("trade_date") == sample_date)

        polars_result = date_df.with_columns(
            (pl.col("close").rank() / pl.col("close").count()).alias("rank")
        )

        context = FactorContext(current_date=str(sample_date))
        factor_data = FactorData(
            date_df.with_columns(
                [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)]
            ),
            context,
        )

        pe_factor = PERankFactor()
        factor_result = pe_factor.compute(factor_data).to_numpy()

        polars_values = polars_result["rank"].to_numpy()
        diff = np.abs(polars_values - factor_result)
        max_diff = np.max(diff)

        passed = max_diff < tolerance
        status = "通过" if passed else "失败"
        print(f" PE_Rank: 最大差异 = {max_diff:.2e} {status}")
        if not passed:
            failures.append(f"PE_Rank: {max_diff}")

        print("=" * 60)

        # Previously the summary only printed its verdicts, so the test passed
        # even when a check reported a failure.
        assert not failures, f"因子汇总对比差异过大: {failures}"
|
||||
Reference in New Issue
Block a user