"""因子真实数据测试 - 与 Polars 原生计算对比 测试目标: 1. 时序因子 - 移动平均线 (MA) 2. 截面因子 - PE_Rank(市盈率排名) 3. 结合因子 - 时序 * 截面组合 每个因子都与原始 Polars 计算进行对比验证。 """ import pytest import pandas as pd import polars as pl import numpy as np from src.factors import DataSpec, FactorContext, FactorData from src.factors.base import CrossSectionalFactor, TimeSeriesFactor from src.factors.composite import CompositeFactor, ScalarFactor # ========== 测试数据准备 ========== @pytest.fixture(scope="module") def daily_data(): """加载日线测试数据(直接使用 Polars)""" with pd.HDFStore("data/daily.h5", mode="r") as store: df = store["/daily"] # 筛选日期范围 df = df[(df["trade_date"] >= "20240101") & (df["trade_date"] <= "20240430")] # 选择部分股票(取前20个) stocks = df["ts_code"].unique()[:20] df = df[df["ts_code"].isin(stocks)] # 直接返回 Polars DataFrame,不转 pandas pl_df = pl.from_pandas(df) pl_df = pl_df.sort(["ts_code", "trade_date"]) return pl_df # ========== 时序因子定义 ========== class MAFactor(TimeSeriesFactor): """移动平均线因子(时序因子)""" name = "ma_factor" data_specs = [ DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=5) ] def __init__(self, period: int = 5): super().__init__(period=period) def compute(self, data: FactorData) -> pl.Series: close = data.get_column("close") period = self.params["period"] return close.rolling_mean(period) class PERankFactor(CrossSectionalFactor): """PE 市盈率排名因子(截面因子)""" name = "pe_rank_factor" data_specs = [ DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=1) ] def compute(self, data: FactorData) -> pl.Series: cs = data.get_cross_section() close = cs["close"] return close.rank() / close.len() # ========== 测试用例 ========== class TestTimeSeriesFactor: """时序因子测试""" def test_ma_factor(self, daily_data): """测试 MA 因子与 Polars 原生计算对比""" period = 5 sample_stock = daily_data["ts_code"].to_list()[0] stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort( "trade_date" ) # Polars 基准计算 polars_result = stock_df.with_columns( pl.col("close") .rolling_mean(window_size=period) .over("ts_code") .alias("ma_polars") ) # 因子框架计算 context = FactorContext(current_stock=sample_stock) factor_data = FactorData( stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context ) ma_factor = MAFactor(period=period) factor_result = ma_factor.compute(factor_data).to_numpy() # 对比结果 polars_values = polars_result["ma_polars"].to_numpy() # 去除 NaN 后对比 valid_idx = ~np.isnan(polars_values) polars_valid = polars_values[valid_idx] factor_valid = factor_result[valid_idx] diff = np.abs(polars_valid - factor_valid) max_diff = np.max(diff) print(f"\n[时序因子 MA({period}) 对比]") print(f" 样本股票: {sample_stock}") print(f" 有效数据点: {len(polars_valid)}") print(f" 最大差异: {max_diff:.15f}") print(f" 样本数据 (前5个):") for i in range(min(5, len(polars_valid))): print( f" Polars: {polars_valid[i]:.6f}, Factor: {factor_valid[i]:.6f}, Diff: {abs(polars_valid[i] - factor_valid[i]):.15f}" ) assert max_diff < 1e-10, f"MA 因子计算差异过大: {max_diff}" class TestCrossSectionalFactor: """截面因子测试""" def test_pe_rank_factor(self, daily_data): """测试 PE_Rank 因子与 Polars 原生计算对比""" trade_dates = daily_data["trade_date"].unique().to_list() sample_date = trade_dates[50] date_df = daily_data.filter(pl.col("trade_date") == sample_date) # Polars 基准计算 polars_result = date_df.with_columns( (pl.col("close").rank() / pl.col("close").count()).alias("pe_rank_polars") ) # 因子框架计算 context = FactorContext(current_date=str(sample_date)) factor_data = FactorData( date_df.with_columns( [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)] ), context, ) pe_factor = PERankFactor() factor_result = pe_factor.compute(factor_data).to_numpy() # 对比结果 polars_values = polars_result["pe_rank_polars"].to_numpy() diff = np.abs(polars_values - factor_result) max_diff = np.max(diff) print(f"\n[截面因子 PE_Rank 对比]") print(f" 样本日期: {sample_date}") print(f" 股票数量: {len(polars_values)}") print(f" 最大差异: {max_diff:.15f}") print(f" 样本数据 (前5个):") for i in range(min(5, len(polars_values))): ts_code = polars_result["ts_code"].to_numpy()[i] print( f" {ts_code}: Polars: {polars_values[i]:.6f}, Factor: {factor_result[i]:.6f}" ) assert max_diff < 1e-10, f"PE_Rank 因子计算差异过大: {max_diff}" class TestCompositeFactor: """结合因子测试""" def test_scalar_composite(self, daily_data): """测试标量组合因子: 0.5 * MA""" period = 5 sample_stock = daily_data["ts_code"].to_list()[0] stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort( "trade_date" ) # Polars 基准计算 polars_ma = stock_df.with_columns( pl.col("close").rolling_mean(window_size=period).over("ts_code").alias("ma") ) polars_combined = 0.5 * polars_ma["ma"].to_numpy() # 因子框架计算 context = FactorContext(current_stock=sample_stock) factor_data = FactorData( stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context ) # 组合因子: 0.5 * MA ma_factor = MAFactor(period=period) scalar_factor = 0.5 * ma_factor factor_result = scalar_factor.compute(factor_data).to_numpy() # 对比结果 valid_idx = ~np.isnan(polars_combined) polars_valid = polars_combined[valid_idx] factor_valid = factor_result[valid_idx] diff = np.abs(polars_valid - factor_valid) max_diff = np.max(diff) print(f"\n[结合因子 0.5*MA({period}) 对比]") print(f" 公式: 0.5 * MA({period})") print(f" 有效数据点: {len(polars_valid)}") print(f" 最大差异: {max_diff:.15f}") print(f" 样本数据 (前5个):") for i in range(min(5, len(polars_valid))): print( f" Polars: {polars_valid[i]:.6f}, Factor: {factor_valid[i]:.6f}, Diff: {abs(polars_valid[i] - factor_valid[i]):.15f}" ) assert max_diff < 1e-10, f"组合因子计算差异过大: {max_diff}" def test_factor_addition(self, daily_data): """测试因子加法组合: MA(5) + MA(10)""" sample_stock = daily_data["ts_code"].to_list()[0] stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort( "trade_date" ) context = FactorContext(current_stock=sample_stock) # Polars 基准计算 polars_ma5 = stock_df.with_columns( pl.col("close").rolling_mean(window_size=5).over("ts_code").alias("ma5") ) polars_ma10 = stock_df.with_columns( pl.col("close").rolling_mean(window_size=10).over("ts_code").alias("ma10") ) polars_combined = polars_ma5["ma5"].to_numpy() + polars_ma10["ma10"].to_numpy() # 因子框架计算 factor_data = FactorData( stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context ) ma5 = MAFactor(period=5) ma10 = MAFactor(period=10) combined = ma5 + ma10 factor_result = combined.compute(factor_data).to_numpy() # 对比结果 valid_idx = ~(np.isnan(polars_combined) | np.isnan(factor_result)) polars_valid = polars_combined[valid_idx] factor_valid = factor_result[valid_idx] diff = np.abs(polars_valid - factor_valid) max_diff = np.max(diff) print(f"\n[结合因子 MA(5) + MA(10) 对比]") print(f" 有效数据点: {len(polars_valid)}") print(f" 最大差异: {max_diff:.15f}") assert max_diff < 1e-10, f"因子加法组合差异过大: {max_diff}" class TestFactorComparison: """全面对比测试""" def test_all_factors_summary(self, daily_data): """汇总所有因子测试结果""" print("\n" + "=" * 60) print("因子测试汇总") print("=" * 60) # 测试多个时序周期 for period in [5, 10, 20]: sample_stock = daily_data["ts_code"].to_list()[0] stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort( "trade_date" ) polars_result = stock_df.with_columns( pl.col("close") .rolling_mean(window_size=period) .over("ts_code") .alias("ma") ) context = FactorContext(current_stock=sample_stock) factor_data = FactorData( stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context ) ma_factor = MAFactor(period=period) factor_result = ma_factor.compute(factor_data).to_numpy() polars_values = polars_result["ma"].to_numpy() valid_idx = ~np.isnan(polars_values) diff = np.abs(polars_values[valid_idx] - factor_result[valid_idx]) max_diff = np.max(diff) status = "通过" if max_diff < 1e-10 else "失败" print(f" MA({period}): 最大差异 = {max_diff:.2e} {status}") # 测试截面因子 trade_dates = daily_data["trade_date"].unique().to_list() sample_date = trade_dates[50] date_df = daily_data.filter(pl.col("trade_date") == sample_date) polars_result = date_df.with_columns( (pl.col("close").rank() / pl.col("close").count()).alias("rank") ) context = FactorContext(current_date=str(sample_date)) factor_data = FactorData( date_df.with_columns( [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)] ), context, ) pe_factor = PERankFactor() factor_result = pe_factor.compute(factor_data).to_numpy() polars_values = polars_result["rank"].to_numpy() diff = np.abs(polars_values - factor_result) max_diff = np.max(diff) status = "通过" if max_diff < 1e-10 else "失败" print(f" PE_Rank: 最大差异 = {max_diff:.2e} {status}") print("=" * 60) # 测试多个时序周期 for period in [5, 10, 20]: sample_stock = daily_data["ts_code"].to_list()[0] stock_df = daily_data.filter(pl.col("ts_code") == sample_stock).sort( "trade_date" ) polars_result = stock_df.with_columns( pl.col("close") .rolling_mean(window_size=period) .over("ts_code") .alias("ma") ) context = FactorContext(current_stock=sample_stock) factor_data = FactorData( stock_df.with_columns([pl.col("trade_date").cast(pl.Utf8)]), context ) ma_factor = MAFactor(period=period) factor_result = ma_factor.compute(factor_data).to_numpy() polars_values = polars_result["ma"].to_numpy() valid_idx = ~np.isnan(polars_values) diff = np.abs(polars_values[valid_idx] - factor_result[valid_idx]) max_diff = np.max(diff) status = "通过" if max_diff < 1e-10 else "失败" print(f" MA({period}): 最大差异 = {max_diff:.2e} {status}") # 测试截面因子 trade_dates = daily_data["trade_date"].unique().to_list() sample_date = trade_dates[50] date_df = daily_data.filter(pl.col("trade_date") == sample_date) polars_result = date_df.with_columns( (pl.col("close").rank() / pl.col("close").count()).alias("rank") ) context = FactorContext(current_date=str(sample_date)) factor_data = FactorData( date_df.with_columns( [pl.col("trade_date").cast(pl.Utf8), pl.col("ts_code").cast(pl.Utf8)] ), context, ) pe_factor = PERankFactor() factor_result = pe_factor.compute(factor_data).to_numpy() polars_values = polars_result["rank"].to_numpy() diff = np.abs(polars_values - factor_result) max_diff = np.max(diff) status = "通过" if max_diff < 1e-10 else "失败" print(f" PE_Rank: 最大差异 = {max_diff:.2e} {status}") print("=" * 60)