refactor: 代码审查修复 - 日期过滤、性能优化、数据泄露防护
- 修复 data_loader.py 财务数据日期过滤,支持按范围加载 - 优化 MADClipper 使用窗口函数替代 join,提升性能 - 修复训练日期边界问题,添加1天间隔避免数据泄露 - 新增 .gitignore 规则忽略训练输出目录
This commit is contained in:
1535
src/factors/FACTOR_GUIDE.md
Normal file
1535
src/factors/FACTOR_GUIDE.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -18,6 +18,52 @@
|
||||
- CompositeFactor: 组合因子
|
||||
- ScalarFactor: 标量运算因子
|
||||
|
||||
因子分类目录:
|
||||
- momentum/: 动量因子(MA、收益率排名等)
|
||||
- financial/: 财务因子(EPS、ROE等)
|
||||
- valuation/: 估值因子(PE、PB、PS等)
|
||||
- technical/: 技术指标因子(RSI、MACD、布林带等)
|
||||
- quality/: 质量因子(盈利能力、稳定性等)
|
||||
- sentiment/: 情绪因子(换手率、资金流向等)
|
||||
- volume/: 成交量因子(OBV、成交量比率等)
|
||||
- volatility/: 波动率因子(历史波动率、GARCH等)
|
||||
|
||||
数据加载和执行(Phase 3-4):
|
||||
- DataLoader: 数据加载器
|
||||
- FactorEngine: 因子执行引擎
|
||||
|
||||
使用示例:
|
||||
# 使用通用因子(参数化)
|
||||
from src.factors import MovingAverageFactor, ReturnRankFactor
|
||||
from src.factors import DataLoader, FactorEngine
|
||||
|
||||
ma5 = MovingAverageFactor(period=5) # 5日MA
|
||||
ma10 = MovingAverageFactor(period=10) # 10日MA
|
||||
ret5 = ReturnRankFactor(period=5) # 5日收益率排名
|
||||
|
||||
loader = DataLoader(data_dir="data")
|
||||
engine = FactorEngine(loader)
|
||||
result = engine.compute(ma5, stock_codes=["000001.SZ"], start_date="20240101", end_date="20240131")
|
||||
"""
|
||||
|
||||
因子框架提供以下核心功能:
|
||||
1. 类型安全的因子定义(截面因子、时序因子)
|
||||
2. 数据泄露防护机制
|
||||
3. 因子组合和运算
|
||||
4. 高效的数据加载和计算引擎
|
||||
|
||||
基础数据类型(Phase 1):
|
||||
- DataSpec: 数据需求规格
|
||||
- FactorContext: 计算上下文
|
||||
- FactorData: 数据容器
|
||||
|
||||
因子基类(Phase 2):
|
||||
- BaseFactor: 抽象基类
|
||||
- CrossSectionalFactor: 日期截面因子基类
|
||||
- TimeSeriesFactor: 时间序列因子基类
|
||||
- CompositeFactor: 组合因子
|
||||
- ScalarFactor: 标量运算因子
|
||||
|
||||
动量因子(momentum/):
|
||||
- MovingAverageFactor: 移动平均线(时序因子)
|
||||
- ReturnRankFactor: 收益率排名(截面因子)
|
||||
|
||||
@@ -72,8 +72,8 @@ class DataLoader:
|
||||
if cache_key in self._cache:
|
||||
df = self._cache[cache_key]
|
||||
else:
|
||||
# 读取 H5 文件
|
||||
df = self._read_h5(spec.source)
|
||||
# 读取 H5 文件(传入日期范围以支持过滤)
|
||||
df = self._read_h5(spec.source, date_range=date_range)
|
||||
|
||||
# 列选择 - 只保留需要的列
|
||||
missing_cols = set(spec.columns) - set(df.columns)
|
||||
@@ -107,7 +107,11 @@ class DataLoader:
|
||||
"""清空缓存"""
|
||||
self._cache.clear()
|
||||
|
||||
def _read_h5(self, source: str) -> pl.DataFrame:
|
||||
def _read_h5(
|
||||
self,
|
||||
source: str,
|
||||
date_range: Optional[Tuple[str, str]] = None,
|
||||
) -> pl.DataFrame:
|
||||
"""读取数据 - 从 DuckDB 加载为 Polars DataFrame。
|
||||
|
||||
迁移说明:
|
||||
@@ -117,6 +121,7 @@ class DataLoader:
|
||||
|
||||
Args:
|
||||
source: 表名(对应 DuckDB 中的表,如 "daily")
|
||||
date_range: 日期范围限制 (start_date, end_date),可选
|
||||
|
||||
Returns:
|
||||
Polars DataFrame
|
||||
@@ -125,11 +130,38 @@ class DataLoader:
|
||||
Exception: 数据库查询错误
|
||||
"""
|
||||
from src.data.storage import Storage
|
||||
from src.data.api_wrappers.api_trade_cal import get_trading_days
|
||||
from src.data.utils import get_today_date
|
||||
from src.factors.financial.utils import expand_period_to_trading_days
|
||||
|
||||
storage = Storage()
|
||||
|
||||
# 如果 DataLoader 有 date_range,传递给 Storage 进行过滤
|
||||
# 实现查询下推,只加载必要数据
|
||||
# 特殊处理财务数据:将报告期展开到交易日
|
||||
if source == "financial_income":
|
||||
# 确定日期范围
|
||||
start_date = date_range[0] if date_range else "20180101"
|
||||
end_date = date_range[1] if date_range else get_today_date()
|
||||
|
||||
# 1. 加载原始财务数据(报告期粒度),按日期范围过滤
|
||||
# 注意:financial_income 使用 end_date 字段作为报告期
|
||||
df = storage.load_polars(
|
||||
"financial_income",
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
)
|
||||
|
||||
if len(df) == 0:
|
||||
return pl.DataFrame()
|
||||
|
||||
# 2. 获取交易日历(从2018年开始到当前,确保有足够的历史数据用于前向填充)
|
||||
# 需要从数据的最小日期开始,确保能获取到足够的交易日
|
||||
trade_start = "20180101" if start_date > "20180101" else start_date
|
||||
trade_dates = get_trading_days(trade_start, get_today_date())
|
||||
|
||||
# 3. 展开到交易日(前向填充)
|
||||
return expand_period_to_trading_days(df, trade_dates)
|
||||
|
||||
# 其他数据源保持原有逻辑
|
||||
return storage.load_polars(source)
|
||||
|
||||
def _merge_dataframes(self, dataframes: List[pl.DataFrame]) -> pl.DataFrame:
|
||||
|
||||
@@ -4,7 +4,10 @@
|
||||
|
||||
因子分类:
|
||||
- financial: 财务因子
|
||||
- (待添加)
|
||||
- EPSFactor: 每股收益排名因子
|
||||
|
||||
已添加因子:
|
||||
- EPSFactor: 每股收益排名(基于basic_eps)
|
||||
|
||||
待添加因子:
|
||||
- PERankFactor: 市盈率排名
|
||||
@@ -12,4 +15,6 @@
|
||||
- DividendFactor: 股息率因子
|
||||
"""
|
||||
|
||||
__all__ = []
|
||||
from src.factors.financial.eps_factor import EPSFactor
|
||||
|
||||
__all__ = ["EPSFactor"]
|
||||
|
||||
66
src/factors/financial/eps_factor.py
Normal file
66
src/factors/financial/eps_factor.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""EPS因子
|
||||
|
||||
每股收益(EPS)排名因子实现
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
import polars as pl
|
||||
|
||||
from src.factors.base import CrossSectionalFactor
|
||||
from src.factors.data_spec import DataSpec, FactorData
|
||||
|
||||
|
||||
class EPSFactor(CrossSectionalFactor):
    """Cross-sectional rank factor based on earnings per share (EPS).

    On each date, the ``basic_eps`` values of all stocks in the cross-section
    are ranked (ties get the average rank) and divided by the number of
    stocks, so the result lies in (0, 1].

    NOTE(review): the ``financial_income`` source is presumably already
    expanded to one row per trading day carrying the latest announced report;
    confirm against the data loader.

    Attributes:
        name: Factor name, "eps_rank".
        category: Factor category, "financial".
        description: Human-readable description of the factor.
        data_specs: Data requirements (``ts_code``, ``trade_date`` and
            ``basic_eps`` from ``financial_income`` with a 1-day lookback).

    Example:
        >>> from src.factors import FactorEngine, DataLoader
        >>> from src.factors.financial.eps_factor import EPSFactor
        >>> loader = DataLoader('data')
        >>> engine = FactorEngine(loader)
        >>> eps_factor = EPSFactor()
        >>> result = engine.compute(eps_factor, start_date='20210101', end_date='20210131')
    """

    name: str = "eps_rank"
    category: str = "financial"
    description: str = "每股收益截面排名因子"
    data_specs: List[DataSpec] = [
        DataSpec(
            "financial_income", ["ts_code", "trade_date", "basic_eps"], lookback_days=1
        )
    ]

    def compute(self, data: FactorData) -> pl.Series:
        """Compute the EPS rank for the current cross-section.

        Args:
            data: FactorData holding the cross-sectional data of the current
                date.

        Returns:
            A Series named ``self.name``. When there are at least two stocks
            with differing EPS it holds ``rank / n`` (values in (0, 1]);
            otherwise every entry is 0.5. An empty cross-section yields an
            empty Float64 Series.
        """
        cs = data.get_cross_section()

        if len(cs) == 0:
            # Explicit dtype so the empty result has the same type as the
            # non-empty ones instead of polars' Null dtype.
            return pl.Series(name=self.name, values=[], dtype=pl.Float64)

        # Missing EPS is treated as 0 so every stock still receives a rank.
        eps = cs["basic_eps"].fill_null(0)

        if len(eps) > 1 and eps.max() != eps.min():
            # Rename the result: without the alias the Series would keep the
            # input column name "basic_eps" while the other branches use
            # self.name.
            return (eps.rank(method="average") / len(eps)).alias(self.name)

        # A single stock or all-equal values carry no ranking information;
        # return the neutral value 0.5 for every stock.
        return pl.Series(name=self.name, values=[0.5] * len(eps))
|
||||
82
src/factors/financial/utils.py
Normal file
82
src/factors/financial/utils.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""财务因子工具函数
|
||||
|
||||
提供财务数据处理的工具函数:
|
||||
- expand_period_to_trading_days: 将报告期数据展开到每个交易日(前向填充)
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
import polars as pl
|
||||
|
||||
|
||||
def expand_period_to_trading_days(
    financial_df: pl.DataFrame,
    trade_dates: List[str],
) -> pl.DataFrame:
    """Expand report-period financial data to one row per trading day.

    For every stock and every trading day, the row with the latest report
    period that satisfies both ``end_date <= trade_date`` and
    ``ann_date <= trade_date`` is selected (forward fill). For example, if the
    2020 annual report (end_date 20201231) is announced on 20210428, every
    trading day from 20210428 onward uses that report until a newer report
    has been announced.

    When several rows share the same ``end_date`` (e.g. a report announced
    more than once), the row with the latest ``ann_date`` not later than the
    trading day is used, so the selection is deterministic.

    Args:
        financial_df: Financial data containing at least ``ts_code``,
            ``ann_date`` and ``end_date`` plus the financial fields. Dates
            are compared against ``trade_dates`` values as-is (YYYYMMDD
            strings in the example below).
        trade_dates: Trading days in YYYYMMDD format; output rows of each
            stock follow the order of this list.

    Returns:
        DataFrame with all columns of ``financial_df`` plus ``trade_date``.
        Trading days on which a stock has no announced report yet produce no
        row. An empty, column-less DataFrame is returned when the input is
        empty or nothing matches.

    Example:
        >>> financial_df = pl.DataFrame({
        ...     'ts_code': ['000001.SZ'],
        ...     'ann_date': ['20210428'],
        ...     'end_date': ['20210331'],
        ...     'basic_eps': [0.5]
        ... })
        >>> trade_dates = ['20210428', '20210429', '20210430']
        >>> result = expand_period_to_trading_days(financial_df, trade_dates)
        >>> print(result)
        shape: (3, 5)
        ┌───────────┬───────────┬────────────┬────────────┬───────────┐
        │ ts_code   ┆ ann_date  ┆ end_date   ┆ basic_eps  ┆ trade_date│
        │ ---       ┆ ---       ┆ ---        ┆ ---        ┆ ---       │
        │ str       ┆ str       ┆ str        ┆ f64        ┆ str       │
        ╞═══════════╪═══════════╪════════════╪════════════╪═══════════╡
        │ 000001.SZ ┆ 20210428  ┆ 20210331   ┆ 0.5        ┆ 20210428  │
        │ 000001.SZ ┆ 20210428  ┆ 20210331   ┆ 0.5        ┆ 20210429  │
        │ 000001.SZ ┆ 20210428  ┆ 20210331   ┆ 0.5        ┆ 20210430  │
        └───────────┴───────────┴────────────┴────────────┴───────────┘
    """
    if len(financial_df) == 0:
        return pl.DataFrame()

    # Rows without a stock code can never be attributed to a stock; drop them
    # up front so they do not form a group of their own.
    known = financial_df.filter(pl.col("ts_code").is_not_null())

    expanded = []

    # Split once per stock instead of re-filtering the full frame for every
    # stock code.
    for stock_data in known.partition_by("ts_code", maintain_order=True):
        # Sort by report period, then announcement date, so that tail(1)
        # below deterministically picks the most recently announced row of
        # the latest applicable period.
        stock_data = stock_data.sort(["end_date", "ann_date"])

        for trade_date in trade_dates:
            # Condition 1: the report period is not after the trading day.
            # Condition 2: the report has already been announced.
            applicable = stock_data.filter(
                (pl.col("end_date") <= trade_date)
                & (pl.col("ann_date") <= trade_date)
            )

            if len(applicable) > 0:
                expanded.append(
                    applicable.tail(1).with_columns(
                        pl.lit(trade_date).alias("trade_date")
                    )
                )

    if expanded:
        return pl.concat(expanded)
    return pl.DataFrame()
|
||||
20
src/factors/quality/__init__.py
Normal file
20
src/factors/quality/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""质量因子模块
|
||||
|
||||
本模块提供质量类因子:
|
||||
- 盈利能力:ROE、ROA、毛利率、净利率
|
||||
- 盈利稳定性:盈利波动率、盈利持续性
|
||||
- 财务健康度:资产负债率、流动比率等
|
||||
|
||||
使用示例:
|
||||
>>> from src.factors.quality import ROEFactor
|
||||
>>> factor = ROEFactor()
|
||||
"""
|
||||
|
||||
# 在此处导入具体的质量因子
|
||||
# from .roe import ROEFactor
|
||||
# from .roa import ROAFactor
|
||||
# from .profit_stability import ProfitStabilityFactor
|
||||
|
||||
__all__ = [
|
||||
# 添加你的质量因子
|
||||
]
|
||||
20
src/factors/sentiment/__init__.py
Normal file
20
src/factors/sentiment/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""情绪因子模块
|
||||
|
||||
本模块提供市场情绪类因子:
|
||||
- 换手率、换手率变化率
|
||||
- 资金流向、主力净流入
|
||||
- 波动率、振幅等
|
||||
|
||||
使用示例:
|
||||
>>> from src.factors.sentiment import TurnoverFactor
|
||||
>>> factor = TurnoverFactor(period=20)
|
||||
"""
|
||||
|
||||
# 在此处导入具体的情绪因子
|
||||
# from .turnover import TurnoverFactor
|
||||
# from .money_flow import MoneyFlowFactor
|
||||
# from .amplitude import AmplitudeFactor
|
||||
|
||||
__all__ = [
|
||||
# 添加你的情绪因子
|
||||
]
|
||||
20
src/factors/technical/__init__.py
Normal file
20
src/factors/technical/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""技术指标因子模块
|
||||
|
||||
本模块提供技术分析类因子:
|
||||
- 移动平均线(MA)、指数移动平均(EMA)
|
||||
- 相对强弱指标(RSI)、MACD、KDJ
|
||||
- 布林带(Bollinger Bands)等
|
||||
|
||||
使用示例:
|
||||
>>> from src.factors.technical import RSIFactor
|
||||
>>> factor = RSIFactor(period=14)
|
||||
"""
|
||||
|
||||
# 在此处导入具体的技术指标因子
|
||||
# from .rsi import RSIFactor
|
||||
# from .macd import MACDFactor
|
||||
# from .bollinger import BollingerFactor
|
||||
|
||||
__all__ = [
|
||||
# 添加你的技术指标因子
|
||||
]
|
||||
18
src/factors/valuation/__init__.py
Normal file
18
src/factors/valuation/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""估值因子模块
|
||||
|
||||
本模块提供估值类因子:
|
||||
- 市盈率(PE)、市净率(PB)、市销率(PS)等估值指标
|
||||
- 估值排名、估值分位数等衍生因子
|
||||
|
||||
使用示例:
|
||||
>>> from src.factors.valuation import PERankFactor
|
||||
>>> factor = PERankFactor()
|
||||
"""
|
||||
|
||||
# 在此处导入具体的估值因子
|
||||
# from .pe_rank import PERankFactor
|
||||
# from .pb_rank import PBRankFactor
|
||||
|
||||
__all__ = [
|
||||
# 添加你的估值因子
|
||||
]
|
||||
21
src/factors/volatility/__init__.py
Normal file
21
src/factors/volatility/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""波动率因子模块
|
||||
|
||||
本模块提供波动率相关因子:
|
||||
- 历史波动率(Historical Volatility)
|
||||
- 实现波动率(Realized Volatility)
|
||||
- GARCH类波动率预测
|
||||
- 波动率风险指标等
|
||||
|
||||
使用示例:
|
||||
>>> from src.factors.volatility import HistoricalVolFactor
|
||||
>>> factor = HistoricalVolFactor(period=20)
|
||||
"""
|
||||
|
||||
# 在此处导入具体的波动率因子
|
||||
# from .historical_vol import HistoricalVolFactor
|
||||
# from .realized_vol import RealizedVolFactor
|
||||
# from .garch_vol import GARCHVolFactor
|
||||
|
||||
__all__ = [
|
||||
# 添加你的波动率因子
|
||||
]
|
||||
20
src/factors/volume/__init__.py
Normal file
20
src/factors/volume/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""成交量因子模块
|
||||
|
||||
本模块提供成交量相关因子:
|
||||
- 成交量移动平均
|
||||
- 成交量比率(VR)、能量潮(OBV)
|
||||
- 量价配合指标等
|
||||
|
||||
使用示例:
|
||||
>>> from src.factors.volume import OBVFactor
|
||||
>>> factor = OBVFactor()
|
||||
"""
|
||||
|
||||
# 在此处导入具体的成交量因子
|
||||
# from .obv import OBVFactor
|
||||
# from .volume_ratio import VolumeRatioFactor
|
||||
# from .volume_ma import VolumeMAFactor
|
||||
|
||||
__all__ = [
|
||||
# 添加你的成交量因子
|
||||
]
|
||||
Reference in New Issue
Block a user