refactor(factors): 简化 add_factor API 并默认启用 metadata

- 合并 add_factor_by_name 到 add_factor,支持三种调用方式
- FactorManager 构造函数改为可选参数,使用默认路径
- FactorEngine 默认启用 metadata,无需手动配置路径
This commit is contained in:
2026-03-12 22:34:25 +08:00
parent 2bb7718dd1
commit ced7a929c3
7 changed files with 496 additions and 254 deletions

View File

@@ -40,29 +40,40 @@ from src.training.config import TrainingConfig
# ## 2. 辅助函数
# %%
def create_factors_with_metadata(
engine: FactorEngine, factor_definitions: dict, label_factor: dict
engine: FactorEngine,
selected_factors: List[str],
factor_definitions: dict,
label_factor: dict,
) -> List[str]:
"""使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)"""
"""注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)"""
print("=" * 80)
print("使用 metadata 注册因子")
print("注册因子")
print("=" * 80)
# 注册所有特征因子(通过 metadata 名称)
# 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)
print("\n注册特征因子(从 metadata:")
for name in factor_definitions.keys():
engine.add_factor_by_name(name)
for name in selected_factors:
engine.add_factor(name)
print(f" - {name}")
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中)
# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
print("\n注册特征因子(表达式):")
for name, expr in factor_definitions.items():
engine.add_factor(name, expr)
print(f" - {name}: {expr}")
# 注册 label 因子(通过表达式)
print("\n注册 Label 因子(表达式):")
for name, expr in label_factor.items():
engine.add_factor(name, expr)
print(f" - {name}: {expr}")
# 从字典自动获取特征列
feature_cols = list(factor_definitions.keys())
# 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
feature_cols = selected_factors + list(factor_definitions.keys())
print(f"\n特征因子数: {len(feature_cols)}")
print(f" - 来自 metadata: {len(selected_factors)}")
print(f" - 来自表达式: {len(factor_definitions)}")
print(f"Label: {list(label_factor.keys())[0]}")
print(f"已注册因子总数: {len(engine.list_registered())}")
@@ -236,62 +247,68 @@ def evaluate_ndcg_at_k(
# 特征因子定义字典(复用 regression.ipynb 的因子定义)
LABEL_NAME = "future_return_5_rank"
FACTOR_DEFINITIONS = {
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
"ma_5": "ts_mean(close, 5)",
"ma_20": "ts_mean(close, 20)",
"ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1",
"bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1",
"high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)",
"bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)",
"return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1",
"return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1",
"kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)",
"mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)",
"drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1",
"up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20",
# 当前选择的因子列表(按名称从 metadata 中查询并注册)
SELECTED_FACTORS = [
# ================= 1. 价格、趋势与路径依赖 =================
"ma_5",
"ma_20",
"ma_ratio_5_20",
"bias_10",
"high_low_ratio",
"bbi_ratio",
"return_5",
"return_20",
"kaufman_ER_20",
"mom_acceleration_10_20",
"drawdown_from_high_60",
"up_days_ratio_20",
# ================= 2. 波动率、风险调整与高阶矩 =================
"volatility_5": "ts_std(close, 5)",
"volatility_20": "ts_std(close, 20)",
"volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)",
"std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)",
"sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)",
"min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
"volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)",
"volatility_5",
"volatility_20",
"volatility_ratio",
"std_return_20",
"sharpe_ratio_20",
"min_ret_20",
"volatility_squeeze_5_60",
# ================= 3. 日内微观结构与异象 =================
"overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)",
"upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)",
"capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)",
"max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
"overnight_intraday_diff",
"upper_shadow_ratio",
"capital_retention_20",
"max_ret_20",
# ================= 4. 量能、流动性与量价背离 =================
"volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)",
"turnover_rate_mean_5": "ts_mean(turnover_rate, 5)",
"turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)",
"amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)",
"turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)",
"pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)",
"close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1",
"volume_ratio_5_20",
"turnover_rate_mean_5",
"turnover_deviation",
"amihud_illiq_20",
"turnover_cv_20",
"pv_corr_20",
"close_vwap_deviation",
# ================= 5. 基本面财务特征 =================
"roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)",
"roa": "n_income / (total_assets + 1e-8)",
"profit_margin": "n_income / (revenue + 1e-8)",
"debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)",
"current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)",
"net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1",
"revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1",
"healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)",
"roe",
"roa",
"profit_margin",
"debt_to_equity",
"current_ratio",
"net_profit_yoy",
"revenue_yoy",
"healthy_expansion_velocity",
# ================= 6. 基本面估值与截面动量共振 =================
"EP": "n_income / (total_mv * 10000 + 1e-8)",
"BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)",
"CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)",
"market_cap_rank": "cs_rank(total_mv)",
"turnover_rank": "cs_rank(turnover_rate)",
"return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)",
"EP_rank": "cs_rank(n_income / (total_mv + 1e-8))",
"pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1",
"value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))",
"active_market_cap": "total_mv * ts_mean(turnover_rate, 20)",
"ebit_rank": "cs_rank(ebit)",
"EP",
"BP",
"CP",
"market_cap_rank",
"turnover_rank",
"return_5_rank",
"EP_rank",
"pe_expansion_trend",
"value_price_divergence",
"active_market_cap",
"ebit_rank",
]
# 因子定义字典(尚未写入 metadata、需通过表达式注册的因子)
FACTOR_DEFINITIONS = {
# "turnover_volatility_ratio": "log(ts_std(turnover_rate, 20))"
}
# Label 因子定义(不参与训练,用于计算目标)
@@ -332,7 +349,7 @@ MODEL_PARAMS = {
N_QUANTILES = 20 # 将 label 分为 20 组
# 特征列(用于数据处理器)
FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())
FEATURE_COLS = SELECTED_FACTORS
# 数据处理器配置
PROCESSORS = [
@@ -385,11 +402,13 @@ print("=" * 80)
# 1. 创建 FactorEngine(启用 metadata 功能)
print("\n[1] 创建 FactorEngine")
engine = FactorEngine(metadata_path="data/factors.jsonl")
engine = FactorEngine()
# 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
feature_cols = create_factors_with_metadata(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
)
# 3. 准备数据
print("\n[3] 准备数据")