feat(training): LightGBM支持验证集早停

- 为fit方法添加eval_set参数,支持验证集评估和早停

- 因子引擎简化初始化,移除metadata_path参数

- 回归实验精简因子定义,移除冗余因子库
This commit is contained in:
2026-03-14 22:51:24 +08:00
parent 5541373ded
commit 6927d20de1
3 changed files with 19 additions and 90 deletions

View File

@@ -164,84 +164,6 @@ SELECTED_FACTORS = [
# Factor definition dictionary (full factor library).
# Maps each factor name to its formula expression string.
# The recurring `+ 1e-8` in denominators is presumably a guard against division by zero.
FACTOR_DEFINITIONS = {
# ================= 1. Price, Trend, Momentum & Path Dependency =================
"ma_5": "ts_mean(close, 5)",
"ma_20": "ts_mean(close, 20)",
"ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1", # moving-average divergence (5-day vs 20-day)
"bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1", # 10-day bias (deviation of close from its 10-day mean)
"high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)", # variant of the Williams indicator
"bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)", # bull-bear index (BBI) ratio
"return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1", # 5-day momentum
"return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1", # 20-day momentum
# [Advanced] Kaufman efficiency ratio (very high value) - measures trend smoothness, filtering out directionless chop
"kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)",
# [Advanced] Momentum acceleration - looks for stocks whose second derivative is positive (accelerating breakouts)
"mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)",
# [Advanced] Distance from the recent high - measures overhead pressure from trapped holders
"drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1",
# [Advanced] Trend consistency - fraction of the past 20 days that closed above the previous close
"up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20",
# ================= 2. Volatility, Risk-Adjusted Returns & Higher Moments =================
"volatility_5": "ts_std(close, 5)",
"volatility_20": "ts_std(close, 20)",
"volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)", # volatility term structure
"std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)", # volatility of actual daily returns
# [Advanced] Sharpe-style trend ratio - penalizes spikes and crashes, rewards steady climbs
"sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)",
# [Advanced] Tail crash risk - smallest (most negative) single-day return over the past 20 days
"min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
# [Advanced] Volatility squeeze ratio - looks for extreme consolidation ahead of a breakout (Bollinger band contraction)
"volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)",
# ================= 3. Intraday Microstructure & Anomalies =================
# [Advanced] Overnight vs intraday divergence - per the author, a smaller value suggests accumulation during the session
"overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)",
# [Advanced] Upper-shadow selling pressure - likelihood of being trapped after a rally that faded
"upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)",
# [Advanced] Capital retention rate - measures how violently price is churned intraday (body size vs full range)
"capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)",
# [Advanced] MAX lottery effect - reversal factor, filters out names with recent limit-up streak behaviour
"max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
# ================= 4. Volume, Liquidity & Price-Volume Divergence =================
"volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)", # relative volume expansion
"turnover_rate_mean_5": "ts_mean(turnover_rate, 5)", # activity level
"turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)", # turnover-rate deviation (z-score over 10 days)
# [Advanced] Amihud illiquidity anomaly (core factor) - measures the friction cost of pushing price up or down
"amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)",
# [Advanced] Turnover penalty factor - erratic turnover suggests speculative relay trading and unstable price action
"turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)",
# [Advanced] Pure price-volume correlation - checks for the healthy "up on volume, down on light volume" pattern
"pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)",
# [Advanced] Close vs average-price deviation - targets stocks with deceptive late-session ramps
# NOTE(review): assumes amount / (vol * 100) is in the same price unit as close - confirm the data source's units
"close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1",
# ================= 5. Fundamental Quality & Structure =================
"roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)", # return on equity
"roa": "n_income / (total_assets + 1e-8)", # return on assets
"profit_margin": "n_income / (revenue + 1e-8)", # net profit margin
"debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)", # leverage
"current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)", # short-term solvency cushion
# [Advanced] Net-profit year-over-year growth (a 252-row delay on daily data stands in for the same period last year)
"net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1",
# [Advanced] Revenue year-over-year growth
"revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1",
# [Advanced] Balance-sheet expansion slope - asset growth minus liability growth, to filter out purely debt-funded expansion
"healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)",
# ================= 6. Valuation & Cross-Sectional Ranking =================
# Absolute valuation levels (per the author, Tushare market cap must be multiplied by 10000 to convert to yuan)
"EP": "n_income / (total_mv * 10000 + 1e-8)", # earnings yield (1/PE)
"BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)", # book-to-market (1/PB)
"CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)", # operating cash-flow yield (1/PCF)
# Whole-market cross-sectional rank factors
"market_cap_rank": "cs_rank(total_mv)", # size factor
"turnover_rank": "cs_rank(turnover_rate)",
"return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)",
# NOTE(review): unlike "EP" above, total_mv is not scaled by 10000 here; likely harmless under a rank, but confirm
"EP_rank": "cs_rank(n_income / (total_mv + 1e-8))", # which stock is cheapest
# [Advanced] Davis double-play momentum - whether valuation is expanding relative to an earlier period
# NOTE(review): the original comment said "previous year" but the delay used is 60 rows - confirm the intended horizon
"pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1",
# [Advanced] Earnings vs price divergence - cross-sectional difference of ranks: profit-growth rank high while
# 20-day price rank is low, intended to catch oversold names that the market has mispriced
"value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))",
# [Advanced] Liquidity-adjusted market cap - distinguishes dormant large caps from highly active small/micro caps
"active_market_cap": "total_mv * ts_mean(turnover_rate, 20)",
"ebit_rank": "cs_rank(ebit)",
}
# Label 因子定义(不参与训练,用于计算目标)

View File

@@ -58,14 +58,12 @@ class FactorEngine:
self,
data_source: Optional[Dict[str, pl.DataFrame]] = None,
registry: Optional["FunctionRegistry"] = None,
metadata_path: Optional[str] = None,
) -> None:
"""初始化因子引擎。
Args:
data_source: 内存数据源,为 None 时使用数据库连接
registry: 函数注册表,为 None 时创建独立实例
metadata_path: 因子元数据文件路径,为 None 时启用默认 metadata 功能
"""
from src.factors.registry import FunctionRegistry
from src.factors.parser import FormulaParser
@@ -80,16 +78,10 @@ class FactorEngine:
self._registry = registry if registry is not None else FunctionRegistry()
self._parser = FormulaParser(self._registry)
# 初始化 metadata 管理器(可选,默认启用)
if metadata_path is not None:
from src.factors.metadata import FactorManager
# 初始化 metadata 管理器(使用默认路径)
from src.factors.metadata import FactorManager
self._metadata = FactorManager(metadata_path)
else:
# 使用 FactorManager 的默认路径
from src.factors.metadata import FactorManager
self._metadata = FactorManager()
self._metadata = FactorManager()
def _register_internal(
self,

View File

@@ -49,12 +49,18 @@ class LightGBMModel(BaseModel):
self.model = None
self.feature_names_: Optional[list] = None
def fit(self, X: pl.DataFrame, y: pl.Series) -> "LightGBMModel":
def fit(
self,
X: pl.DataFrame,
y: pl.Series,
eval_set: Optional[tuple] = None,
) -> "LightGBMModel":
"""训练模型
Args:
X: 特征矩阵 (Polars DataFrame)
y: 目标变量 (Polars Series)
eval_set: 验证集元组 (X_val, y_val),用于验证集评估;早停需在 params 中配置 early_stopping_round 才会生效
Returns:
self (支持链式调用)
@@ -76,6 +82,14 @@ class LightGBMModel(BaseModel):
train_data = lgb.Dataset(X_np, label=y_np)
# 准备验证集
valid_sets = None
if eval_set is not None:
X_val, y_val = eval_set
X_val_np = X_val.to_numpy()
y_val_np = y_val.to_numpy()
valid_sets = lgb.Dataset(X_val_np, label=y_val_np, reference=train_data)
# 从 params 中提取 num_boost_round,默认 100
num_boost_round = self.params.pop("n_estimators", 100)
@@ -83,6 +97,7 @@ class LightGBMModel(BaseModel):
self.params,
train_data,
num_boost_round=num_boost_round,
valid_sets=[valid_sets] if valid_sets else None,
)
return self