feat(training): LightGBM支持验证集早停
- 为fit方法添加eval_set参数,支持验证集评估和早停
- 因子引擎简化初始化,移除metadata_path参数
- 回归实验精简因子定义,移除冗余因子库
This commit is contained in:
@@ -164,84 +164,6 @@ SELECTED_FACTORS = [
|
||||
|
||||
# 因子定义字典(完整因子库)
|
||||
FACTOR_DEFINITIONS = {
|
||||
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
|
||||
"ma_5": "ts_mean(close, 5)",
|
||||
"ma_20": "ts_mean(close, 20)",
|
||||
"ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1", # 均线发散度
|
||||
"bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1", # 10日乖离率
|
||||
"high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)", # 威廉指标变形
|
||||
"bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)", # 多空指标比率
|
||||
"return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1", # 5日动量
|
||||
"return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1", # 20日动量
|
||||
# [高阶] Kaufman 趋势效率 (极高价值) - 衡量趋势流畅度,剔除无序震荡
|
||||
"kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)",
|
||||
# [高阶] 动量加速度 - 寻找二阶导数大于0,正在加速爆发的股票
|
||||
"mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)",
|
||||
# [高阶] 高点距离衰减 - 衡量套牢盘压力
|
||||
"drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1",
|
||||
# [高阶] 趋势一致性 - 过去20天内收红的天数比例
|
||||
"up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20",
|
||||
# ================= 2. 波动率、风险调整与高阶矩 (Volatility & Risk-Adjusted Returns) =================
|
||||
"volatility_5": "ts_std(close, 5)",
|
||||
"volatility_20": "ts_std(close, 20)",
|
||||
"volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)", # 波动率期限结构
|
||||
"std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)", # 真实收益率波动率
|
||||
# [高阶] 夏普趋势比率 - 惩罚暴涨暴跌,奖励稳健爬坡
|
||||
"sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)",
|
||||
# [高阶] 尾部崩盘风险 - 过去一个月最大单日跌幅
|
||||
"min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
|
||||
# [高阶] 波动率挤压比 - 寻找盘整到极致面临变盘的股票 (布林带收口)
|
||||
"volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)",
|
||||
# ================= 3. 日内微观结构与异象 (Intraday Microstructure & Anomalies) =================
|
||||
# [高阶] 隔夜与日内背离 - 差值越小说明主力越喜欢在盘中吸筹
|
||||
"overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)",
|
||||
# [高阶] 上影线抛压极值 - 冲高回落被套牢的概率
|
||||
"upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)",
|
||||
# [高阶] 资金沉淀率 - 衡量主力日内高抛低吸洗盘的剧烈程度
|
||||
"capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)",
|
||||
# [高阶] MAX 彩票效应 - 反转因子,剔除近期有过妖股连板特征的标的
|
||||
"max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
|
||||
# ================= 4. 量能、流动性与量价背离 (Volume, Liquidity & Divergence) =================
|
||||
"volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)", # 相对放量比
|
||||
"turnover_rate_mean_5": "ts_mean(turnover_rate, 5)", # 活跃度
|
||||
"turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)", # 换手率偏离度
|
||||
# [高阶] Amihud 非流动性异象 (绝对核心) - 衡量砸盘/拉升的摩擦成本
|
||||
"amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)",
|
||||
# [高阶] 换手率惩罚因子 - 换手率忽高忽低说明游资接力,行情极不稳定
|
||||
"turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)",
|
||||
# [高阶] 纯粹量价相关性 - 检验是否是"放量上涨,缩量下跌"的良性多头
|
||||
"pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)",
|
||||
# [高阶] 收盘价与均价背离 - 专门抓尾盘突袭拉升骗线的股票
|
||||
"close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1",
|
||||
# ================= 5. 基本面财务特征 (Fundamental Quality & Structure) =================
|
||||
"roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)", # 净资产收益率
|
||||
"roa": "n_income / (total_assets + 1e-8)", # 总资产收益率
|
||||
"profit_margin": "n_income / (revenue + 1e-8)", # 销售净利率
|
||||
"debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)", # 杠杆率
|
||||
"current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)", # 短期偿债安全垫
|
||||
# [高阶] 利润同比增速 (日频延后252天等于去年同期)
|
||||
"net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1",
|
||||
# [高阶] 营收同比增速
|
||||
"revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1",
|
||||
# [高阶] 资产负债表扩张斜率 - 剔除单纯靠举债扩张的公司
|
||||
"healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)",
|
||||
# ================= 6. 基本面估值与截面动量共振 (Valuation & Cross-Sectional Ranking) =================
|
||||
# 估值水平绝对值 (Tushare 市值单位需要 * 10000 转换为元)
|
||||
"EP": "n_income / (total_mv * 10000 + 1e-8)", # 盈利收益率 (1/PE)
|
||||
"BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)", # 账面市值比 (1/PB)
|
||||
"CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)", # 经营现金流收益率 (1/PCF)
|
||||
# 全市场截面排名因子
|
||||
"market_cap_rank": "cs_rank(total_mv)", # 规模因子 (Size)
|
||||
"turnover_rank": "cs_rank(turnover_rate)",
|
||||
"return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)",
|
||||
"EP_rank": "cs_rank(n_income / (total_mv + 1e-8))", # 谁最便宜
|
||||
# [高阶] 戴维斯双击动量 - 估值 (PE) 相对 60 个交易日前(约一个季度)是否在扩张
|
||||
"pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1",
|
||||
# [高阶] 业绩与价格背离度 - 截面做差:利润排名全市场第一,但近20日价格排名倒数第一,捕捉被错杀的潜伏股
|
||||
"value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))",
|
||||
# [高阶] 流动性溢价调整后市值 - 识别僵尸大盘股和极度活跃的小微盘
|
||||
"active_market_cap": "total_mv * ts_mean(turnover_rate, 20)",
|
||||
"ebit_rank": "cs_rank(ebit)",
|
||||
}
|
||||
|
||||
# Label 因子定义(不参与训练,用于计算目标)
|
||||
|
||||
@@ -58,14 +58,12 @@ class FactorEngine:
|
||||
self,
|
||||
data_source: Optional[Dict[str, pl.DataFrame]] = None,
|
||||
registry: Optional["FunctionRegistry"] = None,
|
||||
metadata_path: Optional[str] = None,
|
||||
) -> None:
|
||||
"""初始化因子引擎。
|
||||
|
||||
Args:
|
||||
data_source: 内存数据源,为 None 时使用数据库连接
|
||||
registry: 函数注册表,None 时创建独立实例
|
||||
metadata_path: 因子元数据文件路径,为 None 时启用默认 metadata 功能
|
||||
"""
|
||||
from src.factors.registry import FunctionRegistry
|
||||
from src.factors.parser import FormulaParser
|
||||
@@ -80,13 +78,7 @@ class FactorEngine:
|
||||
self._registry = registry if registry is not None else FunctionRegistry()
|
||||
self._parser = FormulaParser(self._registry)
|
||||
|
||||
# 初始化 metadata 管理器(可选,默认启用)
|
||||
if metadata_path is not None:
|
||||
from src.factors.metadata import FactorManager
|
||||
|
||||
self._metadata = FactorManager(metadata_path)
|
||||
else:
|
||||
# 使用 FactorManager 的默认路径
|
||||
# 初始化 metadata 管理器(使用默认路径)
|
||||
from src.factors.metadata import FactorManager
|
||||
|
||||
self._metadata = FactorManager()
|
||||
|
||||
@@ -49,12 +49,18 @@ class LightGBMModel(BaseModel):
|
||||
self.model = None
|
||||
self.feature_names_: Optional[list] = None
|
||||
|
||||
def fit(self, X: pl.DataFrame, y: pl.Series) -> "LightGBMModel":
|
||||
def fit(
|
||||
self,
|
||||
X: pl.DataFrame,
|
||||
y: pl.Series,
|
||||
eval_set: Optional[tuple] = None,
|
||||
) -> "LightGBMModel":
|
||||
"""训练模型
|
||||
|
||||
Args:
|
||||
X: 特征矩阵 (Polars DataFrame)
|
||||
y: 目标变量 (Polars Series)
|
||||
eval_set: 验证集元组 (X_val, y_val),用于早停
|
||||
|
||||
Returns:
|
||||
self (支持链式调用)
|
||||
@@ -76,6 +82,14 @@ class LightGBMModel(BaseModel):
|
||||
|
||||
train_data = lgb.Dataset(X_np, label=y_np)
|
||||
|
||||
# 准备验证集
|
||||
valid_sets = None
|
||||
if eval_set is not None:
|
||||
X_val, y_val = eval_set
|
||||
X_val_np = X_val.to_numpy()
|
||||
y_val_np = y_val.to_numpy()
|
||||
valid_sets = lgb.Dataset(X_val_np, label=y_val_np, reference=train_data)
|
||||
|
||||
# 从 params 中提取 num_boost_round,默认 100
|
||||
num_boost_round = self.params.pop("n_estimators", 100)
|
||||
|
||||
@@ -83,6 +97,7 @@ class LightGBMModel(BaseModel):
|
||||
self.params,
|
||||
train_data,
|
||||
num_boost_round=num_boost_round,
|
||||
valid_sets=[valid_sets] if valid_sets else None,
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
Reference in New Issue
Block a user