feat(experiment): 新增因子排除机制并优化模型训练参数

- 添加 EXCLUDED_FACTORS 列表支持批量排除效果不佳的因子
- 修复 LightGBM 树结构冲突,调整正则化和采样策略防过拟合
- 调整数据处理器配置,关闭模型自动保存
This commit is contained in:
2026-03-18 20:57:02 +08:00
parent 16f82d3458
commit 0a29506f45
3 changed files with 131 additions and 55 deletions

View File

@@ -11,7 +11,6 @@ import polars as pl
from src.factors import FactorEngine from src.factors import FactorEngine
# ============================================================================= # =============================================================================
# 日期范围配置(正确的 train/val/test 三分法) # 日期范围配置(正确的 train/val/test 三分法)
# ============================================================================= # =============================================================================
@@ -22,7 +21,6 @@ VAL_END = "20241231"
TEST_START = "20250101" TEST_START = "20250101"
TEST_END = "20261231" TEST_END = "20261231"
# ============================================================================= # =============================================================================
# 因子配置 # 因子配置
# ============================================================================= # =============================================================================
@@ -257,6 +255,49 @@ SELECTED_FACTORS = [
# 因子定义字典完整因子库用于存放尚未注册到metadata的因子 # 因子定义字典完整因子库用于存放尚未注册到metadata的因子
FACTOR_DEFINITIONS = {} FACTOR_DEFINITIONS = {}
# Factors excluded from computation and use (they will be neither
# calculated nor fed to the model).
# Lets us temporarily disable underperforming factors without having to
# delete them from SELECTED_FACTORS.
EXCLUDED_FACTORS: List[str] = [
    # Zero-padded GTJA alpha numbers, e.g. 5 -> "GTJA_alpha005".
    f"GTJA_alpha{num:03d}"
    for num in (
        5, 28, 23, 2, 10, 11, 44, 36, 27, 109,
        104, 103, 85, 111, 92, 67, 60, 62, 63, 79,
        73, 87, 117, 113, 138, 121, 124, 133, 131, 118,
        164, 162, 157, 171, 177, 180, 188, 191,
    )
]
def get_label_factor(label_name: str) -> dict: def get_label_factor(label_name: str) -> dict:
"""获取Label因子定义字典。 """获取Label因子定义字典。
@@ -280,48 +321,80 @@ def register_factors(
selected_factors: List[str], selected_factors: List[str],
factor_definitions: dict, factor_definitions: dict,
label_factor: dict, label_factor: dict,
excluded_factors: Optional[List[str]] = None,
) -> List[str]: ) -> List[str]:
"""注册因子。 """注册因子。
selected_factors 从 metadata 查询factor_definitions 用 DSL 表达式注册。 selected_factors 从 metadata 查询factor_definitions 用 DSL 表达式注册。
excluded_factors 中的因子会被排除,不参与计算。
Args: Args:
engine: FactorEngine实例 engine: FactorEngine实例
selected_factors: 从metadata中选择的因子名称列表 selected_factors: 从metadata中选择的因子名称列表
factor_definitions: 通过表达式定义的因子字典 factor_definitions: 通过表达式定义的因子字典
label_factor: label因子定义字典 label_factor: label因子定义字典
excluded_factors: 需要排除的因子名称列表默认为None
Returns: Returns:
特征列名称列表 特征列名称列表已排除excluded_factors中的因子
""" """
print("=" * 80) print("=" * 80)
print("注册因子") print("注册因子")
print("=" * 80) print("=" * 80)
# 处理排除列表
excluded = set(excluded_factors) if excluded_factors else set()
if excluded:
print(f"\n[排除因子] 以下 {len(excluded)} 个因子将被排除:")
for name in sorted(excluded):
print(f" - {name}")
# 过滤 SELECTED_FACTORS 中的因子排除excluded_factors
filtered_selected = [name for name in selected_factors if name not in excluded]
excluded_from_selected = set(selected_factors) - set(filtered_selected)
if excluded_from_selected:
print(
f"\n[排除详情] 从 SELECTED_FACTORS 排除 {len(excluded_from_selected)} 个因子"
)
# 注册 SELECTED_FACTORS 中的因子(已在 metadata 中) # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)
print("\n注册特征因子(从 metadata:") print("\n注册特征因子(从 metadata:")
for name in selected_factors: for name in filtered_selected:
engine.add_factor(name) engine.add_factor(name)
print(f" - {name}") print(f" - {name}")
# 过滤 FACTOR_DEFINITIONS 中的因子排除excluded_factors
filtered_definitions = {
name: expr for name, expr in factor_definitions.items() if name not in excluded
}
excluded_from_definitions = set(factor_definitions.keys()) - set(
filtered_definitions.keys()
)
if excluded_from_definitions:
print(
f"\n[排除详情] 从 FACTOR_DEFINITIONS 排除 {len(excluded_from_definitions)} 个因子"
)
# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中) # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
print("\n注册特征因子(表达式):") print("\n注册特征因子(表达式):")
for name, expr in factor_definitions.items(): for name, expr in filtered_definitions.items():
engine.add_factor(name, expr) engine.add_factor(name, expr)
print(f" - {name}: {expr}") print(f" - {name}: {expr}")
# 注册 label 因子(通过表达式) # 注册 label 因子(通过表达式label因子不受excluded_factors影响
print("\n注册 Label 因子(表达式):") print("\n注册 Label 因子(表达式):")
for name, expr in label_factor.items(): for name, expr in label_factor.items():
engine.add_factor(name, expr) engine.add_factor(name, expr)
print(f" - {name}: {expr}") print(f" - {name}: {expr}")
# 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys # 特征列 = 过滤后的 SELECTED_FACTORS + 过滤后的 FACTOR_DEFINITIONS 的 keys
feature_cols = selected_factors + list(factor_definitions.keys()) feature_cols = filtered_selected + list(filtered_definitions.keys())
print(f"\n特征因子数: {len(feature_cols)}") print(f"\n特征因子数: {len(feature_cols)}")
print(f" - 来自 metadata: {len(selected_factors)}") print(f" - 来自 metadata: {len(filtered_selected)}")
print(f" - 来自表达式: {len(factor_definitions)}") print(f" - 来自表达式: {len(filtered_definitions)}")
if excluded:
print(f" - 已排除: {len(excluded)}")
print(f"Label: {list(label_factor.keys())[0]}") print(f"Label: {list(label_factor.keys())[0]}")
print(f"已注册因子总数: {len(engine.list_registered())}") print(f"已注册因子总数: {len(engine.list_registered())}")
@@ -410,7 +483,6 @@ def stock_pool_filter(df: pl.DataFrame) -> pl.Series:
# 定义筛选所需的基础列 # 定义筛选所需的基础列
STOCK_FILTER_REQUIRED_COLUMNS = ["total_mv"] STOCK_FILTER_REQUIRED_COLUMNS = ["total_mv"]
# ============================================================================= # =============================================================================
# 输出配置 # 输出配置
# ============================================================================= # =============================================================================
@@ -418,7 +490,7 @@ OUTPUT_DIR = "output"
SAVE_PREDICTIONS = True SAVE_PREDICTIONS = True
# 模型保存配置 # 模型保存配置
SAVE_MODEL = True # 是否保存模型 SAVE_MODEL = False # 是否保存模型
MODEL_SAVE_DIR = "models" # 模型保存目录 MODEL_SAVE_DIR = "models" # 模型保存目录
# Top N 配置:每日推荐股票数量 # Top N 配置:每日推荐股票数量

View File

@@ -41,6 +41,7 @@ from src.training.config import TrainingConfig
from src.experiment.common import ( from src.experiment.common import (
SELECTED_FACTORS, SELECTED_FACTORS,
FACTOR_DEFINITIONS, FACTOR_DEFINITIONS,
EXCLUDED_FACTORS,
get_label_factor, get_label_factor,
register_factors, register_factors,
prepare_data, prepare_data,
@@ -260,7 +261,7 @@ engine = FactorEngine()
# 2. 使用 metadata 定义因子 # 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)") print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = register_factors( feature_cols = register_factors(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR, EXCLUDED_FACTORS
) )
# 3. 准备数据 # 3. 准备数据

View File

@@ -17,6 +17,7 @@ from src.training import (
Winsorizer, Winsorizer,
NullFiller, NullFiller,
check_data_quality, check_data_quality,
CrossSectionalStandardScaler,
) )
from src.training.config import TrainingConfig from src.training.config import TrainingConfig
@@ -24,6 +25,7 @@ from src.training.config import TrainingConfig
from src.experiment.common import ( from src.experiment.common import (
SELECTED_FACTORS, SELECTED_FACTORS,
FACTOR_DEFINITIONS, FACTOR_DEFINITIONS,
EXCLUDED_FACTORS,
get_label_factor, get_label_factor,
register_factors, register_factors,
prepare_data, prepare_data,
@@ -60,24 +62,25 @@ LABEL_FACTOR = get_label_factor(LABEL_NAME)
# 模型参数配置 # 模型参数配置
MODEL_PARAMS = { MODEL_PARAMS = {
"objective": "regression", # 基础设置
"metric": "mae", # 改为 MAE对异常值更稳健 "objective": "regression_l1", # LightGBM 中 MAE 对应的目标函数推荐写 regression_l1
# 树结构控制(防过拟合核心) "metric": "mae",
# "num_leaves": 20, # 从31降为20,降低模型复杂度 # 1. 修复树结构冲突:深度设为5,叶子数必须<=32。
# "max_depth": 16, # 显式限制深度,防止过拟合噪声 # 推荐设定为稍微小于满二叉树的数值(如 15~31以增加树的不对称性,防止过拟合
# "min_child_samples": 50, # 叶子最小样本数,防止学习极端样本 "max_depth": 5,
# "min_child_weight": 0.001, "num_leaves": 24, # 修改:从 63 降为 24
# 学习参数 "min_data_in_leaf": 100, # 修改:适当增大,金融数据噪音大,叶子节点数据越多越抗噪
"learning_rate": 0.01, # 降低学习率,配合更多树 # 2. 学习参数
"n_estimators": 1000, # 增加树数量,配合早停 "learning_rate": 0.01,
# 采样策略(关键防过拟合) "n_estimators": 1500, # 修改:配合小学习率,树可以再多一点
"subsample": 0.8, # 每棵树随机采样80%数据(行采样) # 3. 修复采样抖动:改为每棵树都重新采样
"subsample_freq": 5, # 每5轮迭代进行一次 subsample "subsample": 0.8,
"colsample_bytree": 0.8, # 每棵树随机选择80%特征(列采样) "subsample_freq": 1, # 【关键修改】:从 5 改为 1。每轮都重采样让抖动均匀化而不是5轮来一次大抖动
# 正则化 "colsample_bytree": 0.8,
"reg_alpha": 0.1, # L1正则增加稀疏性 # 正则化(金融量化等高噪场景可适当加大)
"reg_lambda": 1.0, # L2正则平滑权重 "reg_alpha": 0.5, # 修改适当提高L1强迫模型只选最有效的因子
# 数值稳定性 "reg_lambda": 1.0,
# 杂项
"verbose": -1, "verbose": -1,
"random_state": 42, "random_state": 42,
} }
@@ -92,12 +95,12 @@ print("=" * 80)
# 1. 创建 FactorEngine启用 metadata 功能) # 1. 创建 FactorEngine启用 metadata 功能)
print("\n[1] 创建 FactorEngine") print("\n[1] 创建 FactorEngine")
engine = FactorEngine(metadata_path="data/factors.jsonl") engine = FactorEngine()
# 2. 使用 metadata 定义因子 # 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)") print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = register_factors( feature_cols = register_factors(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR, EXCLUDED_FACTORS
) )
target_col = LABEL_NAME target_col = LABEL_NAME
@@ -126,7 +129,7 @@ model = LightGBMModel(params=MODEL_PARAMS)
processors = [ processors = [
NullFiller(feature_cols=feature_cols, strategy="mean"), NullFiller(feature_cols=feature_cols, strategy="mean"),
Winsorizer(feature_cols=feature_cols, lower=0.01, upper=0.99), Winsorizer(feature_cols=feature_cols, lower=0.01, upper=0.99),
StandardScaler(feature_cols=feature_cols), StandardScaler(feature_cols=feature_cols + [LABEL_NAME]),
] ]
# 7. 创建数据划分器(正确的 train/val/test 三分法) # 7. 创建数据划分器(正确的 train/val/test 三分法)
@@ -230,7 +233,7 @@ print("-" * 60)
print(" [说明] 此检查在 fillna 等处理之前执行,用于发现数据问题") print(" [说明] 此检查在 fillna 等处理之前执行,用于发现数据问题")
print("\n 检查训练集...") print("\n 检查训练集...")
check_data_quality(train_data, feature_cols, raise_on_error=True) check_data_quality(train_data, feature_cols, raise_on_error=False)
if "val_data" in locals() and val_data is not None: if "val_data" in locals() and val_data is not None:
print("\n 检查验证集...") print("\n 检查验证集...")
@@ -579,7 +582,7 @@ zero_importance = importance_gain[importance_gain == 0].index.tolist()
if zero_importance: if zero_importance:
print(f"\n[低重要性特征] 以下{len(zero_importance)}个特征重要性为0可考虑删除:") print(f"\n[低重要性特征] 以下{len(zero_importance)}个特征重要性为0可考虑删除:")
for feat in zero_importance: for feat in zero_importance:
print(f" - {feat}") print(f"'{feat}',")
else: else:
print("\n所有特征都有一定重要性") print("\n所有特征都有一定重要性")