feat(experiment): 新增因子排除机制并优化模型训练参数
- 添加 EXCLUDED_FACTORS 列表支持批量排除效果不佳的因子
- 修复 LightGBM 树结构冲突,调整正则化和采样策略防过拟合
- 调整数据处理器配置,关闭模型自动保存
This commit is contained in:
@@ -17,6 +17,7 @@ from src.training import (
|
||||
Winsorizer,
|
||||
NullFiller,
|
||||
check_data_quality,
|
||||
CrossSectionalStandardScaler,
|
||||
)
|
||||
from src.training.config import TrainingConfig
|
||||
|
||||
@@ -24,6 +25,7 @@ from src.training.config import TrainingConfig
|
||||
from src.experiment.common import (
|
||||
SELECTED_FACTORS,
|
||||
FACTOR_DEFINITIONS,
|
||||
EXCLUDED_FACTORS,
|
||||
get_label_factor,
|
||||
register_factors,
|
||||
prepare_data,
|
||||
@@ -60,24 +62,25 @@ LABEL_FACTOR = get_label_factor(LABEL_NAME)
|
||||
|
||||
# 模型参数配置
|
||||
MODEL_PARAMS = {
|
||||
"objective": "regression",
|
||||
"metric": "mae", # 改为 MAE,对异常值更稳健
|
||||
# 树结构控制(防过拟合核心)
|
||||
# "num_leaves": 20, # 从31降为20,降低模型复杂度
|
||||
# "max_depth": 16, # 显式限制深度,防止过度拟合噪声
|
||||
# "min_child_samples": 50, # 叶子最小样本数,防止学习极端样本
|
||||
# "min_child_weight": 0.001,
|
||||
# 学习参数
|
||||
"learning_rate": 0.01, # 降低学习率,配合更多树
|
||||
"n_estimators": 1000, # 增加树数量,配合早停
|
||||
# 采样策略(关键防过拟合)
|
||||
"subsample": 0.8, # 每棵树随机采样80%数据(行采样)
|
||||
"subsample_freq": 5, # 每5轮迭代进行一次 subsample
|
||||
"colsample_bytree": 0.8, # 每棵树随机选择80%特征(列采样)
|
||||
# 正则化
|
||||
"reg_alpha": 0.1, # L1正则,增加稀疏性
|
||||
"reg_lambda": 1.0, # L2正则,平滑权重
|
||||
# 数值稳定性
|
||||
# 基础设置
|
||||
"objective": "regression_l1", # LightGBM 中 MAE 对应的目标函数推荐写 regression_l1
|
||||
"metric": "mae",
|
||||
# 1. 修复树结构冲突:深度设为5,叶子数必须<=32。
|
||||
# 推荐设定为稍微小于满二叉树的数值(如 15~31),以增加树的不对称性,防止过拟合
|
||||
"max_depth": 5,
|
||||
"num_leaves": 24, # 修改:从 63 降为 24
|
||||
"min_data_in_leaf": 100, # 修改:适当增大,金融数据噪音大,叶子节点数据越多越抗噪
|
||||
# 2. 学习参数
|
||||
"learning_rate": 0.01,
|
||||
"n_estimators": 1500, # 修改:配合小学习率,树可以再多一点
|
||||
# 3. 修复采样抖动:改为每棵树都重新采样
|
||||
"subsample": 0.8,
|
||||
"subsample_freq": 1, # 【关键修改】:从 5 改为 1。每轮都重采样,让抖动均匀化,而不是5轮来一次大抖动
|
||||
"colsample_bytree": 0.8,
|
||||
# 正则化(金融量化等高噪场景可适当加大)
|
||||
"reg_alpha": 0.5, # 修改:适当提高L1,强迫模型只选最有效的因子
|
||||
"reg_lambda": 1.0,
|
||||
# 杂项
|
||||
"verbose": -1,
|
||||
"random_state": 42,
|
||||
}
|
||||
@@ -92,12 +95,12 @@ print("=" * 80)
|
||||
|
||||
# 1. 创建 FactorEngine(启用 metadata 功能)
|
||||
print("\n[1] 创建 FactorEngine")
|
||||
engine = FactorEngine(metadata_path="data/factors.jsonl")
|
||||
engine = FactorEngine()
|
||||
|
||||
# 2. 使用 metadata 定义因子
|
||||
print("\n[2] 定义因子(从 metadata 注册)")
|
||||
feature_cols = register_factors(
|
||||
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
|
||||
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR, EXCLUDED_FACTORS
|
||||
)
|
||||
target_col = LABEL_NAME
|
||||
|
||||
@@ -126,7 +129,7 @@ model = LightGBMModel(params=MODEL_PARAMS)
|
||||
processors = [
|
||||
NullFiller(feature_cols=feature_cols, strategy="mean"),
|
||||
Winsorizer(feature_cols=feature_cols, lower=0.01, upper=0.99),
|
||||
StandardScaler(feature_cols=feature_cols),
|
||||
StandardScaler(feature_cols=feature_cols + [LABEL_NAME]),
|
||||
]
|
||||
|
||||
# 7. 创建数据划分器(正确的 train/val/test 三分法)
|
||||
@@ -230,7 +233,7 @@ print("-" * 60)
|
||||
print(" [说明] 此检查在 fillna 等处理之前执行,用于发现数据问题")
|
||||
|
||||
print("\n 检查训练集...")
|
||||
check_data_quality(train_data, feature_cols, raise_on_error=True)
|
||||
check_data_quality(train_data, feature_cols, raise_on_error=False)
|
||||
|
||||
if "val_data" in locals() and val_data is not None:
|
||||
print("\n 检查验证集...")
|
||||
@@ -579,7 +582,7 @@ zero_importance = importance_gain[importance_gain == 0].index.tolist()
|
||||
if zero_importance:
|
||||
print(f"\n[低重要性特征] 以下{len(zero_importance)}个特征重要性为0,可考虑删除:")
|
||||
for feat in zero_importance:
|
||||
print(f" - {feat}")
|
||||
print(f"'{feat}',")
|
||||
else:
|
||||
print("\n所有特征都有一定重要性")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user