feat(training): 支持 Label 预处理器

- DataPipeline 新增 label_processor_configs 参数
- 分离特征与 label 的预处理流程
- regression.py 添加 label 缩尾处理配置
- 调整学习率(0.02 → 0.01)并更新排除因子列表
This commit is contained in:
2026-03-29 02:37:53 +08:00
parent c3d1b157e9
commit 9e0114c745
2 changed files with 140 additions and 86 deletions

View File

@@ -51,61 +51,55 @@ TRAINING_TYPE = "regression"
# 排除的因子列表 # 排除的因子列表
EXCLUDED_FACTORS = [ EXCLUDED_FACTORS = [
"GTJA_alpha062", 'GTJA_alpha036',
"GTJA_alpha060", 'GTJA_alpha032',
"GTJA_alpha058", 'GTJA_alpha010',
"GTJA_alpha056", 'GTJA_alpha005',
"GTJA_alpha053", 'CP',
"GTJA_alpha040", 'BP',
"GTJA_alpha043", 'debt_to_equity',
"GTJA_alpha027", 'current_ratio',
"CP", 'GTJA_alpha002',
"max_ret_20", 'GTJA_alpha027',
"debt_to_equity", 'GTJA_alpha064',
"close_vwap_deviation", 'GTJA_alpha062',
"EP", 'GTJA_alpha043',
"BP", 'GTJA_alpha044',
"EP_rank", 'GTJA_alpha120',
"GTJA_alpha044", 'GTJA_alpha117',
"GTJA_alpha036", 'GTJA_alpha103',
"GTJA_alpha010", 'GTJA_alpha104',
"GTJA_alpha005", 'GTJA_alpha105',
"GTJA_alpha001", 'GTJA_alpha073',
"GTJA_alpha002", 'GTJA_alpha077',
"GTJA_alpha007", 'GTJA_alpha085',
"GTJA_alpha016", 'GTJA_alpha090',
"GTJA_alpha073", 'GTJA_alpha087',
"GTJA_alpha133", 'GTJA_alpha083',
"GTJA_alpha131", 'GTJA_alpha092',
"GTJA_alpha117", 'GTJA_alpha133',
"GTJA_alpha124", 'GTJA_alpha131',
"GTJA_alpha120", 'GTJA_alpha126',
"GTJA_alpha119", 'GTJA_alpha124',
"GTJA_alpha103", 'GTJA_alpha162',
"GTJA_alpha099", 'GTJA_alpha164',
"GTJA_alpha105", 'GTJA_alpha157',
"GTJA_alpha104", 'GTJA_alpha177',
"GTJA_alpha090", 'price_to_avg_cost',
"GTJA_alpha085", 'cost_skewness',
"GTJA_alpha083", 'GTJA_alpha191',
"GTJA_alpha084", 'GTJA_alpha180',
"GTJA_alpha087", 'history_position',
"GTJA_alpha092", 'bottom_profit',
"GTJA_alpha074", 'mean_median_dev',
"GTJA_alpha089", 'smart_money_accumulation',
"GTJA_alpha173", 'GTJA_alpha013',
"GTJA_alpha157", 'GTJA_alpha099',
"GTJA_alpha139", 'GTJA_alpha107',
"GTJA_alpha162", 'GTJA_alpha119',
"GTJA_alpha163", 'GTJA_alpha141',
"GTJA_alpha177", 'GTJA_alpha130',
"price_to_avg_cost", 'GTJA_alpha173',
"cost_skewness",
"GTJA_alpha191",
"GTJA_alpha180",
"history_position",
"bottom_profit",
"smart_money_accumulation",
] ]
# 模型参数配置 # 模型参数配置
@@ -118,7 +112,7 @@ MODEL_PARAMS = {
"num_leaves": 31, # 【修改】限制为 31(2 的 5 次方 - 1),确保树是不对称生长的,防止过拟合 "num_leaves": 31, # 【修改】限制为 31(2 的 5 次方 - 1),确保树是不对称生长的,防止过拟合
"min_data_in_leaf": 512, # 【大幅增加】从 256 加到 512。训练集有 97 万条,极大地限制叶子节点样本量,能有效抵抗股市噪音 "min_data_in_leaf": 512, # 【大幅增加】从 256 加到 512。训练集有 97 万条,极大地限制叶子节点样本量,能有效抵抗股市噪音
# ==================== 学习参数 ==================== # ==================== 学习参数 ====================
"learning_rate": 0.02, # 【修改】稍微调大一点,帮助模型跳出初始的局部最优(避免十几轮就早停) "learning_rate": 0.01, # 【修改】由 0.02 下调至 0.01;注:原注释「稍微调大一点,帮助模型跳出初始的局部最优(避免十几轮就早停)」描述的是此前的上调,与本次下调不符,理由待更新
"n_estimators": 2000, "n_estimators": 2000,
# ==================== 随机采样与降维 ==================== # ==================== 随机采样与降维 ====================
"subsample": 0.85, "subsample": 0.85,
@@ -182,6 +176,11 @@ def main():
(StandardScaler, {}), (StandardScaler, {}),
# (CrossSectionalStandardScaler, {}), # (CrossSectionalStandardScaler, {}),
], ],
label_processor_configs=[
# 对 label 进行缩尾处理(去除极端收益率)
(Winsorizer, {"lower": 0.05, "upper": 0.95}),
# (StandardScaler, {}),
],
filters=[STFilter(data_router=engine.router)], filters=[STFilter(data_router=engine.router)],
stock_pool_filter_func=stock_pool_filter, stock_pool_filter_func=stock_pool_filter,
stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS, stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,

View File

@@ -40,6 +40,9 @@ class DataPipeline:
filters: Optional[List[Any]] = None, filters: Optional[List[Any]] = None,
stock_pool_filter_func: Optional[Callable] = None, stock_pool_filter_func: Optional[Callable] = None,
stock_pool_required_columns: Optional[List[str]] = None, stock_pool_required_columns: Optional[List[str]] = None,
label_processor_configs: Optional[
List[Tuple[Type[BaseProcessor], Dict[str, Any]]]
] = None,
): ):
"""初始化数据流水线 """初始化数据流水线
@@ -50,6 +53,8 @@ class DataPipeline:
filters: 类形式的过滤器列表(如 [STFilter]) filters: 类形式的过滤器列表(如 [STFilter])
stock_pool_filter_func: 函数形式的股票池筛选器 stock_pool_filter_func: 函数形式的股票池筛选器
stock_pool_required_columns: 股票池筛选所需的额外列 stock_pool_required_columns: 股票池筛选所需的额外列
label_processor_configs: Label 数据处理器配置列表,格式与 processor_configs 相同
例如:[(Winsorizer, {"lower": 0.01, "upper": 0.99})] 用于对 label 进行缩尾处理
""" """
self.factor_manager = factor_manager self.factor_manager = factor_manager
self.processor_configs = processor_configs or [] self.processor_configs = processor_configs or []
@@ -57,6 +62,8 @@ class DataPipeline:
self.stock_pool_filter_func = stock_pool_filter_func self.stock_pool_filter_func = stock_pool_filter_func
self.stock_pool_required_columns = stock_pool_required_columns or [] self.stock_pool_required_columns = stock_pool_required_columns or []
self.fitted_processors: List[BaseProcessor] = [] self.fitted_processors: List[BaseProcessor] = []
self.label_processor_configs = label_processor_configs or []
self.fitted_label_processors: List[BaseProcessor] = []
def prepare_data( def prepare_data(
self, self,
@@ -250,6 +257,7 @@ class DataPipeline:
"""预处理数据 """预处理数据
训练集使用 fit_transform,验证集和测试集使用 transform 训练集使用 fit_transform,验证集和测试集使用 transform
同时支持对 label 进行 processor 处理
Args: Args:
split_data: 划分后的数据字典 split_data: 划分后的数据字典
@@ -259,44 +267,83 @@ class DataPipeline:
Returns: Returns:
预处理后的数据字典 预处理后的数据字典
""" """
if not self.processor_configs: label_name = split_data["train"]["y"].name
return split_data
self.fitted_processors = [] # 处理特征
if self.processor_configs:
self.fitted_processors = []
# 实例化 processors,传入 feature_cols # 实例化 processors,传入 feature_cols
processors = [] processors = []
for proc_class, proc_kwargs in self.processor_configs: for proc_class, proc_kwargs in self.processor_configs:
proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols} proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols}
processors.append(proc_class(**proc_kwargs_with_cols)) processors.append(proc_class(**proc_kwargs_with_cols))
# 训练集fit_transform # 训练集fit_transform
if verbose: if verbose:
print(f" 训练集预处理fit_transform...") print(f" 训练集特征预处理fit_transform...")
train_data = split_data["train"]["raw_data"] train_data = split_data["train"]["raw_data"]
for processor in processors: for processor in processors:
train_data = processor.fit_transform(train_data) train_data = processor.fit_transform(train_data)
self.fitted_processors.append(processor) self.fitted_processors.append(processor)
# 更新训练集 # 更新训练集
split_data["train"]["raw_data"] = train_data split_data["train"]["raw_data"] = train_data
split_data["train"]["X"] = train_data.select(feature_cols) split_data["train"]["X"] = train_data.select(feature_cols)
split_data["train"]["y"] = train_data[split_data["train"]["y"].name] split_data["train"]["y"] = train_data[label_name]
# 验证集和测试集transform # 验证集和测试集transform
for split_name in ["val", "test"]: for split_name in ["val", "test"]:
if split_name in split_data: if split_name in split_data:
if verbose: if verbose:
print(f" {split_name}集预处理transform...") print(f" {split_name}特征预处理transform...")
split_df = split_data[split_name]["raw_data"] split_df = split_data[split_name]["raw_data"]
for processor in self.fitted_processors: for processor in self.fitted_processors:
split_df = processor.transform(split_df) split_df = processor.transform(split_df)
split_data[split_name]["raw_data"] = split_df split_data[split_name]["raw_data"] = split_df
split_data[split_name]["X"] = split_df.select(feature_cols) split_data[split_name]["X"] = split_df.select(feature_cols)
split_data[split_name]["y"] = split_df[split_data[split_name]["y"].name] split_data[split_name]["y"] = split_df[label_name]
# 处理 label
if self.label_processor_configs:
self.fitted_label_processors = []
# 实例化 label processors,传入 [label_name] 作为 feature_cols
label_processors = []
for proc_class, proc_kwargs in self.label_processor_configs:
proc_kwargs_with_label = {**proc_kwargs, "feature_cols": [label_name]}
label_processors.append(proc_class(**proc_kwargs_with_label))
# 训练集fit_transform
if verbose:
print(f" 训练集 Label 预处理fit_transform...")
train_data = split_data["train"]["raw_data"]
for processor in label_processors:
train_data = processor.fit_transform(train_data)
self.fitted_label_processors.append(processor)
# 更新训练集
split_data["train"]["raw_data"] = train_data
split_data["train"]["X"] = train_data.select(feature_cols)
split_data["train"]["y"] = train_data[label_name]
# 验证集和测试集transform
for split_name in ["val", "test"]:
if split_name in split_data:
if verbose:
print(f" {split_name}集 Label 预处理transform...")
split_df = split_data[split_name]["raw_data"]
for processor in self.fitted_label_processors:
split_df = processor.transform(split_df)
split_data[split_name]["raw_data"] = split_df
split_data[split_name]["X"] = split_df.select(feature_cols)
split_data[split_name]["y"] = split_df[label_name]
return split_data return split_data
@@ -307,3 +354,11 @@ class DataPipeline:
已拟合的处理器列表(用于模型保存) 已拟合的处理器列表(用于模型保存)
""" """
return self.fitted_processors return self.fitted_processors
def get_fitted_label_processors(self) -> List[BaseProcessor]:
"""获取已拟合的 Label 处理器列表
Returns:
已拟合的 Label 处理器列表(用于模型保存;预测时仅可逆的处理器才能做反转换,Winsorizer 等缩尾处理属于截断操作,不可逆)
"""
return self.fitted_label_processors