feat(training): 支持 Label 预处理器
- DataPipeline 新增 label_processor_configs 参数
- 分离特征与 label 的预处理流程
- regression.py 添加 label 缩尾处理配置
- 将学习率由 0.02 调低至 0.01,并更新排除因子列表
This commit is contained in:
@@ -51,61 +51,55 @@ TRAINING_TYPE = "regression"
|
|||||||
|
|
||||||
# 排除的因子列表
|
# 排除的因子列表
|
||||||
EXCLUDED_FACTORS = [
|
EXCLUDED_FACTORS = [
|
||||||
"GTJA_alpha062",
|
'GTJA_alpha036',
|
||||||
"GTJA_alpha060",
|
'GTJA_alpha032',
|
||||||
"GTJA_alpha058",
|
'GTJA_alpha010',
|
||||||
"GTJA_alpha056",
|
'GTJA_alpha005',
|
||||||
"GTJA_alpha053",
|
'CP',
|
||||||
"GTJA_alpha040",
|
'BP',
|
||||||
"GTJA_alpha043",
|
'debt_to_equity',
|
||||||
"GTJA_alpha027",
|
'current_ratio',
|
||||||
"CP",
|
'GTJA_alpha002',
|
||||||
"max_ret_20",
|
'GTJA_alpha027',
|
||||||
"debt_to_equity",
|
'GTJA_alpha064',
|
||||||
"close_vwap_deviation",
|
'GTJA_alpha062',
|
||||||
"EP",
|
'GTJA_alpha043',
|
||||||
"BP",
|
'GTJA_alpha044',
|
||||||
"EP_rank",
|
'GTJA_alpha120',
|
||||||
"GTJA_alpha044",
|
'GTJA_alpha117',
|
||||||
"GTJA_alpha036",
|
'GTJA_alpha103',
|
||||||
"GTJA_alpha010",
|
'GTJA_alpha104',
|
||||||
"GTJA_alpha005",
|
'GTJA_alpha105',
|
||||||
"GTJA_alpha001",
|
'GTJA_alpha073',
|
||||||
"GTJA_alpha002",
|
'GTJA_alpha077',
|
||||||
"GTJA_alpha007",
|
'GTJA_alpha085',
|
||||||
"GTJA_alpha016",
|
'GTJA_alpha090',
|
||||||
"GTJA_alpha073",
|
'GTJA_alpha087',
|
||||||
"GTJA_alpha133",
|
'GTJA_alpha083',
|
||||||
"GTJA_alpha131",
|
'GTJA_alpha092',
|
||||||
"GTJA_alpha117",
|
'GTJA_alpha133',
|
||||||
"GTJA_alpha124",
|
'GTJA_alpha131',
|
||||||
"GTJA_alpha120",
|
'GTJA_alpha126',
|
||||||
"GTJA_alpha119",
|
'GTJA_alpha124',
|
||||||
"GTJA_alpha103",
|
'GTJA_alpha162',
|
||||||
"GTJA_alpha099",
|
'GTJA_alpha164',
|
||||||
"GTJA_alpha105",
|
'GTJA_alpha157',
|
||||||
"GTJA_alpha104",
|
'GTJA_alpha177',
|
||||||
"GTJA_alpha090",
|
'price_to_avg_cost',
|
||||||
"GTJA_alpha085",
|
'cost_skewness',
|
||||||
"GTJA_alpha083",
|
'GTJA_alpha191',
|
||||||
"GTJA_alpha084",
|
'GTJA_alpha180',
|
||||||
"GTJA_alpha087",
|
'history_position',
|
||||||
"GTJA_alpha092",
|
'bottom_profit',
|
||||||
"GTJA_alpha074",
|
'mean_median_dev',
|
||||||
"GTJA_alpha089",
|
'smart_money_accumulation',
|
||||||
"GTJA_alpha173",
|
'GTJA_alpha013',
|
||||||
"GTJA_alpha157",
|
'GTJA_alpha099',
|
||||||
"GTJA_alpha139",
|
'GTJA_alpha107',
|
||||||
"GTJA_alpha162",
|
'GTJA_alpha119',
|
||||||
"GTJA_alpha163",
|
'GTJA_alpha141',
|
||||||
"GTJA_alpha177",
|
'GTJA_alpha130',
|
||||||
"price_to_avg_cost",
|
'GTJA_alpha173',
|
||||||
"cost_skewness",
|
|
||||||
"GTJA_alpha191",
|
|
||||||
"GTJA_alpha180",
|
|
||||||
"history_position",
|
|
||||||
"bottom_profit",
|
|
||||||
"smart_money_accumulation",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# 模型参数配置
|
# 模型参数配置
|
||||||
@@ -118,7 +112,7 @@ MODEL_PARAMS = {
|
|||||||
"num_leaves": 31, # 【修改】限制为 31(2的5次方-1),确保树是不对称生长的,防止过拟合
|
"num_leaves": 31, # 【修改】限制为 31(2的5次方-1),确保树是不对称生长的,防止过拟合
|
||||||
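"min_data_in_leaf": 512, # [Large increase] Raised from 256 to 512 (an earlier note said 1000, but the configured value is 512). The training set has ~970k rows; a large minimum leaf size helps resist stock-market noise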
|
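"min_data_in_leaf": 512, # [Large increase] Raised from 256 to 512 (an earlier note said 1000, but the configured value is 512). The training set has ~970k rows; a large minimum leaf size helps resist stock-market noise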
|
||||||
# ==================== 学习参数 ====================
|
# ==================== 学习参数 ====================
|
||||||
"learning_rate": 0.02, # 【修改】稍微调大一点,帮助模型跳出初始的局部最优(避免十几轮就早停)
|
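"learning_rate": 0.01, # [Changed] Lowered from 0.02 to 0.01 in this commit; the earlier note about raising it slightly to escape the initial local optimum (avoiding early stopping after a dozen rounds) described the previous 0.02 value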
|
||||||
"n_estimators": 2000,
|
"n_estimators": 2000,
|
||||||
# ==================== 随机采样与降维 ====================
|
# ==================== 随机采样与降维 ====================
|
||||||
"subsample": 0.85,
|
"subsample": 0.85,
|
||||||
@@ -182,6 +176,11 @@ def main():
|
|||||||
(StandardScaler, {}),
|
(StandardScaler, {}),
|
||||||
# (CrossSectionalStandardScaler, {}),
|
# (CrossSectionalStandardScaler, {}),
|
||||||
],
|
],
|
||||||
|
label_processor_configs=[
|
||||||
|
# 对 label 进行缩尾处理(去除极端收益率)
|
||||||
|
(Winsorizer, {"lower": 0.05, "upper": 0.95}),
|
||||||
|
# (StandardScaler, {}),
|
||||||
|
],
|
||||||
filters=[STFilter(data_router=engine.router)],
|
filters=[STFilter(data_router=engine.router)],
|
||||||
stock_pool_filter_func=stock_pool_filter,
|
stock_pool_filter_func=stock_pool_filter,
|
||||||
stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
|
stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS,
|
||||||
|
|||||||
@@ -40,6 +40,9 @@ class DataPipeline:
|
|||||||
filters: Optional[List[Any]] = None,
|
filters: Optional[List[Any]] = None,
|
||||||
stock_pool_filter_func: Optional[Callable] = None,
|
stock_pool_filter_func: Optional[Callable] = None,
|
||||||
stock_pool_required_columns: Optional[List[str]] = None,
|
stock_pool_required_columns: Optional[List[str]] = None,
|
||||||
|
label_processor_configs: Optional[
|
||||||
|
List[Tuple[Type[BaseProcessor], Dict[str, Any]]]
|
||||||
|
] = None,
|
||||||
):
|
):
|
||||||
"""初始化数据流水线
|
"""初始化数据流水线
|
||||||
|
|
||||||
@@ -50,6 +53,8 @@ class DataPipeline:
|
|||||||
filters: 类形式的过滤器列表(如 [STFilter])
|
filters: 类形式的过滤器列表(如 [STFilter])
|
||||||
stock_pool_filter_func: 函数形式的股票池筛选器
|
stock_pool_filter_func: 函数形式的股票池筛选器
|
||||||
stock_pool_required_columns: 股票池筛选所需的额外列
|
stock_pool_required_columns: 股票池筛选所需的额外列
|
||||||
|
label_processor_configs: Label 数据处理器配置列表,格式与 processor_configs 相同
|
||||||
|
例如:[(Winsorizer, {"lower": 0.01, "upper": 0.99})] 用于对 label 进行缩尾处理
|
||||||
"""
|
"""
|
||||||
self.factor_manager = factor_manager
|
self.factor_manager = factor_manager
|
||||||
self.processor_configs = processor_configs or []
|
self.processor_configs = processor_configs or []
|
||||||
@@ -57,6 +62,8 @@ class DataPipeline:
|
|||||||
self.stock_pool_filter_func = stock_pool_filter_func
|
self.stock_pool_filter_func = stock_pool_filter_func
|
||||||
self.stock_pool_required_columns = stock_pool_required_columns or []
|
self.stock_pool_required_columns = stock_pool_required_columns or []
|
||||||
self.fitted_processors: List[BaseProcessor] = []
|
self.fitted_processors: List[BaseProcessor] = []
|
||||||
|
self.label_processor_configs = label_processor_configs or []
|
||||||
|
self.fitted_label_processors: List[BaseProcessor] = []
|
||||||
|
|
||||||
def prepare_data(
|
def prepare_data(
|
||||||
self,
|
self,
|
||||||
@@ -250,6 +257,7 @@ class DataPipeline:
|
|||||||
"""预处理数据
|
"""预处理数据
|
||||||
|
|
||||||
训练集使用 fit_transform,验证集和测试集使用 transform
|
训练集使用 fit_transform,验证集和测试集使用 transform
|
||||||
|
同时支持对 label 进行 processor 处理
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
split_data: 划分后的数据字典
|
split_data: 划分后的数据字典
|
||||||
@@ -259,44 +267,83 @@ class DataPipeline:
|
|||||||
Returns:
|
Returns:
|
||||||
预处理后的数据字典
|
预处理后的数据字典
|
||||||
"""
|
"""
|
||||||
if not self.processor_configs:
|
label_name = split_data["train"]["y"].name
|
||||||
return split_data
|
|
||||||
|
|
||||||
self.fitted_processors = []
|
# 处理特征
|
||||||
|
if self.processor_configs:
|
||||||
|
self.fitted_processors = []
|
||||||
|
|
||||||
# 实例化 processors(传入 feature_cols)
|
# 实例化 processors(传入 feature_cols)
|
||||||
processors = []
|
processors = []
|
||||||
for proc_class, proc_kwargs in self.processor_configs:
|
for proc_class, proc_kwargs in self.processor_configs:
|
||||||
proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols}
|
proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols}
|
||||||
processors.append(proc_class(**proc_kwargs_with_cols))
|
processors.append(proc_class(**proc_kwargs_with_cols))
|
||||||
|
|
||||||
# 训练集:fit_transform
|
# 训练集:fit_transform
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f" 训练集预处理(fit_transform)...")
|
print(f" 训练集特征预处理(fit_transform)...")
|
||||||
|
|
||||||
train_data = split_data["train"]["raw_data"]
|
train_data = split_data["train"]["raw_data"]
|
||||||
for processor in processors:
|
for processor in processors:
|
||||||
train_data = processor.fit_transform(train_data)
|
train_data = processor.fit_transform(train_data)
|
||||||
self.fitted_processors.append(processor)
|
self.fitted_processors.append(processor)
|
||||||
|
|
||||||
# 更新训练集
|
# 更新训练集
|
||||||
split_data["train"]["raw_data"] = train_data
|
split_data["train"]["raw_data"] = train_data
|
||||||
split_data["train"]["X"] = train_data.select(feature_cols)
|
split_data["train"]["X"] = train_data.select(feature_cols)
|
||||||
split_data["train"]["y"] = train_data[split_data["train"]["y"].name]
|
split_data["train"]["y"] = train_data[label_name]
|
||||||
|
|
||||||
# 验证集和测试集:transform
|
# 验证集和测试集:transform
|
||||||
for split_name in ["val", "test"]:
|
for split_name in ["val", "test"]:
|
||||||
if split_name in split_data:
|
if split_name in split_data:
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f" {split_name}集预处理(transform)...")
|
print(f" {split_name}集特征预处理(transform)...")
|
||||||
|
|
||||||
split_df = split_data[split_name]["raw_data"]
|
split_df = split_data[split_name]["raw_data"]
|
||||||
for processor in self.fitted_processors:
|
for processor in self.fitted_processors:
|
||||||
split_df = processor.transform(split_df)
|
split_df = processor.transform(split_df)
|
||||||
|
|
||||||
split_data[split_name]["raw_data"] = split_df
|
split_data[split_name]["raw_data"] = split_df
|
||||||
split_data[split_name]["X"] = split_df.select(feature_cols)
|
split_data[split_name]["X"] = split_df.select(feature_cols)
|
||||||
split_data[split_name]["y"] = split_df[split_data[split_name]["y"].name]
|
split_data[split_name]["y"] = split_df[label_name]
|
||||||
|
|
||||||
|
# 处理 label
|
||||||
|
if self.label_processor_configs:
|
||||||
|
self.fitted_label_processors = []
|
||||||
|
|
||||||
|
# 实例化 label processors(传入 label_name 作为 feature_cols)
|
||||||
|
label_processors = []
|
||||||
|
for proc_class, proc_kwargs in self.label_processor_configs:
|
||||||
|
proc_kwargs_with_label = {**proc_kwargs, "feature_cols": [label_name]}
|
||||||
|
label_processors.append(proc_class(**proc_kwargs_with_label))
|
||||||
|
|
||||||
|
# 训练集:fit_transform
|
||||||
|
if verbose:
|
||||||
|
print(f" 训练集 Label 预处理(fit_transform)...")
|
||||||
|
|
||||||
|
train_data = split_data["train"]["raw_data"]
|
||||||
|
for processor in label_processors:
|
||||||
|
train_data = processor.fit_transform(train_data)
|
||||||
|
self.fitted_label_processors.append(processor)
|
||||||
|
|
||||||
|
# 更新训练集
|
||||||
|
split_data["train"]["raw_data"] = train_data
|
||||||
|
split_data["train"]["X"] = train_data.select(feature_cols)
|
||||||
|
split_data["train"]["y"] = train_data[label_name]
|
||||||
|
|
||||||
|
# 验证集和测试集:transform
|
||||||
|
for split_name in ["val", "test"]:
|
||||||
|
if split_name in split_data:
|
||||||
|
if verbose:
|
||||||
|
print(f" {split_name}集 Label 预处理(transform)...")
|
||||||
|
|
||||||
|
split_df = split_data[split_name]["raw_data"]
|
||||||
|
for processor in self.fitted_label_processors:
|
||||||
|
split_df = processor.transform(split_df)
|
||||||
|
|
||||||
|
split_data[split_name]["raw_data"] = split_df
|
||||||
|
split_data[split_name]["X"] = split_df.select(feature_cols)
|
||||||
|
split_data[split_name]["y"] = split_df[label_name]
|
||||||
|
|
||||||
return split_data
|
return split_data
|
||||||
|
|
||||||
@@ -307,3 +354,11 @@ class DataPipeline:
|
|||||||
已拟合的处理器列表(用于模型保存)
|
已拟合的处理器列表(用于模型保存)
|
||||||
"""
|
"""
|
||||||
return self.fitted_processors
|
return self.fitted_processors
|
||||||
|
|
||||||
|
def get_fitted_label_processors(self) -> List[BaseProcessor]:
|
||||||
|
"""获取已拟合的 Label 处理器列表
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
已拟合的 Label 处理器列表(用于模型保存和预测时反转换)
|
||||||
|
"""
|
||||||
|
return self.fitted_label_processors
|
||||||
|
|||||||
Reference in New Issue
Block a user