feat(training): 支持 Label 预处理器
- DataPipeline 新增 label_processor_configs 参数
- 分离特征与 label 的预处理流程
- regression.py 添加 label 缩尾处理配置
- 调整学习率并更新排除因子列表
This commit is contained in:
@@ -40,6 +40,9 @@ class DataPipeline:
|
||||
filters: Optional[List[Any]] = None,
|
||||
stock_pool_filter_func: Optional[Callable] = None,
|
||||
stock_pool_required_columns: Optional[List[str]] = None,
|
||||
label_processor_configs: Optional[
|
||||
List[Tuple[Type[BaseProcessor], Dict[str, Any]]]
|
||||
] = None,
|
||||
):
|
||||
"""初始化数据流水线
|
||||
|
||||
@@ -50,6 +53,8 @@ class DataPipeline:
|
||||
filters: 类形式的过滤器列表(如 [STFilter])
|
||||
stock_pool_filter_func: 函数形式的股票池筛选器
|
||||
stock_pool_required_columns: 股票池筛选所需的额外列
|
||||
label_processor_configs: Label 数据处理器配置列表,格式与 processor_configs 相同
|
||||
例如:[(Winsorizer, {"lower": 0.01, "upper": 0.99})] 用于对 label 进行缩尾处理
|
||||
"""
|
||||
self.factor_manager = factor_manager
|
||||
self.processor_configs = processor_configs or []
|
||||
@@ -57,6 +62,8 @@ class DataPipeline:
|
||||
self.stock_pool_filter_func = stock_pool_filter_func
|
||||
self.stock_pool_required_columns = stock_pool_required_columns or []
|
||||
self.fitted_processors: List[BaseProcessor] = []
|
||||
self.label_processor_configs = label_processor_configs or []
|
||||
self.fitted_label_processors: List[BaseProcessor] = []
|
||||
|
||||
def prepare_data(
|
||||
self,
|
||||
@@ -250,6 +257,7 @@ class DataPipeline:
|
||||
"""预处理数据
|
||||
|
||||
训练集使用 fit_transform,验证集和测试集使用 transform
|
||||
同时支持对 label 进行 processor 处理
|
||||
|
||||
Args:
|
||||
split_data: 划分后的数据字典
|
||||
@@ -259,44 +267,83 @@ class DataPipeline:
|
||||
Returns:
|
||||
预处理后的数据字典
|
||||
"""
|
||||
if not self.processor_configs:
|
||||
return split_data
|
||||
label_name = split_data["train"]["y"].name
|
||||
|
||||
self.fitted_processors = []
|
||||
# 处理特征
|
||||
if self.processor_configs:
|
||||
self.fitted_processors = []
|
||||
|
||||
# 实例化 processors(传入 feature_cols)
|
||||
processors = []
|
||||
for proc_class, proc_kwargs in self.processor_configs:
|
||||
proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols}
|
||||
processors.append(proc_class(**proc_kwargs_with_cols))
|
||||
# 实例化 processors(传入 feature_cols)
|
||||
processors = []
|
||||
for proc_class, proc_kwargs in self.processor_configs:
|
||||
proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols}
|
||||
processors.append(proc_class(**proc_kwargs_with_cols))
|
||||
|
||||
# 训练集:fit_transform
|
||||
if verbose:
|
||||
print(f" 训练集预处理(fit_transform)...")
|
||||
# 训练集:fit_transform
|
||||
if verbose:
|
||||
print(f" 训练集特征预处理(fit_transform)...")
|
||||
|
||||
train_data = split_data["train"]["raw_data"]
|
||||
for processor in processors:
|
||||
train_data = processor.fit_transform(train_data)
|
||||
self.fitted_processors.append(processor)
|
||||
train_data = split_data["train"]["raw_data"]
|
||||
for processor in processors:
|
||||
train_data = processor.fit_transform(train_data)
|
||||
self.fitted_processors.append(processor)
|
||||
|
||||
# 更新训练集
|
||||
split_data["train"]["raw_data"] = train_data
|
||||
split_data["train"]["X"] = train_data.select(feature_cols)
|
||||
split_data["train"]["y"] = train_data[split_data["train"]["y"].name]
|
||||
# 更新训练集
|
||||
split_data["train"]["raw_data"] = train_data
|
||||
split_data["train"]["X"] = train_data.select(feature_cols)
|
||||
split_data["train"]["y"] = train_data[label_name]
|
||||
|
||||
# 验证集和测试集:transform
|
||||
for split_name in ["val", "test"]:
|
||||
if split_name in split_data:
|
||||
if verbose:
|
||||
print(f" {split_name}集预处理(transform)...")
|
||||
# 验证集和测试集:transform
|
||||
for split_name in ["val", "test"]:
|
||||
if split_name in split_data:
|
||||
if verbose:
|
||||
print(f" {split_name}集特征预处理(transform)...")
|
||||
|
||||
split_df = split_data[split_name]["raw_data"]
|
||||
for processor in self.fitted_processors:
|
||||
split_df = processor.transform(split_df)
|
||||
split_df = split_data[split_name]["raw_data"]
|
||||
for processor in self.fitted_processors:
|
||||
split_df = processor.transform(split_df)
|
||||
|
||||
split_data[split_name]["raw_data"] = split_df
|
||||
split_data[split_name]["X"] = split_df.select(feature_cols)
|
||||
split_data[split_name]["y"] = split_df[split_data[split_name]["y"].name]
|
||||
split_data[split_name]["raw_data"] = split_df
|
||||
split_data[split_name]["X"] = split_df.select(feature_cols)
|
||||
split_data[split_name]["y"] = split_df[label_name]
|
||||
|
||||
# 处理 label
|
||||
if self.label_processor_configs:
|
||||
self.fitted_label_processors = []
|
||||
|
||||
# 实例化 label processors(传入 label_name 作为 feature_cols)
|
||||
label_processors = []
|
||||
for proc_class, proc_kwargs in self.label_processor_configs:
|
||||
proc_kwargs_with_label = {**proc_kwargs, "feature_cols": [label_name]}
|
||||
label_processors.append(proc_class(**proc_kwargs_with_label))
|
||||
|
||||
# 训练集:fit_transform
|
||||
if verbose:
|
||||
print(f" 训练集 Label 预处理(fit_transform)...")
|
||||
|
||||
train_data = split_data["train"]["raw_data"]
|
||||
for processor in label_processors:
|
||||
train_data = processor.fit_transform(train_data)
|
||||
self.fitted_label_processors.append(processor)
|
||||
|
||||
# 更新训练集
|
||||
split_data["train"]["raw_data"] = train_data
|
||||
split_data["train"]["X"] = train_data.select(feature_cols)
|
||||
split_data["train"]["y"] = train_data[label_name]
|
||||
|
||||
# 验证集和测试集:transform
|
||||
for split_name in ["val", "test"]:
|
||||
if split_name in split_data:
|
||||
if verbose:
|
||||
print(f" {split_name}集 Label 预处理(transform)...")
|
||||
|
||||
split_df = split_data[split_name]["raw_data"]
|
||||
for processor in self.fitted_label_processors:
|
||||
split_df = processor.transform(split_df)
|
||||
|
||||
split_data[split_name]["raw_data"] = split_df
|
||||
split_data[split_name]["X"] = split_df.select(feature_cols)
|
||||
split_data[split_name]["y"] = split_df[label_name]
|
||||
|
||||
return split_data
|
||||
|
||||
@@ -307,3 +354,11 @@ class DataPipeline:
|
||||
已拟合的处理器列表(用于模型保存)
|
||||
"""
|
||||
return self.fitted_processors
|
||||
|
||||
def get_fitted_label_processors(self) -> List[BaseProcessor]:
    """Return the fitted label processors held by this pipeline.

    Returns:
        The list stored in ``self.fitted_label_processors``. The list
        object itself is returned, not a copy. It is intended for model
        saving and for inverse-transforming predictions.
    """
    return self.fitted_label_processors
|
||||
|
||||
Reference in New Issue
Block a user