From 9e0114c7456a566f774fcf7c0801261ddbc5c6ec Mon Sep 17 00:00:00 2001 From: liaozhaorun <1300336796@qq.com> Date: Sun, 29 Mar 2026 02:37:53 +0800 Subject: [PATCH] =?UTF-8?q?feat(training):=20=E6=94=AF=E6=8C=81=20Label=20?= =?UTF-8?q?=E9=A2=84=E5=A4=84=E7=90=86=E5=99=A8=20-=20DataPipeline=20?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=20label=5Fprocessor=5Fconfigs=20=E5=8F=82?= =?UTF-8?q?=E6=95=B0=20-=20=E5=88=86=E7=A6=BB=E7=89=B9=E5=BE=81=E4=B8=8E?= =?UTF-8?q?=20label=20=E7=9A=84=E9=A2=84=E5=A4=84=E7=90=86=E6=B5=81?= =?UTF-8?q?=E7=A8=8B=20-=20regression.py=20=E6=B7=BB=E5=8A=A0=20label=20?= =?UTF-8?q?=E7=BC=A9=E5=B0=BE=E5=A4=84=E7=90=86=E9=85=8D=E7=BD=AE=20-=20?= =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=AD=A6=E4=B9=A0=E7=8E=87=E5=B9=B6=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E6=8E=92=E9=99=A4=E5=9B=A0=E5=AD=90=E5=88=97=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/experiment/regression.py | 111 +++++++++++++++++---------------- src/training/pipeline.py | 115 ++++++++++++++++++++++++++--------- 2 files changed, 140 insertions(+), 86 deletions(-) diff --git a/src/experiment/regression.py b/src/experiment/regression.py index 3fabbac..0eee2c7 100644 --- a/src/experiment/regression.py +++ b/src/experiment/regression.py @@ -51,61 +51,55 @@ TRAINING_TYPE = "regression" # 排除的因子列表 EXCLUDED_FACTORS = [ - "GTJA_alpha062", - "GTJA_alpha060", - "GTJA_alpha058", - "GTJA_alpha056", - "GTJA_alpha053", - "GTJA_alpha040", - "GTJA_alpha043", - "GTJA_alpha027", - "CP", - "max_ret_20", - "debt_to_equity", - "close_vwap_deviation", - "EP", - "BP", - "EP_rank", - "GTJA_alpha044", - "GTJA_alpha036", - "GTJA_alpha010", - "GTJA_alpha005", - "GTJA_alpha001", - "GTJA_alpha002", - "GTJA_alpha007", - "GTJA_alpha016", - "GTJA_alpha073", - "GTJA_alpha133", - "GTJA_alpha131", - "GTJA_alpha117", - "GTJA_alpha124", - "GTJA_alpha120", - "GTJA_alpha119", - "GTJA_alpha103", - "GTJA_alpha099", - "GTJA_alpha105", - "GTJA_alpha104", - "GTJA_alpha090", - "GTJA_alpha085", - 
"GTJA_alpha083", - "GTJA_alpha084", - "GTJA_alpha087", - "GTJA_alpha092", - "GTJA_alpha074", - "GTJA_alpha089", - "GTJA_alpha173", - "GTJA_alpha157", - "GTJA_alpha139", - "GTJA_alpha162", - "GTJA_alpha163", - "GTJA_alpha177", - "price_to_avg_cost", - "cost_skewness", - "GTJA_alpha191", - "GTJA_alpha180", - "history_position", - "bottom_profit", - "smart_money_accumulation", + 'GTJA_alpha036', + 'GTJA_alpha032', + 'GTJA_alpha010', + 'GTJA_alpha005', + 'CP', + 'BP', + 'debt_to_equity', + 'current_ratio', + 'GTJA_alpha002', + 'GTJA_alpha027', + 'GTJA_alpha064', + 'GTJA_alpha062', + 'GTJA_alpha043', + 'GTJA_alpha044', + 'GTJA_alpha120', + 'GTJA_alpha117', + 'GTJA_alpha103', + 'GTJA_alpha104', + 'GTJA_alpha105', + 'GTJA_alpha073', + 'GTJA_alpha077', + 'GTJA_alpha085', + 'GTJA_alpha090', + 'GTJA_alpha087', + 'GTJA_alpha083', + 'GTJA_alpha092', + 'GTJA_alpha133', + 'GTJA_alpha131', + 'GTJA_alpha126', + 'GTJA_alpha124', + 'GTJA_alpha162', + 'GTJA_alpha164', + 'GTJA_alpha157', + 'GTJA_alpha177', + 'price_to_avg_cost', + 'cost_skewness', + 'GTJA_alpha191', + 'GTJA_alpha180', + 'history_position', + 'bottom_profit', + 'mean_median_dev', + 'smart_money_accumulation', +'GTJA_alpha013', +'GTJA_alpha099', +'GTJA_alpha107', +'GTJA_alpha119', +'GTJA_alpha141', +'GTJA_alpha130', +'GTJA_alpha173', ] # 模型参数配置 @@ -118,7 +112,7 @@ MODEL_PARAMS = { "num_leaves": 31, # 【修改】限制为 31(2的5次方-1),确保树是不对称生长的,防止过拟合 "min_data_in_leaf": 512, # 【大幅增加】从256加到1000。训练集有97万条,极大地限制叶子节点样本量能有效抵抗股市噪音 # ==================== 学习参数 ==================== - "learning_rate": 0.02, # 【修改】稍微调大一点,帮助模型跳出初始的局部最优(避免十几轮就早停) + "learning_rate": 0.01, # [Changed] Lowered from 0.02 to 0.01 (the earlier "raise it slightly" note described the opposite adjustment) "n_estimators": 2000, # ==================== 随机采样与降维 ==================== "subsample": 0.85, @@ -182,6 +176,11 @@ def main(): (StandardScaler, {}), # (CrossSectionalStandardScaler, {}), ], + label_processor_configs=[ + # 对 label 进行缩尾处理(去除极端收益率) + (Winsorizer, {"lower": 0.05, "upper": 0.95}), + # (StandardScaler, {}), + ], 
filters=[STFilter(data_router=engine.router)], stock_pool_filter_func=stock_pool_filter, stock_pool_required_columns=STOCK_FILTER_REQUIRED_COLUMNS, diff --git a/src/training/pipeline.py b/src/training/pipeline.py index 570b15c..c376987 100644 --- a/src/training/pipeline.py +++ b/src/training/pipeline.py @@ -40,6 +40,9 @@ class DataPipeline: filters: Optional[List[Any]] = None, stock_pool_filter_func: Optional[Callable] = None, stock_pool_required_columns: Optional[List[str]] = None, + label_processor_configs: Optional[ + List[Tuple[Type[BaseProcessor], Dict[str, Any]]] + ] = None, ): """初始化数据流水线 @@ -50,6 +53,8 @@ class DataPipeline: filters: 类形式的过滤器列表(如 [STFilter]) stock_pool_filter_func: 函数形式的股票池筛选器 stock_pool_required_columns: 股票池筛选所需的额外列 + label_processor_configs: Label 数据处理器配置列表,格式与 processor_configs 相同 + 例如:[(Winsorizer, {"lower": 0.01, "upper": 0.99})] 用于对 label 进行缩尾处理 """ self.factor_manager = factor_manager self.processor_configs = processor_configs or [] @@ -57,6 +62,8 @@ class DataPipeline: self.stock_pool_filter_func = stock_pool_filter_func self.stock_pool_required_columns = stock_pool_required_columns or [] self.fitted_processors: List[BaseProcessor] = [] + self.label_processor_configs = label_processor_configs or [] + self.fitted_label_processors: List[BaseProcessor] = [] def prepare_data( self, @@ -250,6 +257,7 @@ class DataPipeline: """预处理数据 训练集使用 fit_transform,验证集和测试集使用 transform + 同时支持对 label 进行 processor 处理 Args: split_data: 划分后的数据字典 @@ -259,44 +267,83 @@ class DataPipeline: Returns: 预处理后的数据字典 """ - if not self.processor_configs: - return split_data + label_name = split_data["train"]["y"].name - self.fitted_processors = [] + # 处理特征 + if self.processor_configs: + self.fitted_processors = [] - # 实例化 processors(传入 feature_cols) - processors = [] - for proc_class, proc_kwargs in self.processor_configs: - proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols} - processors.append(proc_class(**proc_kwargs_with_cols)) + # 实例化 processors(传入 
feature_cols) + processors = [] + for proc_class, proc_kwargs in self.processor_configs: + proc_kwargs_with_cols = {**proc_kwargs, "feature_cols": feature_cols} + processors.append(proc_class(**proc_kwargs_with_cols)) - # 训练集:fit_transform - if verbose: - print(f" 训练集预处理(fit_transform)...") + # 训练集:fit_transform + if verbose: + print(f" 训练集特征预处理(fit_transform)...") - train_data = split_data["train"]["raw_data"] - for processor in processors: - train_data = processor.fit_transform(train_data) - self.fitted_processors.append(processor) + train_data = split_data["train"]["raw_data"] + for processor in processors: + train_data = processor.fit_transform(train_data) + self.fitted_processors.append(processor) - # 更新训练集 - split_data["train"]["raw_data"] = train_data - split_data["train"]["X"] = train_data.select(feature_cols) - split_data["train"]["y"] = train_data[split_data["train"]["y"].name] + # 更新训练集 + split_data["train"]["raw_data"] = train_data + split_data["train"]["X"] = train_data.select(feature_cols) + split_data["train"]["y"] = train_data[label_name] - # 验证集和测试集:transform - for split_name in ["val", "test"]: - if split_name in split_data: - if verbose: - print(f" {split_name}集预处理(transform)...") + # 验证集和测试集:transform + for split_name in ["val", "test"]: + if split_name in split_data: + if verbose: + print(f" {split_name}集特征预处理(transform)...") - split_df = split_data[split_name]["raw_data"] - for processor in self.fitted_processors: - split_df = processor.transform(split_df) + split_df = split_data[split_name]["raw_data"] + for processor in self.fitted_processors: + split_df = processor.transform(split_df) - split_data[split_name]["raw_data"] = split_df - split_data[split_name]["X"] = split_df.select(feature_cols) - split_data[split_name]["y"] = split_df[split_data[split_name]["y"].name] + split_data[split_name]["raw_data"] = split_df + split_data[split_name]["X"] = split_df.select(feature_cols) + split_data[split_name]["y"] = split_df[label_name] + + # 处理 label 
+ if self.label_processor_configs: + self.fitted_label_processors = [] + + # 实例化 label processors(传入 label_name 作为 feature_cols) + label_processors = [] + for proc_class, proc_kwargs in self.label_processor_configs: + proc_kwargs_with_label = {**proc_kwargs, "feature_cols": [label_name]} + label_processors.append(proc_class(**proc_kwargs_with_label)) + + # 训练集:fit_transform + if verbose: + print(f" 训练集 Label 预处理(fit_transform)...") + + train_data = split_data["train"]["raw_data"] + for processor in label_processors: + train_data = processor.fit_transform(train_data) + self.fitted_label_processors.append(processor) + + # 更新训练集 + split_data["train"]["raw_data"] = train_data + split_data["train"]["X"] = train_data.select(feature_cols) + split_data["train"]["y"] = train_data[label_name] + + # 验证集和测试集:transform + for split_name in ["val", "test"]: + if split_name in split_data: + if verbose: + print(f" {split_name}集 Label 预处理(transform)...") + + split_df = split_data[split_name]["raw_data"] + for processor in self.fitted_label_processors: + split_df = processor.transform(split_df) + + split_data[split_name]["raw_data"] = split_df + split_data[split_name]["X"] = split_df.select(feature_cols) + split_data[split_name]["y"] = split_df[label_name] return split_data @@ -307,3 +354,11 @@ class DataPipeline: 已拟合的处理器列表(用于模型保存) """ return self.fitted_processors + + def get_fitted_label_processors(self) -> List[BaseProcessor]: + """获取已拟合的 Label 处理器列表 + + Returns: + 已拟合的 Label 处理器列表(用于模型保存和预测时反转换) + """ + return self.fitted_label_processors