feat(training): 添加缺失值填充处理器 NullFiller

新增 NullFiller 处理器，支持 zero/mean/median/value 填充策略，支持全局统计量或按日期截面填充。在回归训练流程中添加 NullFiller。
2026-03-05 21:57:34 +08:00
parent aefe6d06cf
commit 7b935b0fa3
4 changed files with 206 additions and 3 deletions
--- a/src/experiment/regression.py
+++ b/src/experiment/regression.py
@@ -19,7 +19,7 @@ from src.training import (
    StockFilterConfig,
    StockPoolManager,
    Trainer,
-    Winsorizer,
+    Winsorizer, NullFiller,
 )
 from src.training.config import TrainingConfig

@@ -224,6 +224,7 @@ def train_regression_model():

    # 6. 创建数据处理器（从 PROCESSOR_CONFIGS 解析）
    processors = [
+        NullFiller(strategy="mean"),
        Winsorizer(**PROCESSOR_CONFIGS[0]["params"]),  # type: ignore[arg-type]
        StandardScaler(exclude_cols=["ts_code", "trade_date", target_col]),  # type: ignore[call-arg]
    ]
--- a/src/training/__init__.py
+++ b/src/training/__init__.py
@@ -26,6 +26,7 @@ from src.training.components.selectors import (
 # 数据处理器
 from src.training.components.processors import (
    CrossSectionalStandardScaler,
+    NullFiller,
    StandardScaler,
    Winsorizer,
 )
@@ -57,6 +58,7 @@ __all__ = [
    "StockFilterConfig",
    "MarketCapSelectorConfig",
    # 数据处理器
+    "NullFiller",
    "StandardScaler",
    "CrossSectionalStandardScaler",
    "Winsorizer",
--- a/src/training/components/processors/__init__.py
+++ b/src/training/components/processors/__init__.py
@@ -5,11 +5,13 @@

 from src.training.components.processors.transforms import (
    CrossSectionalStandardScaler,
+    NullFiller,
    StandardScaler,
    Winsorizer,
 )

 __all__ = [
+    "NullFiller",
    "StandardScaler",
    "CrossSectionalStandardScaler",
    "Winsorizer",
--- a/src/training/components/processors/transforms.py
+++ b/src/training/components/processors/transforms.py
@@ -1,9 +1,9 @@
 """数据处理器实现

-包含标准化、缩尾等数据处理器。
+包含标准化、缩尾、缺失值填充等数据处理器。
 """

-from typing import List, Optional
+from typing import List, Literal, Optional, Union

 import polars as pl

@@ -11,6 +11,204 @@ from src.training.components.base import BaseProcessor
 from src.training.registry import register_processor


+@register_processor("null_filler")
+class NullFiller(BaseProcessor):
+    """缺失值填充处理器
+
+    支持多种填充策略：固定值、0、均值、中值。
+    可以全局填充或使用当天截面统计量填充。
+
+    填充策略：
+        - "zero": 填充0
+        - "mean": 填充均值（全局或当天截面）
+        - "median": 填充中值（全局或当天截面）
+        - "value": 填充指定数值
+
+    Attributes:
+        strategy: 填充策略，可选 "zero", "mean", "median", "value"
+        fill_value: 当 strategy="value" 时使用的填充值
+        by_date: 是否按日期独立计算统计量（仅对 mean/median 有效）
+        date_col: 日期列名
+        exclude_cols: 不参与填充的列名列表
+        stats_: 存储学习到的统计量（全局模式）
+    """
+
+    name = "null_filler"
+
+    def __init__(
+        self,
+        strategy: Literal["zero", "mean", "median", "value"] = "zero",
+        fill_value: Optional[float] = None,
+        by_date: bool = True,
+        date_col: str = "trade_date",
+        exclude_cols: Optional[List[str]] = None,
+    ):
+        """初始化缺失值填充处理器
+
+        Args:
+            strategy: 填充策略，默认 "zero"
+                - "zero": 填充0
+                - "mean": 填充均值
+                - "median": 填充中值
+                - "value": 填充指定数值（需配合 fill_value）
+            fill_value: 当 strategy="value" 时的填充值，默认为 None
+            by_date: 是否每天独立计算统计量，默认 True（按日期截面统计量）；设为 False 时使用 fit 学到的全局统计量
+            date_col: 日期列名，默认 "trade_date"
+            exclude_cols: 不参与填充的列名列表，默认为 ["ts_code", "trade_date"]
+
+        Raises:
+            ValueError: 策略无效或 fill_value 未提供时
+        """
+        if strategy not in ("zero", "mean", "median", "value"):
+            raise ValueError(
+                f"无效的填充策略: {strategy}，必须是 'zero', 'mean', 'median', 'value' 之一"
+            )
+
+        if strategy == "value" and fill_value is None:
+            raise ValueError("当 strategy='value' 时，必须提供 fill_value")
+
+        self.strategy = strategy
+        self.fill_value = fill_value
+        self.by_date = by_date
+        self.date_col = date_col
+        self.exclude_cols = exclude_cols or ["ts_code", "trade_date"]
+        self.stats_: dict = {}
+
+    def fit(self, X: pl.DataFrame) -> "NullFiller":
+        """学习统计量（仅在全局模式下）
+
+        在全局模式下，计算每列的均值或中值作为填充值。
+        在截面模式下（by_date=True），不需要 fit，每天独立计算。
+
+        Args:
+            X: 训练数据
+
+        Returns:
+            self
+        """
+        if not self.by_date and self.strategy in ("mean", "median"):
+            numeric_cols = [
+                c
+                for c in X.columns
+                if c not in self.exclude_cols and X[c].dtype.is_numeric()
+            ]
+
+            for col in numeric_cols:
+                if self.strategy == "mean":
+                    self.stats_[col] = X[col].mean()
+                else:  # median
+                    self.stats_[col] = X[col].median()
+
+        return self
+
+    def transform(self, X: pl.DataFrame) -> pl.DataFrame:
+        """填充缺失值
+
+        Args:
+            X: 待转换数据
+
+        Returns:
+            填充后的数据
+        """
+        if self.strategy == "zero":
+            return self._fill_with_zero(X)
+        elif self.strategy == "value":
+            return self._fill_with_value(X)
+        elif self.strategy in ("mean", "median"):
+            if self.by_date:
+                return self._fill_by_date(X)
+            else:
+                return self._fill_global(X)
+        else:
+            # 不应该到达这里，因为 __init__ 已经验证
+            raise ValueError(f"未知的填充策略: {self.strategy}")
+
+    def _fill_with_zero(self, X: pl.DataFrame) -> pl.DataFrame:
+        """使用0填充缺失值"""
+        numeric_cols = [
+            c
+            for c in X.columns
+            if c not in self.exclude_cols and X[c].dtype.is_numeric()
+        ]
+
+        expressions = []
+        for col in X.columns:
+            if col in numeric_cols:
+                expr = pl.col(col).fill_null(0).alias(col)
+                expressions.append(expr)
+            else:
+                expressions.append(pl.col(col))
+
+        return X.select(expressions)
+
+    def _fill_with_value(self, X: pl.DataFrame) -> pl.DataFrame:
+        """使用指定值填充缺失值"""
+        numeric_cols = [
+            c
+            for c in X.columns
+            if c not in self.exclude_cols and X[c].dtype.is_numeric()
+        ]
+
+        expressions = []
+        for col in X.columns:
+            if col in numeric_cols:
+                expr = pl.col(col).fill_null(self.fill_value).alias(col)
+                expressions.append(expr)
+            else:
+                expressions.append(pl.col(col))
+
+        return X.select(expressions)
+
+    def _fill_global(self, X: pl.DataFrame) -> pl.DataFrame:
+        """使用全局统计量填充（训练集学到的统计量）"""
+        expressions = []
+        for col in X.columns:
+            if col in self.stats_:
+                fill_val = self.stats_[col]
+                expr = pl.col(col).fill_null(fill_val).alias(col)
+                expressions.append(expr)
+            else:
+                expressions.append(pl.col(col))
+
+        return X.select(expressions)
+
+    def _fill_by_date(self, X: pl.DataFrame) -> pl.DataFrame:
+        """使用每天截面统计量填充"""
+        numeric_cols = [
+            c
+            for c in X.columns
+            if c not in self.exclude_cols and X[c].dtype.is_numeric()
+        ]
+
+        # 计算每天的统计量
+        stat_exprs = []
+        for col in numeric_cols:
+            if self.strategy == "mean":
+                stat_exprs.append(
+                    pl.col(col).mean().over(self.date_col).alias(f"{col}_stat")
+                )
+            else:  # median
+                stat_exprs.append(
+                    pl.col(col).median().over(self.date_col).alias(f"{col}_stat")
+                )
+
+        # 添加统计量列
+        result = X.with_columns(stat_exprs)
+
+        # 使用统计量填充缺失值
+        fill_exprs = []
+        for col in X.columns:
+            if col in numeric_cols:
+                expr = pl.col(col).fill_null(pl.col(f"{col}_stat")).alias(col)
+                fill_exprs.append(expr)
+            else:
+                fill_exprs.append(pl.col(col))
+
+        result = result.select(fill_exprs)
+
+        return result
+
+
@register_processor("standard_scaler")
 class StandardScaler(BaseProcessor):
    """标准化处理器（全局标准化）