refactor: 存储层迁移DuckDB + 模块重构
- 存储层重构: HDF5 → DuckDB(UPSERT模式、线程安全存储) - Sync类迁移: DataSync从sync.py迁移到api_daily.py(职责分离) - 模型模块重构: src/models → src/pipeline(更清晰的命名) - 新增因子模块: factors/momentum (MA、收益率排名)、factors/financial - 新增API接口: api_namechange、api_bak_basic - 新增训练入口: training模块(main.py、pipeline配置) - 工具函数统一: get_today_date等移至utils.py - 文档更新: AGENTS.md添加架构变更历史
This commit is contained in:
21
src/pipeline/processors/__init__.py
Normal file
21
src/pipeline/processors/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""处理器模块"""
|
||||
|
||||
from src.pipeline.processors.processors import (
|
||||
DropNAProcessor,
|
||||
FillNAProcessor,
|
||||
Winsorizer,
|
||||
StandardScaler,
|
||||
MinMaxScaler,
|
||||
RankTransformer,
|
||||
Neutralizer,
|
||||
)
|
||||
|
||||
# Public API of the processors package: the names re-exported from
# src.pipeline.processors.processors by the import above.
__all__ = [
    "DropNAProcessor", "FillNAProcessor",
    "Winsorizer",
    "StandardScaler", "MinMaxScaler",
    "RankTransformer",
    "Neutralizer",
]
|
||||
238
src/pipeline/processors/processors.py
Normal file
238
src/pipeline/processors/processors.py
Normal file
@@ -0,0 +1,238 @@
|
||||
"""内置数据处理器
|
||||
|
||||
提供常用的数据预处理和转换处理器。
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Dict, Any
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
|
||||
from src.pipeline.core import BaseProcessor, PipelineStage
|
||||
from src.pipeline.registry import PluginRegistry
|
||||
|
||||
# Dtypes treated as numeric when a processor auto-selects its columns.
# Despite the name, the list holds the signed integer dtypes as well as the
# floating-point ones.
FLOAT_TYPES = [
    pl.Float32,
    pl.Float64,
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
]
|
||||
|
||||
|
||||
def _get_numeric_columns(
    data: pl.DataFrame, columns: Optional[List[str]] = None
) -> List[str]:
    """Return the column names a processor should operate on.

    Args:
        data: Frame whose column dtypes are inspected when no explicit list
            is supplied.
        columns: Explicit column names. Whenever this is not ``None`` (even
            if it is empty) it is returned unchanged and is not checked
            against ``data``.

    Returns:
        ``columns`` when provided; otherwise the names of the columns of
        ``data`` whose dtype appears in ``FLOAT_TYPES``, in frame order.
    """
    if columns is None:
        return [name for name in data.columns if data[name].dtype in FLOAT_TYPES]
    return columns
|
||||
|
||||
|
||||
@PluginRegistry.register_processor("dropna")
class DropNAProcessor(BaseProcessor):
    """Processor that removes rows containing null values.

    Nothing is learned from the data: ``fit`` only flags the instance as
    fitted. Registered with the plugin registry under the name "dropna".
    """

    stage = PipelineStage.ALL

    def fit(self, data: pl.DataFrame) -> "DropNAProcessor":
        """Mark the processor as fitted and return it; ``data`` is not read."""
        self._is_fitted = True
        return self

    def transform(self, data: pl.DataFrame) -> pl.DataFrame:
        """Return ``data`` without the rows that have a null in a checked column.

        The checked columns are ``self.columns`` when that is truthy, and
        every column of ``data`` otherwise.
        """
        subset = self.columns if self.columns else data.columns
        return data.drop_nulls(subset=subset)
|
||||
|
||||
|
||||
@PluginRegistry.register_processor("fillna")
class FillNAProcessor(BaseProcessor):
    """Null-filling processor whose fill values are computed in ``fit``.

    ``fit`` stores one fill value per target column (median, mean, or the
    constant 0.0) and ``transform`` replaces nulls with the stored value.
    Registered with the plugin registry under the name "fillna".

    NOTE(review): ``stage`` is ``PipelineStage.TRAIN``; presumably the
    pipeline uses it to fit on training data only -- confirm in
    ``src.pipeline.core``.
    """

    stage = PipelineStage.TRAIN

    def __init__(self, columns: Optional[List[str]] = None, method: str = "median"):
        """Create the processor.

        Args:
            columns: Columns to fill. ``None`` means the numeric columns are
                auto-detected from the data passed to ``fit``.
            method: One of "median", "mean" or "zero".

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        super().__init__(columns)
        if method not in ["median", "mean", "zero"]:
            raise ValueError(f"Unknown fill method: {method}")
        self.method = method

    def fit(self, data: pl.DataFrame) -> "FillNAProcessor":
        """Compute and store the fill value of every target column.

        A column with no non-null values produces a ``None`` median/mean; the
        value is still recorded and is skipped later by ``transform``.

        Returns:
            ``self``, to allow call chaining.
        """
        cols = _get_numeric_columns(data, self.columns)
        fill_values = {}

        for col in cols:
            if self.method == "median":
                fill_values[col] = data[col].median()
            elif self.method == "mean":
                fill_values[col] = data[col].mean()
            elif self.method == "zero":
                fill_values[col] = 0.0

        self._fitted_params = {"fill_values": fill_values, "columns": cols}
        self._is_fitted = True
        return self

    def transform(self, data: pl.DataFrame) -> pl.DataFrame:
        """Replace nulls in the fitted columns with their stored fill values.

        Columns that are absent from ``data`` are ignored, and so are columns
        whose stored fill value is ``None``: ``fill_null`` rejects a ``None``
        value when no strategy is given, so such columns are left untouched
        instead of raising.
        """
        fills = [
            pl.col(col).fill_null(val)
            for col, val in self._fitted_params.get("fill_values", {}).items()
            if val is not None and col in data.columns
        ]
        # The expressions touch distinct columns, so one call is equivalent to
        # applying them one by one.
        return data.with_columns(fills) if fills else data
|
||||
|
||||
|
||||
@PluginRegistry.register_processor("winsorizer")
class Winsorizer(BaseProcessor):
    """Winsorizing processor: clips each column to quantile bounds.

    ``fit`` records the ``lower`` and ``upper`` quantiles of every target
    column and ``transform`` clips values to those recorded bounds.
    Registered with the plugin registry under the name "winsorizer".

    NOTE(review): ``stage`` is ``PipelineStage.TRAIN``; presumably the
    pipeline uses it to fit on training data only -- confirm in
    ``src.pipeline.core``.
    """

    stage = PipelineStage.TRAIN

    def __init__(
        self,
        columns: Optional[List[str]] = None,
        lower: float = 0.01,
        upper: float = 0.99,
    ):
        """Create the processor.

        Args:
            columns: Columns to clip. ``None`` means the numeric columns are
                auto-detected from the data passed to ``fit``.
            lower: Quantile used as the lower clipping bound.
            upper: Quantile used as the upper clipping bound.

        Raises:
            ValueError: If the quantiles do not satisfy
                ``0 <= lower <= upper <= 1``.
        """
        super().__init__(columns)
        # Fail fast: an inverted or out-of-range pair would otherwise only
        # surface later, inside fit/transform.
        if not 0.0 <= lower <= upper <= 1.0:
            raise ValueError(
                f"Invalid quantile bounds: lower={lower}, upper={upper} "
                "(expected 0 <= lower <= upper <= 1)"
            )
        self.lower = lower
        self.upper = upper

    def fit(self, data: pl.DataFrame) -> "Winsorizer":
        """Record the lower/upper quantile of every target column.

        Returns:
            ``self``, to allow call chaining.
        """
        cols = _get_numeric_columns(data, self.columns)
        bounds = {
            col: {
                "lower": data[col].quantile(self.lower),
                "upper": data[col].quantile(self.upper),
            }
            for col in cols
        }

        self._fitted_params = {"bounds": bounds, "columns": cols}
        self._is_fitted = True
        return self

    def transform(self, data: pl.DataFrame) -> pl.DataFrame:
        """Clip the fitted columns to the bounds recorded by ``fit``.

        Columns absent from ``data`` are ignored. Columns whose recorded
        bounds are ``None`` (no non-null values at fit time) are left
        unchanged rather than being passed to ``clip`` with missing bounds.
        """
        clips = []
        for col, bounds in self._fitted_params.get("bounds", {}).items():
            if col not in data.columns:
                continue
            low, high = bounds["lower"], bounds["upper"]
            if low is None or high is None:
                continue
            clips.append(pl.col(col).clip(low, high))
        # The expressions touch distinct columns, so one call is equivalent to
        # applying them one by one.
        return data.with_columns(clips) if clips else data
|
||||
|
||||
|
||||
@PluginRegistry.register_processor("standard_scaler")
class StandardScaler(BaseProcessor):
    """Z-score scaler: subtracts the fitted mean and divides by the fitted std.

    Registered with the plugin registry under the name "standard_scaler".
    """

    stage = PipelineStage.ALL

    def fit(self, data: pl.DataFrame) -> "StandardScaler":
        """Store the mean and standard deviation of every target column.

        Returns:
            ``self``, to allow call chaining.
        """
        targets = _get_numeric_columns(data, self.columns)
        moments = {
            name: {"mean": data[name].mean(), "std": data[name].std()}
            for name in targets
        }
        self._fitted_params = {"stats": moments, "columns": targets}
        self._is_fitted = True
        return self

    def transform(self, data: pl.DataFrame) -> pl.DataFrame:
        """Standardize the fitted columns that are present in ``data``.

        A column is left unchanged when its stored std is ``None`` or is not
        strictly positive, which avoids dividing by zero.
        """
        scaled = data
        for name, moments in self._fitted_params.get("stats", {}).items():
            if name not in scaled.columns:
                continue
            sigma = moments["std"]
            if sigma is None or not sigma > 0:
                continue
            centered = pl.col(name) - moments["mean"]
            scaled = scaled.with_columns((centered / sigma).alias(name))
        return scaled
|
||||
|
||||
|
||||
@PluginRegistry.register_processor("minmax_scaler")
class MinMaxScaler(BaseProcessor):
    """Min-max scaler: maps each column onto the fitted [min, max] range.

    Values inside the fitted range end up in [0, 1]. Registered with the
    plugin registry under the name "minmax_scaler".
    """

    stage = PipelineStage.ALL

    def fit(self, data: pl.DataFrame) -> "MinMaxScaler":
        """Store the minimum and maximum of every target column.

        Returns:
            ``self``, to allow call chaining.
        """
        cols = _get_numeric_columns(data, self.columns)
        stats = {col: {"min": data[col].min(), "max": data[col].max()} for col in cols}

        self._fitted_params = {"stats": stats, "columns": cols}
        self._is_fitted = True
        return self

    def transform(self, data: pl.DataFrame) -> pl.DataFrame:
        """Rescale the fitted columns that are present in ``data``.

        A column is left unchanged when its stored min or max is ``None``
        (no non-null values at fit time) or when the range is not strictly
        positive (constant column), which avoids dividing by zero.
        """
        result = data
        for col, stats in self._fitted_params.get("stats", {}).items():
            if col not in result.columns:
                continue
            low, high = stats["min"], stats["max"]
            # Check for None before subtracting: ``None - None`` raises
            # TypeError, so the check cannot be done on the difference.
            if low is None or high is None:
                continue
            range_val = high - low
            if range_val > 0:
                result = result.with_columns(
                    ((pl.col(col) - low) / range_val).alias(col)
                )
        return result
|
||||
|
||||
|
||||
@PluginRegistry.register_processor("rank_transformer")
class RankTransformer(BaseProcessor):
    """Cross-sectional rank transformer.

    Replaces each target column by its rank computed within every
    ``trade_date`` group. Nothing is learned in ``fit``. Registered with the
    plugin registry under the name "rank_transformer".
    """

    stage = PipelineStage.ALL

    def fit(self, data: pl.DataFrame) -> "RankTransformer":
        """Mark the processor as fitted and return it; ``data`` is not read."""
        self._is_fitted = True
        return self

    def transform(self, data: pl.DataFrame) -> pl.DataFrame:
        """Rank the target columns within each ``trade_date`` group.

        Targets are ``self.columns`` when truthy, otherwise the numeric
        columns of ``data``. Columns absent from ``data`` are ignored, and the
        ``trade_date`` key itself is never ranked: when it is stored with a
        numeric dtype it would be auto-selected, overwritten by its own ranks
        and thereby corrupt the grouping of every column processed after it.

        ``data`` must contain a ``trade_date`` column.
        """
        date_key = "trade_date"
        cols = self.columns or _get_numeric_columns(data)

        ranked = [
            pl.col(col).rank().over(date_key)
            for col in cols
            if col != date_key and col in data.columns
        ]
        # All expressions are evaluated against the same, unmodified key.
        return data.with_columns(ranked) if ranked else data
|
||||
|
||||
|
||||
@PluginRegistry.register_processor("neutralizer")
class Neutralizer(BaseProcessor):
    """Group neutralizer: demeans columns within (trade_date, group) cells.

    Each target column has the mean of its ``(trade_date, group_col)`` group
    subtracted from it. With the default ``group_col="industry"`` this is an
    industry demeaning; no market-cap regression is performed here. Nothing
    is learned in ``fit``. Registered with the plugin registry under the name
    "neutralizer".
    """

    stage = PipelineStage.ALL

    def __init__(
        self,
        columns: Optional[List[str]] = None,
        group_col: str = "industry",
        exclude_cols: Optional[List[str]] = None,
    ):
        """Create the processor.

        Args:
            columns: Columns to neutralize. ``None`` means the numeric columns
                of the frame passed to ``transform`` are auto-detected.
            group_col: Column that, together with ``trade_date``, defines the
                groups whose mean is removed.
            exclude_cols: Columns that must never be modified.
        """
        super().__init__(columns)
        self.group_col = group_col
        self.exclude_cols = exclude_cols or []

    def fit(self, data: pl.DataFrame) -> "Neutralizer":
        """Mark the processor as fitted and return it; ``data`` is not read."""
        self._is_fitted = True
        return self

    def transform(self, data: pl.DataFrame) -> pl.DataFrame:
        """Subtract the per-group mean from every target column.

        Targets are ``self.columns`` when truthy, otherwise the numeric
        columns of ``data``. Columns absent from ``data`` or listed in
        ``exclude_cols`` are skipped. The grouping keys (``trade_date`` and
        ``group_col``) are skipped as well: a numerically typed key would be
        auto-selected, demeaned to zero and thereby collapse the groups used
        for every column processed after it.

        ``data`` must contain ``trade_date`` and ``group_col`` columns.
        """
        keys = ["trade_date", self.group_col]
        skipped = set(self.exclude_cols) | set(keys)
        cols = self.columns or _get_numeric_columns(data)

        demeaned = [
            (pl.col(col) - pl.col(col).mean().over(keys)).alias(col)
            for col in cols
            if col in data.columns and col not in skipped
        ]
        # All expressions are evaluated against the same, unmodified keys.
        return data.with_columns(demeaned) if demeaned else data
|
||||
|
||||
|
||||
# Public API of this module: the built-in processor classes defined above.
__all__ = [
    "DropNAProcessor", "FillNAProcessor",
    "Winsorizer",
    "StandardScaler", "MinMaxScaler",
    "RankTransformer",
    "Neutralizer",
]
|
||||
Reference in New Issue
Block a user