refactor: 存储层迁移DuckDB + 模块重构

- 存储层重构: HDF5 → DuckDB(UPSERT模式、线程安全存储)
- Sync类迁移: DataSync从sync.py迁移到api_daily.py(职责分离)
- 模型模块重构: src/models → src/pipeline(更清晰的命名)
- 新增因子模块: factors/momentum (MA、收益率排名)、factors/financial
- 新增API接口: api_namechange、api_bak_basic
- 新增训练入口: training模块(main.py、pipeline配置)
- 工具函数统一: get_today_date等移至utils.py
- 文档更新: AGENTS.md添加架构变更历史
This commit is contained in:
2026-02-23 16:23:53 +08:00
parent 9f95be56a0
commit 593ec99466
32 changed files with 4181 additions and 1395 deletions

View File

@@ -0,0 +1,21 @@
"""处理器模块"""
from src.pipeline.processors.processors import (
DropNAProcessor,
FillNAProcessor,
Winsorizer,
StandardScaler,
MinMaxScaler,
RankTransformer,
Neutralizer,
)
__all__ = [
"DropNAProcessor",
"FillNAProcessor",
"Winsorizer",
"StandardScaler",
"MinMaxScaler",
"RankTransformer",
"Neutralizer",
]

View File

@@ -0,0 +1,238 @@
"""内置数据处理器
提供常用的数据预处理和转换处理器。
"""
from typing import List, Optional, Dict, Any
import polars as pl
import numpy as np
from src.pipeline.core import BaseProcessor, PipelineStage
from src.pipeline.registry import PluginRegistry
# Dtypes considered "numeric" when auto-selecting columns.
# NOTE(review): despite the name, this list also contains polars
# integer dtypes, not only floats.
FLOAT_TYPES = [pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64]
def _get_numeric_columns(
    data: pl.DataFrame, columns: Optional[List[str]] = None
) -> List[str]:
    """Return *columns* unchanged if given, else all numeric columns of *data*.

    A column counts as numeric when its dtype appears in ``FLOAT_TYPES``.
    """
    if columns is None:
        return [name for name in data.columns if data[name].dtype in FLOAT_TYPES]
    return columns
@PluginRegistry.register_processor("dropna")
class DropNAProcessor(BaseProcessor):
"""缺失值删除处理器"""
stage = PipelineStage.ALL
def fit(self, data: pl.DataFrame) -> "DropNAProcessor":
self._is_fitted = True
return self
def transform(self, data: pl.DataFrame) -> pl.DataFrame:
cols = self.columns or data.columns
return data.drop_nulls(subset=cols)
@PluginRegistry.register_processor("fillna")
class FillNAProcessor(BaseProcessor):
"""缺失值填充处理器(只在训练阶段计算填充值)"""
stage = PipelineStage.TRAIN
def __init__(self, columns: Optional[List[str]] = None, method: str = "median"):
super().__init__(columns)
if method not in ["median", "mean", "zero"]:
raise ValueError(f"Unknown fill method: {method}")
self.method = method
def fit(self, data: pl.DataFrame) -> "FillNAProcessor":
cols = _get_numeric_columns(data, self.columns)
fill_values = {}
for col in cols:
if self.method == "median":
fill_values[col] = data[col].median()
elif self.method == "mean":
fill_values[col] = data[col].mean()
elif self.method == "zero":
fill_values[col] = 0.0
self._fitted_params = {"fill_values": fill_values, "columns": cols}
self._is_fitted = True
return self
def transform(self, data: pl.DataFrame) -> pl.DataFrame:
result = data
for col, val in self._fitted_params.get("fill_values", {}).items():
if col in result.columns:
result = result.with_columns(pl.col(col).fill_null(val).alias(col))
return result
@PluginRegistry.register_processor("winsorizer")
class Winsorizer(BaseProcessor):
"""缩尾处理器 - 防止极端值影响(只在训练阶段计算分位数)"""
stage = PipelineStage.TRAIN
def __init__(
self,
columns: Optional[List[str]] = None,
lower: float = 0.01,
upper: float = 0.99,
):
super().__init__(columns)
self.lower = lower
self.upper = upper
def fit(self, data: pl.DataFrame) -> "Winsorizer":
cols = _get_numeric_columns(data, self.columns)
bounds = {}
for col in cols:
bounds[col] = {
"lower": data[col].quantile(self.lower),
"upper": data[col].quantile(self.upper),
}
self._fitted_params = {"bounds": bounds, "columns": cols}
self._is_fitted = True
return self
def transform(self, data: pl.DataFrame) -> pl.DataFrame:
result = data
for col, bounds in self._fitted_params.get("bounds", {}).items():
if col in result.columns:
result = result.with_columns(
pl.col(col).clip(bounds["lower"], bounds["upper"]).alias(col)
)
return result
@PluginRegistry.register_processor("standard_scaler")
class StandardScaler(BaseProcessor):
"""标准化处理器 - Z-score标准化"""
stage = PipelineStage.ALL
def fit(self, data: pl.DataFrame) -> "StandardScaler":
cols = _get_numeric_columns(data, self.columns)
stats = {}
for col in cols:
stats[col] = {"mean": data[col].mean(), "std": data[col].std()}
self._fitted_params = {"stats": stats, "columns": cols}
self._is_fitted = True
return self
def transform(self, data: pl.DataFrame) -> pl.DataFrame:
result = data
for col, stats in self._fitted_params.get("stats", {}).items():
if col in result.columns and stats["std"] is not None and stats["std"] > 0:
result = result.with_columns(
((pl.col(col) - stats["mean"]) / stats["std"]).alias(col)
)
return result
@PluginRegistry.register_processor("minmax_scaler")
class MinMaxScaler(BaseProcessor):
"""归一化处理器 - 缩放到[0, 1]范围"""
stage = PipelineStage.ALL
def fit(self, data: pl.DataFrame) -> "MinMaxScaler":
cols = _get_numeric_columns(data, self.columns)
stats = {}
for col in cols:
stats[col] = {"min": data[col].min(), "max": data[col].max()}
self._fitted_params = {"stats": stats, "columns": cols}
self._is_fitted = True
return self
def transform(self, data: pl.DataFrame) -> pl.DataFrame:
result = data
for col, stats in self._fitted_params.get("stats", {}).items():
if col in result.columns:
range_val = stats["max"] - stats["min"]
if range_val is not None and range_val > 0:
result = result.with_columns(
((pl.col(col) - stats["min"]) / range_val).alias(col)
)
return result
@PluginRegistry.register_processor("rank_transformer")
class RankTransformer(BaseProcessor):
"""排名转换处理器 - 转换为截面排名"""
stage = PipelineStage.ALL
def fit(self, data: pl.DataFrame) -> "RankTransformer":
self._is_fitted = True
return self
def transform(self, data: pl.DataFrame) -> pl.DataFrame:
result = data
cols = self.columns or _get_numeric_columns(data)
for col in cols:
if col in result.columns:
result = result.with_columns(
pl.col(col).rank().over("trade_date").alias(col)
)
return result
@PluginRegistry.register_processor("neutralizer")
class Neutralizer(BaseProcessor):
"""中性化处理器 - 行业/市值中性化"""
stage = PipelineStage.ALL
def __init__(
self,
columns: Optional[List[str]] = None,
group_col: str = "industry",
exclude_cols: Optional[List[str]] = None,
):
super().__init__(columns)
self.group_col = group_col
self.exclude_cols = exclude_cols or []
def fit(self, data: pl.DataFrame) -> "Neutralizer":
self._is_fitted = True
return self
def transform(self, data: pl.DataFrame) -> pl.DataFrame:
result = data
cols = self.columns or _get_numeric_columns(data)
for col in cols:
if col in result.columns and col not in self.exclude_cols:
result = result.with_columns(
(
pl.col(col)
- pl.col(col).mean().over(["trade_date", self.group_col])
).alias(col)
)
return result
# Explicit public API: names exported via ``from ... import *``.
__all__ = [
    "DropNAProcessor",
    "FillNAProcessor",
    "Winsorizer",
    "StandardScaler",
    "MinMaxScaler",
    "RankTransformer",
    "Neutralizer",
]