refactor(data): 移除 api_daily 模块并更新文档
- 删除 src/data/api_wrappers/api_daily.py (240行)
- 更新 6 个文档文件,将 daily 表引用替换为 pro_bar
- 同步 README.md 中的因子框架和训练模块示例

BREAKING CHANGE: api_daily 模块已移除,请使用 api_pro_bar 替代
This commit is contained in:
176
README.md
176
README.md
@@ -36,9 +36,21 @@ ProStock/
|
||||
│ │
|
||||
│ ├── data/ # 数据获取与存储
|
||||
│ │ ├── api_wrappers/ # Tushare API 封装
|
||||
│ │ │ ├── api_daily.py # 日线数据接口
|
||||
│ │ │ ├── api_stock_basic.py # 股票基础信息
|
||||
│ │ │ └── api_trade_cal.py # 交易日历
|
||||
│ │ │ ├── api_pro_bar.py # Pro Bar行情数据接口(主用)
|
||||
│ │ │ ├── api_stock_basic.py # 股票基础信息接口
|
||||
│ │ │ ├── api_trade_cal.py # 交易日历接口
|
||||
│ │ │ ├── api_bak_basic.py # 历史股票列表接口
|
||||
│ │ │ ├── api_namechange.py # 股票名称变更接口
|
||||
│ │ │ ├── api_stock_st.py # ST股票信息接口
|
||||
│ │ │ ├── api_daily_basic.py # 每日指标接口
|
||||
│ │ │ ├── api_stk_limit.py # 涨跌停价格接口
|
||||
│ │ │ ├── financial_data/ # 财务数据接口
|
||||
│ │ │ │ ├── api_income.py # 利润表接口
|
||||
│ │ │ │ ├── api_balance.py # 资产负债表接口
|
||||
│ │ │ │ ├── api_cashflow.py # 现金流量表接口
|
||||
│ │ │ │ ├── api_fina_indicator.py # 财务指标接口
|
||||
│ │ │ │ └── api_financial_sync.py # 财务数据同步调度中心
|
||||
│ │ │ └── __init__.py
|
||||
│ │ ├── client.py # Tushare 客户端(含限流)
|
||||
│ │ ├── config.py # 数据模块配置
|
||||
│ │ ├── db_manager.py # DuckDB 表管理和同步
|
||||
@@ -140,83 +152,123 @@ uv run python -c "from src.data.db_inspector import get_db_info; get_db_info()"
|
||||
### 因子计算
|
||||
|
||||
```python
|
||||
from src.factors import FactorEngine, DataLoader, DataSpec
|
||||
from src.factors.base import CrossSectionalFactor, TimeSeriesFactor
|
||||
from src.factors import FactorEngine
|
||||
from src.factors.api import close, ts_mean, cs_rank
|
||||
import polars as pl
|
||||
|
||||
# 自定义截面因子:PE排名
|
||||
class PERankFactor(CrossSectionalFactor):
|
||||
name = "pe_rank"
|
||||
data_specs = [DataSpec("daily", ["ts_code", "trade_date", "pe"], lookback_days=1)]
|
||||
|
||||
def compute(self, data) -> pl.Series:
|
||||
cs = data.get_cross_section()
|
||||
return cs["pe"].rank()
|
||||
# 初始化引擎
|
||||
engine = FactorEngine()
|
||||
|
||||
# 自定义时序因子:20日移动平均
|
||||
class MA20Factor(TimeSeriesFactor):
|
||||
name = "ma20"
|
||||
data_specs = [DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=20)]
|
||||
|
||||
def compute(self, data) -> pl.Series:
|
||||
return data.get_column("close").rolling_mean(window_size=20)
|
||||
# 方式1:使用 DSL 表达式注册
|
||||
engine.register("ma20", ts_mean(close, 20))
|
||||
engine.register("price_rank", cs_rank(close))
|
||||
|
||||
# 执行计算
|
||||
loader = DataLoader(data_dir="data")
|
||||
engine = FactorEngine(loader)
|
||||
# 方式2:使用字符串表达式(推荐)
|
||||
engine.add_factor("ma20", "ts_mean(close, 20)")
|
||||
engine.add_factor("alpha", "cs_rank(ts_mean(close, 5) - ts_mean(close, 20))")
|
||||
|
||||
# 计算截面因子
|
||||
pe_rank = PERankFactor()
|
||||
result1 = engine.compute(pe_rank, start_date="20240101", end_date="20240131")
|
||||
# 方式3:从 metadata 查询(需先在 metadata 中定义)
|
||||
engine.add_factor("mom_5d")
|
||||
|
||||
# 计算时序因子
|
||||
ma20 = MA20Factor()
|
||||
result2 = engine.compute(ma20, stock_codes=["000001.SZ"],
|
||||
start_date="20240101", end_date="20240131")
|
||||
# 计算因子
|
||||
result = engine.compute(
|
||||
factor_names=["ma20", "price_rank"],
|
||||
start_date="20240101",
|
||||
end_date="20240131"
|
||||
)
|
||||
|
||||
# 因子组合
|
||||
combined = 0.5 * pe_rank + 0.3 * ma20
|
||||
# 查看执行计划
|
||||
plan = engine.preview_plan("ma20")
|
||||
```
|
||||
|
||||
### 模型训练
|
||||
|
||||
```python
|
||||
from src.models import PluginRegistry, ProcessingPipeline
|
||||
from src.models.core import PipelineStage
|
||||
from src.training import (
|
||||
Trainer,
|
||||
LightGBMModel,
|
||||
DateSplitter,
|
||||
StockPoolManager,
|
||||
NullFiller,
|
||||
Winsorizer,
|
||||
StandardScaler,
|
||||
STFilter,
|
||||
check_data_quality,
|
||||
)
|
||||
from src.factors import FactorEngine
|
||||
import polars as pl
|
||||
|
||||
# 创建处理流水线
|
||||
pipeline = ProcessingPipeline([
|
||||
PluginRegistry.get_processor("dropna")(),
|
||||
PluginRegistry.get_processor("winsorizer")(lower=0.01, upper=0.99),
|
||||
PluginRegistry.get_processor("standard_scaler")(),
|
||||
])
|
||||
# 1. 创建模型
|
||||
model = LightGBMModel(params={
|
||||
"objective": "regression",
|
||||
"metric": "mae",
|
||||
"num_leaves": 20,
|
||||
"learning_rate": 0.01,
|
||||
"n_estimators": 1000,
|
||||
})
|
||||
|
||||
# 准备数据
|
||||
data = pl.read_csv("features.csv") # 包含特征和标签
|
||||
# 2. 准备因子数据
|
||||
engine = FactorEngine()
|
||||
engine.add_factor("ma5", "ts_mean(close, 5)")
|
||||
engine.add_factor("ma20", "ts_mean(close, 20)")
|
||||
|
||||
# 划分训练/测试集
|
||||
from src.models.core import WalkForwardSplit
|
||||
splitter = WalkForwardSplit(train_window=252, test_window=21)
|
||||
# 计算全市场因子
|
||||
data = engine.compute(
|
||||
factor_names=["ma5", "ma20", "future_return_5"],
|
||||
start_date="20200101",
|
||||
end_date="20231231"
|
||||
)
|
||||
|
||||
# 获取 LightGBM 模型
|
||||
ModelClass = PluginRegistry.get_model("lightgbm")
|
||||
model = ModelClass(task_type="regression", params={"n_estimators": 100})
|
||||
# 3. 创建数据处理器
|
||||
processors = [
|
||||
NullFiller(feature_cols=["ma5", "ma20"], strategy="mean"),
|
||||
Winsorizer(feature_cols=["ma5", "ma20"], lower=0.01, upper=0.99),
|
||||
StandardScaler(feature_cols=["ma5", "ma20"]),
|
||||
]
|
||||
|
||||
# 训练循环
|
||||
for train_idx, test_idx in splitter.split(data):
|
||||
train_data = data[train_idx]
|
||||
test_data = data[test_idx]
|
||||
|
||||
# 数据处理
|
||||
X_train = pipeline.fit_transform(train_data.drop("target"))
|
||||
X_test = pipeline.transform(test_data.drop("target"))
|
||||
y_train = train_data["target"]
|
||||
y_test = test_data["target"]
|
||||
|
||||
# 训练模型
|
||||
model.fit(X_train, y_train)
|
||||
predictions = model.predict(X_test)
|
||||
# 4. 创建股票池筛选函数
|
||||
def stock_pool_filter(df: pl.DataFrame) -> pl.Series:
|
||||
"""筛选小市值股票"""
|
||||
code_filter = (
|
||||
~df["ts_code"].str.starts_with("300") & # 排除创业板
|
||||
~df["ts_code"].str.starts_with("688") # 排除科创板
|
||||
)
|
||||
return code_filter
|
||||
|
||||
pool_manager = StockPoolManager(
|
||||
filter_func=stock_pool_filter,
|
||||
required_columns=["total_mv"],
|
||||
)
|
||||
|
||||
# 5. 创建过滤器
|
||||
st_filter = STFilter(data_router=engine.router)
|
||||
|
||||
# 6. 创建数据划分器
|
||||
splitter = DateSplitter(
|
||||
train_start="20200101",
|
||||
train_end="20221231",
|
||||
val_start="20230101",
|
||||
val_end="20230630",
|
||||
test_start="20230701",
|
||||
test_end="20231231",
|
||||
)
|
||||
|
||||
# 7. 创建训练器
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
pool_manager=pool_manager,
|
||||
processors=processors,
|
||||
filters=[st_filter],
|
||||
splitter=splitter,
|
||||
target_col="future_return_5",
|
||||
feature_cols=["ma5", "ma20"],
|
||||
)
|
||||
|
||||
# 8. 执行训练
|
||||
results = trainer.train(data)
|
||||
|
||||
# 9. 获取预测结果
|
||||
predictions = trainer.get_results()
|
||||
```
|
||||
|
||||
## 核心设计
|
||||
|
||||
Reference in New Issue
Block a user