From 593ec99466646db54c629bb9720ffb9d79b3ceb1 Mon Sep 17 00:00:00 2001 From: liaozhaorun <1300336796@qq.com> Date: Mon, 23 Feb 2026 16:23:53 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E5=AD=98=E5=82=A8=E5=B1=82?= =?UTF-8?q?=E8=BF=81=E7=A7=BBDuckDB=20+=20=E6=A8=A1=E5=9D=97=E9=87=8D?= =?UTF-8?q?=E6=9E=84=20-=20=E5=AD=98=E5=82=A8=E5=B1=82=E9=87=8D=E6=9E=84:?= =?UTF-8?q?=20HDF5=20=E2=86=92=20DuckDB=EF=BC=88UPSERT=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E3=80=81=E7=BA=BF=E7=A8=8B=E5=AE=89=E5=85=A8=E5=AD=98=E5=82=A8?= =?UTF-8?q?=EF=BC=89=20-=20Sync=E7=B1=BB=E8=BF=81=E7=A7=BB:=20DataSync?= =?UTF-8?q?=E4=BB=8Esync.py=E8=BF=81=E7=A7=BB=E5=88=B0api=5Fdaily.py?= =?UTF-8?q?=EF=BC=88=E8=81=8C=E8=B4=A3=E5=88=86=E7=A6=BB=EF=BC=89=20-=20?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E6=A8=A1=E5=9D=97=E9=87=8D=E6=9E=84:=20src/m?= =?UTF-8?q?odels=20=E2=86=92=20src/pipeline=EF=BC=88=E6=9B=B4=E6=B8=85?= =?UTF-8?q?=E6=99=B0=E7=9A=84=E5=91=BD=E5=90=8D=EF=BC=89=20-=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E5=9B=A0=E5=AD=90=E6=A8=A1=E5=9D=97:=20factors/moment?= =?UTF-8?q?um=20(MA=E3=80=81=E6=94=B6=E7=9B=8A=E7=8E=87=E6=8E=92=E5=90=8D)?= =?UTF-8?q?=E3=80=81factors/financial=20-=20=E6=96=B0=E5=A2=9EAPI=E6=8E=A5?= =?UTF-8?q?=E5=8F=A3:=20api=5Fnamechange=E3=80=81api=5Fbak=5Fbasic=20-=20?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E8=AE=AD=E7=BB=83=E5=85=A5=E5=8F=A3:=20train?= =?UTF-8?q?ing=E6=A8=A1=E5=9D=97=EF=BC=88main.py=E3=80=81pipeline=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=EF=BC=89=20-=20=E5=B7=A5=E5=85=B7=E5=87=BD=E6=95=B0?= =?UTF-8?q?=E7=BB=9F=E4=B8=80:=20get=5Ftoday=5Fdate=E7=AD=89=E7=A7=BB?= =?UTF-8?q?=E8=87=B3utils.py=20-=20=E6=96=87=E6=A1=A3=E6=9B=B4=E6=96=B0:?= =?UTF-8?q?=20AGENTS.md=E6=B7=BB=E5=8A=A0=E6=9E=B6=E6=9E=84=E5=8F=98?= =?UTF-8?q?=E6=9B=B4=E5=8E=86=E5=8F=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + AGENTS.md | 117 +- src/data/api_wrappers/API_INTERFACE_SPEC.md | 563 ++++++-- src/data/api_wrappers/__init__.py | 19 +- src/data/api_wrappers/api.md | 97 
+- src/data/api_wrappers/api_bak_basic.py | 243 ++++ src/data/api_wrappers/api_daily.py | 759 +++++++++- src/data/api_wrappers/api_namechange.py | 113 ++ src/data/sync.py | 910 +++--------- src/data/utils.py | 75 + src/factors/__init__.py | 31 +- src/factors/financial/__init__.py | 15 + src/factors/momentum/__init__.py | 19 + src/factors/momentum/ma.py | 78 ++ src/factors/momentum/return_rank.py | 100 ++ src/{models => pipeline}/__init__.py | 17 +- src/{models => pipeline}/core/__init__.py | 4 +- src/{models => pipeline}/core/base.py | 0 src/{models => pipeline}/core/splitter.py | 2 +- src/{models => pipeline}/models/__init__.py | 2 +- src/{models => pipeline}/models/models.py | 4 +- src/{models => pipeline}/pipeline.py | 2 +- .../processors/__init__.py | 2 +- .../processors/processors.py | 4 +- src/{models => pipeline}/registry.py | 2 +- src/training/__init__.py | 46 + src/training/main.py | 27 + src/training/output/top_stocks.tsv | 1216 +++++++++++++++++ src/training/pipeline.py | 448 ++++++ tests/{models => pipeline}/test_core.py | 30 +- tests/test_sync.py | 372 ++--- tests/test_sync_real.py | 256 ---- 32 files changed, 4181 insertions(+), 1395 deletions(-) create mode 100644 src/data/api_wrappers/api_bak_basic.py create mode 100644 src/data/api_wrappers/api_namechange.py create mode 100644 src/data/utils.py create mode 100644 src/factors/financial/__init__.py create mode 100644 src/factors/momentum/__init__.py create mode 100644 src/factors/momentum/ma.py create mode 100644 src/factors/momentum/return_rank.py rename src/{models => pipeline}/__init__.py (78%) rename src/{models => pipeline}/core/__init__.py (85%) rename src/{models => pipeline}/core/base.py (100%) rename src/{models => pipeline}/core/splitter.py (99%) rename src/{models => pipeline}/models/__init__.py (74%) rename src/{models => pipeline}/models/models.py (98%) rename src/{models => pipeline}/pipeline.py (97%) rename src/{models => pipeline}/processors/__init__.py (86%) rename src/{models => 
pipeline}/processors/processors.py (98%) rename src/{models => pipeline}/registry.py (99%) create mode 100644 src/training/__init__.py create mode 100644 src/training/main.py create mode 100644 src/training/output/top_stocks.tsv create mode 100644 src/training/pipeline.py rename tests/{models => pipeline}/test_core.py (94%) delete mode 100644 tests/test_sync_real.py diff --git a/.gitignore b/.gitignore index 0436138..50e6e55 100644 --- a/.gitignore +++ b/.gitignore @@ -75,3 +75,6 @@ temp/ # 数据目录(允许跟踪,但忽略内容) data/* + +# AI Agent 工作目录 +/.sisyphus/ diff --git a/AGENTS.md b/AGENTS.md index bca8d6e..bb85ce3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,6 +2,15 @@ A股量化投资框架 - Python 项目,用于量化股票投资分析。 +## 交流语言要求 + +**⚠️ 强制要求:所有沟通和思考过程必须使用中文。** + + 所有与 AI Agent 的交流必须使用中文 + 代码中的注释和文档字符串使用中文 + 禁止使用英文进行思考或沟通 + + ## 构建/检查/测试命令 **⚠️ 重要:本项目强制使用 uv 作为 Python 包管理器和运行工具。禁止直接使用 `python` 或 `pip` 命令。** @@ -67,25 +76,69 @@ uv run pytest tests/test_sync.py # ✅ 正确 ``` ProStock/ ├── src/ # 源代码 -│ ├── data/ # 数据采集模块 +│ ├── config/ # 配置管理 │ │ ├── __init__.py -│ │ ├── client.py # Tushare API 客户端,带速率限制 -│ │ ├── config.py # 配置(pydantic-settings) -│ │ ├── daily.py # 日线市场数据 +│ │ └── settings.py # pydantic-settings 配置 +│ │ +│ ├── data/ # 数据获取与存储 +│ │ ├── api_wrappers/ # Tushare API 封装 +│ │ │ ├── API_INTERFACE_SPEC.md # 接口规范文档 +│ │ │ ├── api.md # API 接口定义 +│ │ │ ├── api_daily.py # 日线数据接口 +│ │ │ ├── api_stock_basic.py # 股票基础信息接口 +│ │ │ ├── api_trade_cal.py # 交易日历接口 +│ │ │ └── __init__.py +│ │ ├── __init__.py +│ │ ├── client.py # Tushare API 客户端(带速率限制) +│ │ ├── config.py # 数据模块配置 +│ │ ├── db_inspector.py # 数据库信息查看工具 +│ │ ├── db_manager.py # DuckDB 表管理和同步 │ │ ├── rate_limiter.py # 令牌桶速率限制器 -│ │ ├── stock_basic.py # 股票基本信息 -│ │ ├── storage.py # HDF5 存储管理器 -│ │ └── sync.py # 数据同步 -│ ├── config/ # 全局配置 +│ │ ├── storage.py # 数据存储核心 +│ │ └── sync.py # 数据同步主逻辑 +│ │ +│ ├── factors/ # 因子计算框架 │ │ ├── __init__.py -│ │ └── settings.py # 应用设置(pydantic-settings) -│ └── __init__.py +│ │ ├── base.py # 因子基类(截面/时序) +│ │ 
├── composite.py # 组合因子和标量运算 +│ │ ├── data_loader.py # 数据加载器 +│ │ ├── data_spec.py # 数据规格定义 +│ │ ├── engine.py # 因子执行引擎 +│ │ ├── momentum/ # 动量因子 +│ │ │ ├── __init__.py +│ │ │ ├── ma.py # 移动平均线 +│ │ │ └── return_rank.py # 收益排名 +│ │ └── financial/ # 财务因子 +│ │ └── __init__.py +│ │ +│ ├── pipeline/ # 模型训练管道 +│ │ ├── __init__.py +│ │ ├── pipeline.py # 处理流水线 +│ │ ├── registry.py # 插件注册中心 +│ │ ├── core/ # 核心抽象 +│ │ │ ├── __init__.py +│ │ │ ├── base.py # 基类定义 +│ │ │ └── splitter.py # 时间序列划分策略 +│ │ ├── models/ # 模型实现 +│ │ │ ├── __init__.py +│ │ │ └── models.py # LightGBM、CatBoost 等 +│ │ └── processors/ # 数据处理器 +│ │ ├── __init__.py +│ │ └── processors.py # 标准化、缩尾、中性化等 +│ │ +│ └── training/ # 训练入口 +│ ├── __init__.py +│ ├── main.py # 训练主程序 +│ ├── pipeline.py # 训练流程配置 +│ └── output/ # 训练输出 +│ └── top_stocks.tsv # 推荐股票结果 +│ ├── tests/ # 测试文件 │ ├── test_sync.py │ └── test_daily.py ├── config/ # 配置文件 │ └── .env.local # 环境变量(不在 git 中) -├── data/ # 数据存储(HDF5 文件) +├── data/ # 数据存储(DuckDB) ├── docs/ # 文档 ├── pyproject.toml # 项目配置 └── README.md @@ -182,10 +235,10 @@ except Exception as e: - 对配置单例使用 `@lru_cache()` ### 数据存储 -- 通过 `pandas.HDFStore` 使用 **HDF5 格式** 进行持久化 +- 使用 **DuckDB** 嵌入式 OLAP 数据库进行持久化 - 存储在 `data/` 目录中(通过 `DATA_PATH` 环境变量配置) -- 对可追加数据集使用 `format="table"` -- 追加时处理重复项:`drop_duplicates(subset=[...])` +- 使用 UPSERT 模式(`INSERT OR REPLACE`)处理重复数据 +- 多线程场景使用 `ThreadSafeStorage.queue_save()` + `flush()` 模式 ### 线程与并发 - 对 I/O 密集型任务(API 调用)使用 `ThreadPoolExecutor` @@ -240,3 +293,39 @@ uv run python -c "from src.data.sync import sync_all; sync_all(force_full=True)" # 自定义线程数 uv run python -c "from src.data.sync import sync_all; sync_all(max_workers=20)" ``` + +## 架构变更历史 + +### v2.0 (2026-02-23) - 重要更新 + +#### 存储层重构 + **变更**: 从 HDF5 迁移到 DuckDB + **原因**: DuckDB 提供更好的查询性能、SQL 下推能力、并发支持 + **影响**: 所有数据表现在使用 DuckDB 存储,旧 HDF5 文件可手动迁移 + +#### Sync 类迁移 + **变更**: `DataSync` 类从 `sync.py` 迁移到 `api_daily.py` + **原因**: 实现代码职责分离,每个 API 文件包含自己的同步逻辑 + **影响**: + - `sync.py` 保留为调度中心 + - `api_daily.py` 
包含 `DailySync` 类和 `sync_daily` 函数 + +#### 新增模块 + **pipeline 模块**: 机器学习流水线组件(处理器、模型、划分策略) + **training 模块**: 训练入口程序 + **factors/momentum**: 动量因子(MA、收益率排名) + **factors/financial**: 财务因子框架 + **data/utils.py**: 日期工具函数集中管理 + +#### 新增 API 接口 + `api_namechange.py`: 股票曾用名接口(手动同步) + `api_bak_basic.py`: 历史股票列表接口 + +#### 工具函数统一 + `get_today_date()`、`get_next_date()`、`DEFAULT_START_DATE` 等函数统一在 `src/data/utils.py` 中管理 + 其他模块应从 `utils.py` 导入这些函数,避免重复定义 + +### v1.x (历史版本) + + 初始版本,使用 HDF5 存储 + 数据同步逻辑集中在 `sync.py` diff --git a/src/data/api_wrappers/API_INTERFACE_SPEC.md b/src/data/api_wrappers/API_INTERFACE_SPEC.md index 95ccf1e..ced32b6 100644 --- a/src/data/api_wrappers/API_INTERFACE_SPEC.md +++ b/src/data/api_wrappers/API_INTERFACE_SPEC.md @@ -1,12 +1,26 @@ # ProStock 数据接口封装规范 - ## 1. 概述 - 本文档定义了新增 Tushare API 接口封装的标准规范。所有非特殊接口必须遵循此规范,确保: - 代码风格统一 - 自动 sync 支持 - 增量更新逻辑一致 - 减少存储写入压力 +- 类型安全(强制类型提示) + +### 1.1 技术栈 + +- **存储层**: DuckDB(高性能嵌入式 OLAP 数据库) +- **数据格式**: Pandas DataFrame / Polars DataFrame +- **速率限制**: 令牌桶算法(TokenBucketRateLimiter) +- **并发**: ThreadPoolExecutor 多线程 +- **类型系统**: Python 3.10+ 类型提示 + +### 1.2 自动化支持 + +项目提供 `prostock-api-interface` Skill 来自动化接口封装流程。在 `api.md` 中定义接口后,调用该 Skill 可自动生成: +- 数据模块文件(`src/data/api_wrappers/api_{data_type}.py`) +- 数据库表管理配置 +- 测试文件(`tests/test_{data_type}.py`) ## 2. 接口分类 @@ -14,37 +28,41 @@ 以下接口有独立的同步逻辑,不参与自动 sync 机制: -| 接口类型 | 示例 | 说明 | -|---------|------|------| -| 交易日历 | `trade_cal` | 全局数据,按日期范围获取 | -| 股票基础信息 | `stock_basic` | 一次性全量获取,CSV 存储 | -| 辅助数据 | 行业分类、概念分类 | 低频更新,独立管理 | +| 接口类型 | 文件名 | 说明 | +|---------|--------|------| +| 交易日历 | `api_trade_cal.py` | 全局数据,按日期范围获取,使用 HDF5 缓存 | +| 股票基础信息 | `api_stock_basic.py` | 一次性全量获取,CSV 存储 | +| 辅助数据 | `api_industry`, `api_concept` | 低频更新,独立管理 | ### 2.2 标准接口(必须遵循本规范) -所有按股票或按日期获取的因子数据、行情数据、财务数据等,必须遵循本规范。 +所有按股票或按日期获取的因子数据、行情数据、财务数据等,必须遵循本规范: + +- 按日期获取:**优先选择**,支持全市场批量获取 +- 按股票获取:仅当 API 不支持按日期获取时使用 ## 3. 
文件结构要求 ### 3.1 文件命名 ``` -{data_type}.py +api_{data_type}.py ``` -示例:`daily.py`、`moneyflow.py`、`limit_list.py` +- 示例:`api_daily.py`、`api_moneyflow.py`、`api_limit_list.py` +- **必须**以 `api_` 前缀开头 +- 使用小写字母和下划线 ### 3.2 文件位置 -所有接口文件必须位于 `src/data/` 目录下。 +所有接口文件必须位于 `src/data/api_wrappers/` 目录下。 ### 3.3 导出要求 -新接口必须在 `src/data/__init__.py` 中导出: +新接口必须在 `src/data/api_wrappers/__init__.py` 中导出: ```python -from src.data.{module_name} import get_{data_type} - +from src.data.api_wrappers.api_{data_type} import get_{data_type} __all__ = [ # ... 其他导出 ... "get_{data_type}", @@ -59,7 +77,7 @@ __all__ = [ #### 4.1.1 按日期获取的接口(优先) -适用于:涨跌停、龙虎榜、筹码分布等。 +适用于:涨跌停、龙虎榜、筹码分布、每日指标等。 **函数签名要求**: @@ -77,6 +95,7 @@ def get_{data_type}( - 优先使用 `trade_date` 获取单日全市场数据 - 支持 `start_date + end_date` 获取区间数据 - `ts_code` 作为可选过滤参数 +- **性能优势**: 单日全市场数据一次 API 调用即可完成 #### 4.1.2 按股票获取的接口 @@ -93,152 +112,504 @@ def get_{data_type}( ) -> pd.DataFrame: ``` +**要求**: +- `ts_code` 为必选参数 +- 需要遍历所有股票获取全市场数据 + ### 4.2 文档字符串要求 -函数必须包含 Google 风格的完整文档字符串,包含: -- 函数功能描述 -- `Args` 部分:所有参数说明 -- `Returns` 部分:返回的 DataFrame 包含的字段说明 -- `Example` 部分:使用示例 +函数必须包含 **Google 风格**的完整文档字符串,包含: + +```python +def get_{data_type}(...) -> pd.DataFrame: + """Fetch {数据描述} from Tushare. + + This interface retrieves {详细描述}. + + Args: + ts_code: Stock code (e.g., '000001.SZ', '600000.SH') + trade_date: Specific trade date (YYYYMMDD format) + start_date: Start date (YYYYMMDD format) + end_date: End date (YYYYMMDD format) + # 其他参数... 
+ + Returns: + pd.DataFrame with columns: + - ts_code: Stock code + - trade_date: Trade date (YYYYMMDD) + - {其他字段}: {字段描述} + + Example: + >>> # Get single date data for all stocks + >>> data = get_{data_type}(trade_date='20240101') + >>> + >>> # Get date range data + >>> data = get_{data_type}(start_date='20240101', end_date='20240131') + >>> + >>> # Get specific stock data + >>> data = get_{data_type}(ts_code='000001.SZ', trade_date='20240101') + """ +``` ### 4.3 日期格式要求 -- 所有日期参数和返回值使用 `YYYYMMDD` 字符串格式 +- 所有日期参数使用 **YYYYMMDD** 字符串格式 - 统一使用 `trade_date` 作为日期字段名 -- 如果 API 返回其他日期字段名(如 `date`、`end_date`),必须在返回前重命名为 `trade_date` +- 如果 API 返回其他日期字段名(如 `date`、`end_date`),必须在返回前重命名为 `trade_date`: + +```python +if "date" in data.columns: + data = data.rename(columns={"date": "trade_date"}) +``` ### 4.4 股票代码要求 - - 统一使用 `ts_code` 作为股票代码字段名 - 格式:`{code}.{exchange}`,如 `000001.SZ`、`600000.SH` +- 确保返回的 DataFrame 包含 `ts_code` 列 ### 4.5 令牌桶限速要求 -所有 API 调用必须通过 `TushareClient`,自动满足令牌桶限速要求。 - -## 5. Sync 集成规范 - -### 5.1 DATASET_CONFIG 注册要求 - -新接口必须在 `DataSync.DATASET_CONFIG` 中注册,配置项: +所有 API 调用必须通过 `TushareClient`,自动满足令牌桶限速要求: ```python -"{new_data_type}": { - "api_name": "{tushare_api_name}", # Tushare API 名称 - "fetch_by": "date", # "date" 或 "stock" - "date_field": "trade_date", - "key_fields": ["ts_code", "trade_date"], # 用于去重的主键 +from src.data.client import TushareClient + +def get_{data_type}(...) -> pd.DataFrame: + client = TushareClient() + + # Build parameters + params = {} + if trade_date: + params["trade_date"] = trade_date + if ts_code: + params["ts_code"] = ts_code + # ... + + # Fetch data (rate limiting handled automatically) + data = client.query("{api_name}", **params) + + return data +``` + +**注意**: `TushareClient` 自动处理: +- 令牌桶速率限制 +- API 重试逻辑(指数退避) +- 配置加载 + +## 5. 
DuckDB 存储规范 + +### 5.1 存储架构 + +项目使用 **DuckDB** 作为持久化存储: + +- **单例模式**: `Storage` 类确保单一数据库连接 +- **线程安全**: `ThreadSafeStorage` 提供并发写入支持 +- **UPSERT 支持**: `INSERT OR REPLACE` 自动处理重复数据 +- **查询下推**: WHERE 条件在数据库层过滤 + +### 5.2 表结构设计 + +每个数据类型对应一个 DuckDB 表: + +```sql +CREATE TABLE {data_type} ( + ts_code VARCHAR(16) NOT NULL, + trade_date DATE NOT NULL, + # 其他字段... + PRIMARY KEY (ts_code, trade_date) +); + +CREATE INDEX idx_{data_type}_date_code ON {data_type}(trade_date, ts_code); +``` + +**主键要求**: +- 必须包含 `ts_code` 和 `trade_date` +- 使用 UPSERT 确保幂等性 + +### 5.3 存储写入策略 + +**批量写入模式**(推荐用于多线程场景): + +```python +from src.data.storage import ThreadSafeStorage + +def sync_{data_type}(self, ...): + storage = ThreadSafeStorage() + + # 收集数据到队列(不立即写入) + for data_chunk in data_generator: + storage.queue_save("{data_type}", data_chunk) + + # 批量写入所有数据 + storage.flush() +``` + +**直接写入模式**(适用于简单场景): + +```python +from src.data.storage import Storage + +storage = Storage() +storage.save("{data_type}", data, mode="append") +``` + +### 5.4 数据类型映射 + +标准字段类型映射(`DEFAULT_TYPE_MAPPING`): + +```python +DEFAULT_TYPE_MAPPING = { + "ts_code": "VARCHAR(16)", + "trade_date": "DATE", + "open": "DOUBLE", + "high": "DOUBLE", + "low": "DOUBLE", + "close": "DOUBLE", + "vol": "DOUBLE", + "amount": "DOUBLE", + # ... 其他字段 } ``` -### 5.2 fetch_by 取值规则 +## 6. Sync 集成规范 -- **优先使用 `"date"`**:如果 API 支持按日期获取全市场数据 -- 仅当 API 不支持按日期获取时才使用 `"stock"` +### 6.1 使用 db_manager 进行同步 -### 5.3 sync 方法要求 - -必须实现对应的 sync 方法或复用通用方法: +项目使用 `db_manager` 模块提供高级同步功能: ```python -def sync_{data_type}(self, force_full: bool = False) -> pd.DataFrame: - """Sync {数据描述}。""" - return self.sync_dataset("{data_type}", force_full) +from src.data.db_manager import SyncManager, ensure_table + +def sync_{data_type}(force_full: bool = False) -> pd.DataFrame: + """Sync {数据描述} to DuckDB.""" + + manager = SyncManager() + + # 确保表存在 + ensure_table("{data_type}", schema={ + "ts_code": "VARCHAR(16)", + "trade_date": "DATE", + # ... 
其他字段 + }) + + # 执行同步 + result = manager.sync( + table_name="{data_type}", + fetch_func=get_{data_type}, + start_date=start_date, + end_date=end_date, + force_full=force_full, + ) + + return result ``` -同时提供便捷函数: +### 6.2 增量更新逻辑 + +`SyncManager` 自动处理增量更新: + +1. **检查本地最新日期**: 从 DuckDB 获取 `MAX(trade_date)` +2. **获取交易日历**: 从 `api_trade_cal` 获取交易日范围 +3. **计算需要同步的日期**: 本地最新日期 + 1 到最新交易日 +4. **批量获取数据**: 按日期或按股票获取 +5. **批量写入**: 使用 `ThreadSafeStorage` 队列写入 + +### 6.3 便捷函数 + +每个接口必须提供顶层便捷函数: ```python def sync_{data_type}(force_full: bool = False) -> pd.DataFrame: - """Sync {数据描述}。""" - sync_manager = DataSync() - return sync_manager.sync_{data_type}(force_full) + """Sync {数据描述} to local storage. + + Args: + force_full: If True, force full reload from 20180101 + + Returns: + DataFrame with synced data + """ + # Implementation... ``` -### 5.4 增量更新要求 +## 7. 代码模板 -- 必须实现增量更新逻辑(自动检查本地最新日期) -- 使用 `force_full` 参数支持强制全量同步 +### 7.1 按日期获取接口模板 -## 6. 存储规范 +```python +"""{数据描述} interface. -### 6.1 存储方式 +Fetch {数据描述} data from Tushare. +""" -所有数据通过 `Storage` 类进行 HDF5 存储。 +import pandas as pd +from typing import Optional +from src.data.client import TushareClient -### 6.2 写入策略 -**要求**:所有数据在请求完成后**一次性写入**,而非逐条写入。 +def get_{data_type}( + trade_date: Optional[str] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + ts_code: Optional[str] = None, +) -> pd.DataFrame: + """Fetch {数据描述} from Tushare. -### 6.3 去重要求 + This interface retrieves {详细描述}. -使用 `key_fields` 配置的字段进行去重,默认使用 `["ts_code", "trade_date"]`。 + Args: + trade_date: Specific trade date (YYYYMMDD format) + start_date: Start date (YYYYMMDD format) + end_date: End date (YYYYMMDD format) + ts_code: Stock code filter (optional) -## 7. 
测试规范 + Returns: + pd.DataFrame with columns: + - ts_code: Stock code + - trade_date: Trade date (YYYYMMDD) + - {字段1}: {描述} + - {字段2}: {描述} -### 7.1 测试文件要求 + Example: + >>> # Get all stocks for a single date + >>> data = get_{data_type}(trade_date='20240101') + >>> + >>> # Get date range data + >>> data = get_{data_type}(start_date='20240101', end_date='20240131') + """ + client = TushareClient() + + # Build parameters + params = {} + if trade_date: + params["trade_date"] = trade_date + if start_date: + params["start_date"] = start_date + if end_date: + params["end_date"] = end_date + if ts_code: + params["ts_code"] = ts_code + + # Fetch data + data = client.query("{tushare_api_name}", **params) + + # Rename date column if needed + if "date" in data.columns: + data = data.rename(columns={"date": "trade_date"}) + + return data +``` + +### 7.2 按股票获取接口模板 + +```python +"""{数据描述} interface. + +Fetch {数据描述} data from Tushare (per stock). +""" + +import pandas as pd +from typing import Optional +from src.data.client import TushareClient + + +def get_{data_type}( + ts_code: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None, +) -> pd.DataFrame: + """Fetch {数据描述} for a specific stock. + + Args: + ts_code: Stock code (e.g., '000001.SZ') + start_date: Start date (YYYYMMDD format) + end_date: End date (YYYYMMDD format) + + Returns: + pd.DataFrame with {数据描述} data + """ + client = TushareClient() + + params = {"ts_code": ts_code} + if start_date: + params["start_date"] = start_date + if end_date: + params["end_date"] = end_date + + data = client.query("{tushare_api_name}", **params) + + return data +``` + +### 7.3 Sync 函数模板 + +```python +from src.data.db_manager import SyncManager, ensure_table +from src.data.api_wrappers import get_{data_type} + + +def sync_{data_type}(force_full: bool = False) -> pd.DataFrame: + """Sync {数据描述} to local DuckDB storage. 
+ + Args: + force_full: If True, force full reload from 20180101 + + Returns: + DataFrame with synced data + """ + manager = SyncManager() + + # Ensure table exists with proper schema + ensure_table("{data_type}", schema={ + "ts_code": "VARCHAR(16)", + "trade_date": "DATE", + # Add other fields... + }) + + # Perform sync + result = manager.sync( + table_name="{data_type}", + fetch_func=get_{data_type}, + force_full=force_full, + ) + + return result +``` + +## 8. 测试规范 + +### 8.1 测试文件要求 必须创建对应的测试文件:`tests/test_{data_type}.py` -### 7.2 测试覆盖要求 +### 8.2 测试覆盖要求 -- 测试按日期获取 -- 测试按股票获取(如果支持) -- 必须 mock `TushareClient` -- 测试覆盖正常和异常情况 +```python +import pytest +import pandas as pd +from unittest.mock import patch, MagicMock +from src.data.api_wrappers.api_{data_type} import get_{data_type} -## 8. 新增接口完整流程 -### 8.1 创建接口文件 +class Test{DataType}: + """Test suite for {data_type} API wrapper.""" + + @patch("src.data.api_wrappers.api_{data_type}.TushareClient") + def test_get_by_date(self, mock_client_class): + """Test fetching data by date.""" + # Setup mock + mock_client = MagicMock() + mock_client_class.return_value = mock_client + mock_client.query.return_value = pd.DataFrame({ + "ts_code": ["000001.SZ"], + "trade_date": ["20240101"], + # ... other columns + }) + + # Test + result = get_{data_type}(trade_date="20240101") + + # Assert + assert not result.empty + assert "ts_code" in result.columns + assert "trade_date" in result.columns + mock_client.query.assert_called_once() + + @patch("src.data.api_wrappers.api_{data_type}.TushareClient") + def test_get_by_stock(self, mock_client_class): + """Test fetching data by stock code.""" + # Similar setup... 
+ pass + + @patch("src.data.api_wrappers.api_{data_type}.TushareClient") + def test_empty_response(self, mock_client_class): + """Test handling empty response.""" + mock_client = MagicMock() + mock_client_class.return_value = mock_client + mock_client.query.return_value = pd.DataFrame() + + result = get_{data_type}(trade_date="20240101") + assert result.empty +``` -1. 在 `src/data/` 下创建 `{data_type}.py` -2. 实现数据获取函数,遵循第 4 节规范 +### 8.3 Mock 规范 -### 8.2 注册 sync 支持 +- 在导入位置打补丁:`patch('src.data.api_wrappers.api_{data_type}.TushareClient')` +- 测试正常和异常情况 +- 验证参数传递正确 -1. 在 `sync.py` 的 `DataSync.DATASET_CONFIG` 中注册 -2. 实现对应的 sync 方法 -3. 提供便捷函数 +## 9. 使用 Skill 自动生成 -### 8.3 更新导出 +### 9.1 准备工作 -在 `src/data/__init__.py` 中导出接口函数。 +1. 在 `api.md` 中定义接口信息,包含: + - 接口名称和描述 + - 输入参数(名称、类型、必选、描述) + - 输出参数(名称、类型、描述) -### 8.4 创建测试 +### 9.2 调用 Skill -创建 `tests/test_{data_type}.py`,覆盖关键场景。 +告知 Claude 要封装的接口名称: -## 9. 检查清单 +> "帮我封装 {data_type} 接口" -### 9.1 文件结构 -- [ ] 文件位于 `src/data/{data_type}.py` -- [ ] 已更新 `src/data/__init__.py` 导出公共接口 +> "为 {data_type} 接口生成代码" + +### 9.3 自动生成内容 + +Skill 会自动: +1. 解析 `api.md` 中的接口定义 +2. 生成 `src/data/api_wrappers/api_{data_type}.py` +3. 更新 `src/data/api_wrappers/__init__.py` 导出 +4. 生成 `tests/test_{data_type}.py` 测试文件 +5. 提供 `sync_{data_type}()` 函数模板 + +## 10. 
检查清单 + +### 10.1 文件结构 +- [ ] 文件位于 `src/data/api_wrappers/api_{data_type}.py` +- [ ] 已更新 `src/data/api_wrappers/__init__.py` 导出公共接口 - [ ] 已创建 `tests/test_{data_type}.py` 测试文件 - -### 9.2 接口实现 +### 10.2 接口实现 - [ ] 数据获取函数使用 `TushareClient` - [ ] 函数包含完整的 Google 风格文档字符串 - [ ] 日期参数使用 `YYYYMMDD` 格式 - [ ] 返回的 DataFrame 包含 `ts_code` 和 `trade_date` 字段 - [ ] 优先实现按日期获取的接口(如果 API 支持) +- [ ] 参数传递前检查是否为 None -### 9.3 Sync 集成 -- [ ] 已在 `DataSync.DATASET_CONFIG` 中注册 -- [ ] 正确设置 `fetch_by`("date" 或 "stock") -- [ ] 正确设置 `date_field` 和 `key_fields` -- [ ] 已实现对应的 sync 方法或复用通用方法 -- [ ] 增量更新逻辑正确(检查本地最新日期) +### 10.3 存储集成 +- [ ] 使用 `Storage` 或 `ThreadSafeStorage` 进行数据存储 +- [ ] 表结构包含 `ts_code` 和 `trade_date` 作为主键 +- [ ] 使用 UPSERT 模式(`INSERT OR REPLACE`) +- [ ] 多线程场景使用 `queue_save()` + `flush()` 模式 -### 9.4 存储优化 -- [ ] 所有数据一次性写入(非逐条) -- [ ] 使用 `storage.save(mode="append")` 进行增量保存 -- [ ] 去重字段配置正确 +### 10.4 Sync 集成 +- [ ] 使用 `db_manager` 模块进行同步管理 +- [ ] 实现 `sync_{data_type}()` 便捷函数 +- [ ] 支持 `force_full` 参数 +- [ ] 增量更新逻辑正确 -### 9.5 测试 +### 10.5 测试 - [ ] 已编写单元测试 -- [ ] 已 mock TushareClient +- [ ] 已 mock `TushareClient` +- [ ] 测试覆盖按日期和按股票获取 - [ ] 测试覆盖正常和异常情况 +## 11. 
示例参考 + +### 11.1 完整示例:api_daily.py + +参见 `src/data/api_wrappers/api_daily.py` - 按股票获取日线数据的完整实现。 + +### 11.2 完整示例:api_trade_cal.py + +参见 `src/data/api_wrappers/api_trade_cal.py` - 特殊接口(交易日历)的实现,包含 HDF5 缓存逻辑。 + +### 11.3 完整示例:api_stock_basic.py + +参见 `src/data/api_wrappers/api_stock_basic.py` - 特殊接口(股票基础信息)的实现,包含 CSV 存储逻辑。 --- -**最后更新**: 2026-02-01 +**最后更新**: 2026-02-23 + +**版本**: v2.0 - 更新 DuckDB 存储规范,添加 Skill 自动化说明 \ No newline at end of file diff --git a/src/data/api_wrappers/__init__.py b/src/data/api_wrappers/__init__.py index cd64427..13f3b89 100644 --- a/src/data/api_wrappers/__init__.py +++ b/src/data/api_wrappers/__init__.py @@ -7,15 +7,21 @@ Available APIs: - api_daily: Daily market data (日线行情) - api_stock_basic: Stock basic information (股票基本信息) - api_trade_cal: Trading calendar (交易日历) + - api_namechange: Stock name change history (股票曾用名) + - api_bak_basic: Stock historical list (股票历史列表) Example: - >>> from src.data.api_wrappers import get_daily, get_stock_basic, get_trade_cal + >>> from src.data.api_wrappers import get_daily, get_stock_basic, get_trade_cal, get_bak_basic + >>> from src.data.api_wrappers import get_bak_basic, sync_bak_basic >>> data = get_daily('000001.SZ', start_date='20240101', end_date='20240131') >>> stocks = get_stock_basic() >>> calendar = get_trade_cal('20240101', '20240131') + >>> bak_basic = get_bak_basic(trade_date='20240101') """ -from src.data.api_wrappers.api_daily import get_daily +from src.data.api_wrappers.api_daily import get_daily, sync_daily, preview_daily_sync, DailySync +from src.data.api_wrappers.api_bak_basic import get_bak_basic, sync_bak_basic +from src.data.api_wrappers.api_namechange import get_namechange, sync_namechange from src.data.api_wrappers.api_stock_basic import get_stock_basic, sync_all_stocks from src.data.api_wrappers.api_trade_cal import ( get_trade_cal, @@ -28,6 +34,15 @@ from src.data.api_wrappers.api_trade_cal import ( __all__ = [ # Daily market data "get_daily", + "sync_daily", + 
"preview_daily_sync", + "DailySync", + # Historical stock list + "get_bak_basic", + "sync_bak_basic", + # Namechange + "get_namechange", + "sync_namechange", # Stock basic information "get_stock_basic", "sync_all_stocks", diff --git a/src/data/api_wrappers/api.md b/src/data/api_wrappers/api.md index 4c3835b..8e83438 100644 --- a/src/data/api_wrappers/api.md +++ b/src/data/api_wrappers/api.md @@ -250,4 +250,99 @@ df = pro.query('daily_basic', ts_code='', trade_date='20180726',fields='ts_code, 16 300718.SZ 20180726 17.6612 0.92 32.0239 3.8661 17 000708.SZ 20180726 0.5575 0.70 10.3674 1.0276 18 002626.SZ 20180726 0.6187 0.83 22.7580 4.2446 -19 600816.SH 20180726 0.6745 0.65 11.0778 3.2214 \ No newline at end of file +19 600816.SH 20180726 0.6745 0.65 11.0778 3.2214 + + +股票曾用名 +接口:namechange +描述:历史名称变更记录 + +输入参数 + +名称 类型 必选 描述 +ts_code str N TS代码 +start_date str N 公告开始日期 +end_date str N 公告结束日期 +输出参数 + +名称 类型 默认输出 描述 +ts_code str Y TS代码 +name str Y 证券名称 +start_date str Y 开始日期 +end_date str Y 结束日期 +ann_date str Y 公告日期 +change_reason str Y 变更原因 +接口示例 + + +pro = ts.pro_api() + +df = pro.namechange(ts_code='600848.SH', fields='ts_code,name,start_date,end_date,change_reason') +数据样例 + + ts_code name start_date end_date change_reason +0 600848.SH 上海临港 20151118 None 改名 +1 600848.SH 自仪股份 20070514 20151117 撤销ST +2 600848.SH ST自仪 20061026 20070513 完成股改 +3 600848.SH SST自仪 20061009 20061025 未股改加S +4 600848.SH ST自仪 20010508 20061008 ST +5 600848.SH 自仪股份 19940324 20010507 其他 + + +股票历史列表(历史每天股票列表) +接口:bak_basic +描述:获取备用基础列表,数据从2016年开始 +限量:单次最大7000条,可以根据日期参数循环获取历史,正式权限需要5000积分。 + +输入参数 + +名称 类型 必选 描述 +trade_date str N 交易日期 +ts_code str N 股票代码 +输出参数 + +名称 类型 默认显示 描述 +trade_date str Y 交易日期 +ts_code str Y TS股票代码 +name str Y 股票名称 +industry str Y 行业 +area str Y 地域 +pe float Y 市盈率(动) +float_share float Y 流通股本(亿) +total_share float Y 总股本(亿) +total_assets float Y 总资产(亿) +liquid_assets float Y 流动资产(亿) +fixed_assets float Y 固定资产(亿) +reserved float Y 公积金 +reserved_pershare float Y 每股公积金 +eps float 
Y 每股收益 +bvps float Y 每股净资产 +pb float Y 市净率 +list_date str Y 上市日期 +undp float Y 未分配利润 +per_undp float Y 每股未分配利润 +rev_yoy float Y 收入同比(%) +profit_yoy float Y 利润同比(%) +gpr float Y 毛利率(%) +npr float Y 净利润率(%) +holder_num int Y 股东人数 +接口示例 + + +pro = ts.pro_api() + +df = pro.bak_basic(trade_date='20211012', fields='trade_date,ts_code,name,industry,pe') +数据样例 + + trade_date ts_code name industry pe +0 20211012 300605.SZ 恒锋信息 软件服务 56.4400 +1 20211012 301017.SZ 漱玉平民 医药商业 58.7600 +2 20211012 300755.SZ 华致酒行 其他商业 23.0000 +3 20211012 300255.SZ 常山药业 生物制药 24.9900 +4 20211012 688378.SH 奥来德 专用机械 24.9600 +... ... ... ... ... ... +4529 20211012 688257.SH 新锐股份 机械基件 0.0000 +4530 20211012 688255.SH 凯尔达 机械基件 0.0000 +4531 20211012 688211.SH 中科微至 专用机械 0.0000 +4532 20211012 605567.SH 春雪食品 食品 0.0000 +4533 20211012 605566.SH 福莱蒽特 染料涂料 0.0000 \ No newline at end of file diff --git a/src/data/api_wrappers/api_bak_basic.py b/src/data/api_wrappers/api_bak_basic.py new file mode 100644 index 0000000..9185bb3 --- /dev/null +++ b/src/data/api_wrappers/api_bak_basic.py @@ -0,0 +1,243 @@ +"""Stock historical list interface. + +Fetch daily stock list from Tushare bak_basic API. +Data available from 2016 onwards. +""" + +import pandas as pd +from typing import Optional, List +from datetime import datetime, timedelta +from tqdm import tqdm +from src.data.client import TushareClient +from src.data.storage import ThreadSafeStorage, Storage +from src.data.db_manager import ensure_table + + +def get_bak_basic( + trade_date: Optional[str] = None, + ts_code: Optional[str] = None, +) -> pd.DataFrame: + """Fetch historical stock list from Tushare. + + This interface retrieves the daily stock list including basic information + for all stocks on a specific trade date. Data is available from 2016 onwards. 
+ + Args: + trade_date: Specific trade date in YYYYMMDD format + ts_code: Stock code filter (optional, e.g., '000001.SZ') + + Returns: + pd.DataFrame with columns: + - trade_date: Trade date (YYYYMMDD) + - ts_code: TS stock code + - name: Stock name + - industry: Industry + - area: Region + - pe: P/E ratio (dynamic) + - float_share: Float shares (100 million) + - total_share: Total shares (100 million) + - total_assets: Total assets (100 million) + - liquid_assets: Liquid assets (100 million) + - fixed_assets: Fixed assets (100 million) + - reserved: Reserve fund + - reserved_pershare: Reserve per share + - eps: Earnings per share + - bvps: Book value per share + - pb: P/B ratio + - list_date: Listing date + - undp: Undistributed profit + - per_undp: Undistributed profit per share + - rev_yoy: Revenue YoY (%) + - profit_yoy: Profit YoY (%) + - gpr: Gross profit ratio (%) + - npr: Net profit ratio (%) + - holder_num: Number of shareholders + + Example: + >>> # Get all stocks for a single date + >>> data = get_bak_basic(trade_date='20240101') + >>> + >>> # Get specific stock data + >>> data = get_bak_basic(ts_code='000001.SZ', trade_date='20240101') + """ + client = TushareClient() + + # Build parameters + params = {} + if trade_date: + params["trade_date"] = trade_date + if ts_code: + params["ts_code"] = ts_code + + # Fetch data + data = client.query("bak_basic", **params) + + return data + + +def sync_bak_basic( + start_date: Optional[str] = None, + end_date: Optional[str] = None, + force_full: bool = False, +) -> pd.DataFrame: + """Sync historical stock list to DuckDB with intelligent incremental sync. 
+ + Logic: + - If table doesn't exist: create table + composite index (trade_date, ts_code) + full sync + - If table exists: incremental sync from last_date + 1 + + Args: + start_date: Start date for sync (YYYYMMDD format, default: 20160101 for full, last_date+1 for incremental) + end_date: End date for sync (YYYYMMDD format, default: today) + force_full: If True, force full reload from 20160101 + + Returns: + pd.DataFrame with synced data + """ + from src.data.db_manager import ensure_table + + TABLE_NAME = "bak_basic" + storage = Storage() + thread_storage = ThreadSafeStorage() + + # Default end date + if end_date is None: + end_date = datetime.now().strftime("%Y%m%d") + + # Check if table exists + table_exists = storage.exists(TABLE_NAME) + + if not table_exists or force_full: + # ===== FULL SYNC ===== + # 1. Create table with schema + # 2. Create composite index (trade_date, ts_code) + # 3. Full sync from start_date + + if not table_exists: + print(f"[sync_bak_basic] Table '{TABLE_NAME}' doesn't exist, creating...") + + # Fetch sample to get schema + sample = get_bak_basic(trade_date=end_date) + if sample.empty: + sample = get_bak_basic(trade_date="20240102") + + if sample.empty: + print("[sync_bak_basic] Cannot create table: no sample data available") + return pd.DataFrame() + + # Create table with schema + columns = [] + for col in sample.columns: + dtype = str(sample[col].dtype) + if "int" in dtype: + col_type = "INTEGER" + elif "float" in dtype: + col_type = "DOUBLE" + else: + col_type = "VARCHAR" + columns.append(f'"{col}" {col_type}') + + columns_sql = ", ".join(columns) + create_sql = f'CREATE TABLE IF NOT EXISTS "{TABLE_NAME}" ({columns_sql}, PRIMARY KEY ("trade_date", "ts_code"))' + + try: + storage._connection.execute(create_sql) + print(f"[sync_bak_basic] Created table '{TABLE_NAME}'") + except Exception as e: + print(f"[sync_bak_basic] Error creating table: {e}") + + # Create composite index + try: + storage._connection.execute(f""" + CREATE INDEX 
IF NOT EXISTS "idx_bak_basic_date_code" + ON "{TABLE_NAME}"("trade_date", "ts_code") + """) + print(f"[sync_bak_basic] Created composite index on (trade_date, ts_code)") + except Exception as e: + print(f"[sync_bak_basic] Error creating index: {e}") + + # Determine sync dates + sync_start = start_date or "20160101" + mode = "FULL" + print(f"[sync_bak_basic] Mode: {mode} SYNC from {sync_start} to {end_date}") + + else: + # ===== INCREMENTAL SYNC ===== + # Check last date in table, sync from last_date + 1 + + try: + result = storage._connection.execute( + f'SELECT MAX("trade_date") FROM "{TABLE_NAME}"' + ).fetchone() + last_date = result[0] if result and result[0] else None + except Exception as e: + print(f"[sync_bak_basic] Error getting last date: {e}") + last_date = None + + if last_date is None: + # Table exists but empty, do full sync + sync_start = start_date or "20160101" + mode = "FULL (empty table)" + else: + # Incremental from last_date + 1 + # Handle both YYYYMMDD and YYYY-MM-DD formats + last_date_str = str(last_date).replace("-", "") + last_dt = datetime.strptime(last_date_str, "%Y%m%d") + next_dt = last_dt + timedelta(days=1) + sync_start = next_dt.strftime("%Y%m%d") + mode = "INCREMENTAL" + + # Skip if already up to date + if sync_start > end_date: + print(f"[sync_bak_basic] Data is up-to-date (last: {last_date}), skipping sync") + return pd.DataFrame() + + print(f"[sync_bak_basic] Mode: {mode} from {sync_start} to {end_date} (last: {last_date})") + + # ===== FETCH AND SAVE DATA ===== + all_data: List[pd.DataFrame] = [] + current = datetime.strptime(sync_start, "%Y%m%d") + end_dt = datetime.strptime(end_date, "%Y%m%d") + + # Calculate total days for progress bar + total_days = (end_dt - current).days + 1 + print(f"[sync_bak_basic] Fetching data for {total_days} days...") + + with tqdm(total=total_days, desc="Syncing dates") as pbar: + while current <= end_dt: + date_str = current.strftime("%Y%m%d") + try: + data = get_bak_basic(trade_date=date_str) + 
if not data.empty: + all_data.append(data) + pbar.set_postfix({"date": date_str, "records": len(data)}) + except Exception as e: + print(f" {date_str}: ERROR - {e}") + + current += timedelta(days=1) + pbar.update(1) + + if not all_data: + print("[sync_bak_basic] No data fetched") + return pd.DataFrame() + + # Combine and save + combined = pd.concat(all_data, ignore_index=True) + print(f"[sync_bak_basic] Total records: {len(combined)}") + + # Delete existing data for the date range and append new data + storage._connection.execute(f'DELETE FROM "{TABLE_NAME}" WHERE "trade_date" >= ?', [sync_start]) + thread_storage.queue_save(TABLE_NAME, combined) + thread_storage.flush() + + print(f"[sync_bak_basic] Saved {len(combined)} records to DuckDB") + return combined + + +if __name__ == "__main__": + # Test sync + result = sync_bak_basic(end_date="20240102") + print(f"Synced {len(result)} records") + if not result.empty: + print("\nSample data:") + print(result.head()) diff --git a/src/data/api_wrappers/api_daily.py b/src/data/api_wrappers/api_daily.py index 3be4b3f..456ab9d 100644 --- a/src/data/api_wrappers/api_daily.py +++ b/src/data/api_wrappers/api_daily.py @@ -2,11 +2,27 @@ A single function to fetch A股日线行情 data from Tushare. Supports all output fields including tor (换手率) and vr (量比). + +This module provides both single-stock fetching (get_daily) and +batch synchronization (DailySync class) for daily market data. 
""" import pandas as pd -from typing import Optional, List, Literal +from typing import Optional, List, Literal, Dict +from datetime import datetime, timedelta +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor, as_completed +import threading + from src.data.client import TushareClient +from src.data.storage import ThreadSafeStorage, Storage +from src.data.utils import get_today_date, get_next_date, DEFAULT_START_DATE +from src.data.api_wrappers.api_trade_cal import ( + get_first_trading_day, + get_last_trading_day, + sync_trade_cal_cache, +) +from src.data.api_wrappers.api_stock_basic import _get_csv_path, sync_all_stocks def get_daily( @@ -71,3 +87,744 @@ def get_daily( data = client.query("pro_bar", **params) return data + + +# ============================================================================= +# DailySync - 日线数据批量同步类 +# ============================================================================= + + +class DailySync: + """日线数据批量同步管理器,支持全量/增量同步。 + + 功能特性: + - 多线程并发获取(ThreadPoolExecutor) + - 增量同步(自动检测上次同步位置) + - 内存缓存(避免重复磁盘读取) + - 异常立即停止(确保数据一致性) + - 预览模式(预览同步数据量,不实际写入) + """ + + # 默认工作线程数 + DEFAULT_MAX_WORKERS = 10 + + def __init__(self, max_workers: Optional[int] = None): + """初始化同步管理器。 + + Args: + max_workers: 工作线程数(默认: 10) + """ + self.storage = ThreadSafeStorage() + self.client = TushareClient() + self.max_workers = max_workers or self.DEFAULT_MAX_WORKERS + self._stop_flag = threading.Event() + self._stop_flag.set() # 初始为未停止状态 + self._cached_daily_data: Optional[pd.DataFrame] = None # 日线数据缓存 + + def _load_daily_data(self) -> pd.DataFrame: + """从存储加载日线数据(带缓存)。 + + 该方法会将数据缓存在内存中以避免重复磁盘读取。 + 调用 clear_cache() 可强制重新加载。 + + Returns: + 缓存或从存储加载的日线数据 DataFrame + """ + if self._cached_daily_data is None: + self._cached_daily_data = self.storage.load("daily") + return self._cached_daily_data + + def clear_cache(self) -> None: + """清除缓存的日线数据,强制下次访问时重新加载。""" + self._cached_daily_data = None + + def get_all_stock_codes(self, 
only_listed: bool = True) -> list: + """从本地存储获取所有股票代码。 + + 优先使用 stock_basic.csv 以确保包含所有股票, + 避免回测中的前视偏差。 + + Args: + only_listed: 若为 True,仅返回当前上市股票(L 状态)。 + 设为 False 可包含退市股票(用于完整回测)。 + + Returns: + 股票代码列表 + """ + # 确保 stock_basic.csv 是最新的 + print("[DailySync] Ensuring stock_basic.csv is up-to-date...") + sync_all_stocks() + + # 从 stock_basic.csv 文件获取 + stock_csv_path = _get_csv_path() + + if stock_csv_path.exists(): + print(f"[DailySync] Reading stock_basic from CSV: {stock_csv_path}") + try: + stock_df = pd.read_csv(stock_csv_path, encoding="utf-8-sig") + if not stock_df.empty and "ts_code" in stock_df.columns: + # 根据 list_status 过滤 + if only_listed and "list_status" in stock_df.columns: + listed_stocks = stock_df[stock_df["list_status"] == "L"] + codes = listed_stocks["ts_code"].unique().tolist() + total = len(stock_df["ts_code"].unique()) + print( + f"[DailySync] Found {len(codes)} listed stocks (filtered from {total} total)" + ) + else: + codes = stock_df["ts_code"].unique().tolist() + print( + f"[DailySync] Found {len(codes)} stock codes from stock_basic.csv" + ) + return codes + else: + print( + f"[DailySync] stock_basic.csv exists but no ts_code column or empty" + ) + except Exception as e: + print(f"[DailySync] Error reading stock_basic.csv: {e}") + + # 回退:从日线存储获取 + print( + "[DailySync] stock_basic.csv not available, falling back to daily data..." 
+ ) + daily_data = self._load_daily_data() + if not daily_data.empty and "ts_code" in daily_data.columns: + codes = daily_data["ts_code"].unique().tolist() + print(f"[DailySync] Found {len(codes)} stock codes from daily data") + return codes + + print("[DailySync] No stock codes found in local storage") + return [] + + def get_global_last_date(self) -> Optional[str]: + """获取全局最后交易日期。 + + Returns: + 最后交易日期字符串,若无数据则返回 None + """ + daily_data = self._load_daily_data() + if daily_data.empty or "trade_date" not in daily_data.columns: + return None + return str(daily_data["trade_date"].max()) + + def get_global_first_date(self) -> Optional[str]: + """获取全局最早交易日期。 + + Returns: + 最早交易日期字符串,若无数据则返回 None + """ + daily_data = self._load_daily_data() + if daily_data.empty or "trade_date" not in daily_data.columns: + return None + return str(daily_data["trade_date"].min()) + + def get_trade_calendar_bounds( + self, start_date: str, end_date: str + ) -> tuple[Optional[str], Optional[str]]: + """从交易日历获取首尾交易日。 + + Args: + start_date: 开始日期(YYYYMMDD 格式) + end_date: 结束日期(YYYYMMDD 格式) + + Returns: + (首交易日, 尾交易日) 元组,若出错则返回 (None, None) + """ + try: + first_day = get_first_trading_day(start_date, end_date) + last_day = get_last_trading_day(start_date, end_date) + return (first_day, last_day) + except Exception as e: + print(f"[ERROR] Failed to get trade calendar bounds: {e}") + return (None, None) + + def check_sync_needed( + self, + force_full: bool = False, + table_name: str = "daily", + ) -> tuple[bool, Optional[str], Optional[str], Optional[str]]: + """基于交易日历检查是否需要同步。 + + 该方法比较本地数据日期范围与交易日历, + 以确定是否需要获取新数据。 + + 逻辑: + - 若 force_full:需要同步,返回 (True, 20180101, today) + - 若无本地数据:需要同步,返回 (True, 20180101, today) + - 若存在本地数据: + - 从交易日历获取最后交易日 + - 若本地最后日期 >= 日历最后日期:无需同步 + - 否则:从本地最后日期+1 到最新交易日同步 + + Args: + force_full: 若为 True,始终返回需要同步 + table_name: 要检查的表名(默认: "daily") + + Returns: + (需要同步, 起始日期, 结束日期, 本地最后日期) + - 需要同步: True 表示应继续同步 + - 起始日期: 同步起始日期(无需同步时为 None) + - 结束日期: 同步结束日期(无需同步时为 None) 
+ - 本地最后日期: 本地数据最后日期(用于增量同步) + """ + # 若 force_full,始终同步 + if force_full: + print("[DailySync] Force full sync requested") + return (True, DEFAULT_START_DATE, get_today_date(), None) + + # 检查特定表的本地数据是否存在 + storage = Storage() + table_data = ( + storage.load(table_name) if storage.exists(table_name) else pd.DataFrame() + ) + + if table_data.empty or "trade_date" not in table_data.columns: + print( + f"[DailySync] No local data found for table '{table_name}', full sync needed" + ) + return (True, DEFAULT_START_DATE, get_today_date(), None) + + # 获取本地数据最后日期 + local_last_date = str(table_data["trade_date"].max()) + + print(f"[DailySync] Local data last date: {local_last_date}") + + # 从交易日历获取最新交易日 + today = get_today_date() + _, cal_last = self.get_trade_calendar_bounds(DEFAULT_START_DATE, today) + + if cal_last is None: + print("[DailySync] Failed to get trade calendar, proceeding with sync") + return (True, DEFAULT_START_DATE, today, local_last_date) + + print(f"[DailySync] Calendar last trading day: {cal_last}") + + # 比较本地最后日期与日历最后日期 + print( + f"[DailySync] Comparing: local={local_last_date} (type={type(local_last_date).__name__}), " + f"cal={cal_last} (type={type(cal_last).__name__})" + ) + try: + local_last_int = int(local_last_date) + cal_last_int = int(cal_last) + print( + f"[DailySync] Comparing integers: local={local_last_int} >= cal={cal_last_int} = " + f"{local_last_int >= cal_last_int}" + ) + if local_last_int >= cal_last_int: + print( + "[DailySync] Local data is up-to-date, SKIPPING sync (no tokens consumed)" + ) + return (False, None, None, None) + except (ValueError, TypeError) as e: + print(f"[ERROR] Date comparison failed: {e}") + + # 需要从本地最后日期+1 同步到最新交易日 + sync_start = get_next_date(local_last_date) + print(f"[DailySync] Incremental sync needed from {sync_start} to {cal_last}") + return (True, sync_start, cal_last, local_last_date) + + def preview_sync( + self, + force_full: bool = False, + start_date: Optional[str] = None, + end_date: Optional[str] 
= None, + sample_size: int = 3, + ) -> dict: + """预览同步数据量和样本(不实际同步)。 + + 该方法提供即将同步的数据的预览,包括: + - 将同步的股票数量 + - 同步日期范围 + - 预估总记录数 + - 前几只股票的样本数据 + + Args: + force_full: 若为 True,预览全量同步(从 20180101) + start_date: 手动指定起始日期(覆盖自动检测) + end_date: 手动指定结束日期(默认为今天) + sample_size: 预览用样本股票数量(默认: 3) + + Returns: + 包含预览信息的字典: + { + 'sync_needed': bool, + 'stock_count': int, + 'start_date': str, + 'end_date': str, + 'estimated_records': int, + 'sample_data': pd.DataFrame, + 'mode': str, # 'full' 或 'incremental' + } + """ + print("\n" + "=" * 60) + print("[DailySync] Preview Mode - Analyzing sync requirements...") + print("=" * 60) + + # 首先确保交易日历缓存是最新的 + print("[DailySync] Syncing trade calendar cache...") + sync_trade_cal_cache() + + # 确定日期范围 + if end_date is None: + end_date = get_today_date() + + # 检查是否需要同步 + sync_needed, cal_start, cal_end, local_last = self.check_sync_needed(force_full) + + if not sync_needed: + print("\n" + "=" * 60) + print("[DailySync] Preview Result") + print("=" * 60) + print(" Sync Status: NOT NEEDED") + print(" Reason: Local data is up-to-date with trade calendar") + print("=" * 60) + return { + "sync_needed": False, + "stock_count": 0, + "start_date": None, + "end_date": None, + "estimated_records": 0, + "sample_data": pd.DataFrame(), + "mode": "none", + } + + # 使用 check_sync_needed 返回的日期 + if cal_start and cal_end: + sync_start_date = cal_start + end_date = cal_end + else: + sync_start_date = start_date or DEFAULT_START_DATE + if end_date is None: + end_date = get_today_date() + + # 确定同步模式 + if force_full: + mode = "full" + print(f"[DailySync] Mode: FULL SYNC from {sync_start_date} to {end_date}") + elif local_last and cal_start and sync_start_date == get_next_date(local_last): + mode = "incremental" + print(f"[DailySync] Mode: INCREmental SYNC (bandwidth optimized)") + print(f"[DailySync] Sync from: {sync_start_date} to {end_date}") + else: + mode = "partial" + print(f"[DailySync] Mode: SYNC from {sync_start_date} to {end_date}") + + # 获取所有股票代码 + 
stock_codes = self.get_all_stock_codes() + if not stock_codes: + print("[DailySync] No stocks found to sync") + return { + "sync_needed": False, + "stock_count": 0, + "start_date": None, + "end_date": None, + "estimated_records": 0, + "sample_data": pd.DataFrame(), + "mode": "none", + } + + stock_count = len(stock_codes) + print(f"[DailySync] Total stocks to sync: {stock_count}") + + # 从前几只股票获取样本数据 + print(f"[DailySync] Fetching sample data from {sample_size} stocks...") + sample_data_list = [] + sample_codes = stock_codes[:sample_size] + + for ts_code in sample_codes: + try: + data = self.client.query( + "pro_bar", + ts_code=ts_code, + start_date=sync_start_date, + end_date=end_date, + factors="tor,vr", + ) + if not data.empty: + sample_data_list.append(data) + print(f" - {ts_code}: {len(data)} records") + except Exception as e: + print(f" - {ts_code}: Error fetching - {e}") + + # 合并样本数据 + sample_df = ( + pd.concat(sample_data_list, ignore_index=True) + if sample_data_list + else pd.DataFrame() + ) + + # 基于样本估算总记录数 + if not sample_df.empty: + avg_records_per_stock = len(sample_df) / len(sample_data_list) + estimated_records = int(avg_records_per_stock * stock_count) + else: + estimated_records = 0 + + # 显示预览结果 + print("\n" + "=" * 60) + print("[DailySync] Preview Result") + print("=" * 60) + print(f" Sync Mode: {mode.upper()}") + print(f" Date Range: {sync_start_date} to {end_date}") + print(f" Stocks to Sync: {stock_count}") + print(f" Sample Stocks Checked: {len(sample_data_list)}/{sample_size}") + print(f" Estimated Total Records: ~{estimated_records:,}") + + if not sample_df.empty: + print(f"\n Sample Data Preview (first {len(sample_df)} rows):") + print(" " + "-" * 56) + # 以紧凑格式显示样本数据 + preview_cols = [ + "ts_code", + "trade_date", + "open", + "high", + "low", + "close", + "vol", + ] + available_cols = [c for c in preview_cols if c in sample_df.columns] + sample_display = sample_df[available_cols].head(10) + for idx, row in sample_display.iterrows(): + 
print(f" {row.to_dict()}") + print(" " + "-" * 56) + + print("=" * 60) + + return { + "sync_needed": True, + "stock_count": stock_count, + "start_date": sync_start_date, + "end_date": end_date, + "estimated_records": estimated_records, + "sample_data": sample_df, + "mode": mode, + } + + def sync_single_stock( + self, + ts_code: str, + start_date: str, + end_date: str, + ) -> pd.DataFrame: + """同步单只股票的日线数据。 + + Args: + ts_code: 股票代码 + start_date: 起始日期(YYYYMMDD) + end_date: 结束日期(YYYYMMDD) + + Returns: + 包含日线市场数据的 DataFrame + """ + # 检查是否应该停止同步(用于异常处理) + if not self._stop_flag.is_set(): + return pd.DataFrame() + + try: + # 使用共享客户端进行跨线程速率限制 + data = self.client.query( + "pro_bar", + ts_code=ts_code, + start_date=start_date, + end_date=end_date, + factors="tor,vr", + ) + return data + except Exception as e: + # 设置停止标志以通知其他线程停止 + self._stop_flag.clear() + print(f"[ERROR] Exception syncing {ts_code}: {e}") + raise + + def sync_all( + self, + force_full: bool = False, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + max_workers: Optional[int] = None, + dry_run: bool = False, + ) -> Dict[str, pd.DataFrame]: + """同步本地存储中所有股票的日线数据。 + + 该函数: + 1. 从本地存储读取股票代码(daily 或 stock_basic) + 2. 检查交易日历确定是否需要同步: + - 若本地数据匹配交易日历边界,则跳过同步(节省 token) + - 否则,从本地最后日期+1 同步到最新交易日(带宽优化) + 3. 使用多线程并发获取(带速率限制) + 4. 跳过返回空数据的股票(退市/不可用) + 5. 
遇异常立即停止 + + Args: + force_full: 若为 True,强制从 20180101 完整重载 + start_date: 手动指定起始日期(覆盖自动检测) + end_date: 手动指定结束日期(默认为今天) + max_workers: 工作线程数(默认: 10) + dry_run: 若为 True,仅预览将要同步的内容,不写入数据 + + Returns: + 映射 ts_code 到 DataFrame 的字典(若跳过或 dry_run 则为空字典) + """ + print("\n" + "=" * 60) + print("[DailySync] Starting daily data sync...") + print("=" * 60) + + # 首先确保交易日历缓存是最新的(使用增量同步) + print("[DailySync] Syncing trade calendar cache...") + sync_trade_cal_cache() + + # 确定日期范围 + if end_date is None: + end_date = get_today_date() + + # 基于交易日历检查是否需要同步 + sync_needed, cal_start, cal_end, local_last = self.check_sync_needed(force_full) + + if not sync_needed: + # 跳过同步 - 不消耗 token + print("\n" + "=" * 60) + print("[DailySync] Sync Summary") + print("=" * 60) + print(" Sync: SKIPPED (local data up-to-date with trade calendar)") + print(" Tokens saved: 0 consumed") + print("=" * 60) + return {} + + # 使用 check_sync_needed 返回的日期(会计算增量起始日期) + if cal_start and cal_end: + sync_start_date = cal_start + end_date = cal_end + else: + # 回退到默认逻辑 + sync_start_date = start_date or DEFAULT_START_DATE + if end_date is None: + end_date = get_today_date() + + # 确定同步模式 + if force_full: + mode = "full" + print(f"[DailySync] Mode: FULL SYNC from {sync_start_date} to {end_date}") + elif local_last and cal_start and sync_start_date == get_next_date(local_last): + mode = "incremental" + print(f"[DailySync] Mode: INCREMENTAL SYNC (bandwidth optimized)") + print(f"[DailySync] Sync from: {sync_start_date} to {end_date}") + else: + mode = "partial" + print(f"[DailySync] Mode: SYNC from {sync_start_date} to {end_date}") + + # 获取所有股票代码 + stock_codes = self.get_all_stock_codes() + if not stock_codes: + print("[DailySync] No stocks found to sync") + return {} + + print(f"[DailySync] Total stocks to sync: {len(stock_codes)}") + print(f"[DailySync] Using {max_workers or self.max_workers} worker threads") + + # 处理 dry run 模式 + if dry_run: + print("\n" + "=" * 60) + print("[DailySync] DRY RUN MODE - No data will be 
written") + print("=" * 60) + print(f" Would sync {len(stock_codes)} stocks") + print(f" Date range: {sync_start_date} to {end_date}") + print(f" Mode: {mode}") + print("=" * 60) + return {} + + # 为新同步重置停止标志 + self._stop_flag.set() + + # 多线程并发获取 + results: Dict[str, pd.DataFrame] = {} + error_occurred = False + exception_to_raise = None + + def sync_task(ts_code: str) -> tuple[str, pd.DataFrame]: + """每只股票的任务函数。""" + try: + data = self.sync_single_stock( + ts_code=ts_code, + start_date=sync_start_date, + end_date=end_date, + ) + return (ts_code, data) + except Exception as e: + # 重新抛出以被 Future 捕获 + raise + + # 使用 ThreadPoolExecutor 进行并发获取 + workers = max_workers or self.max_workers + with ThreadPoolExecutor(max_workers=workers) as executor: + # 提交所有任务并跟踪 futures 与股票代码的映射 + future_to_code = { + executor.submit(sync_task, ts_code): ts_code for ts_code in stock_codes + } + + # 使用 as_completed 处理结果 + error_count = 0 + empty_count = 0 + success_count = 0 + + # 创建进度条 + pbar = tqdm(total=len(stock_codes), desc="Syncing stocks") + + try: + # 处理完成的 futures + for future in as_completed(future_to_code): + ts_code = future_to_code[future] + + try: + _, data = future.result() + if data is not None and not data.empty: + results[ts_code] = data + success_count += 1 + else: + # 空数据 - 股票可能已退市或不可用 + empty_count += 1 + print( + f"[DailySync] Stock {ts_code}: empty data (skipped, may be delisted)" + ) + except Exception as e: + # 发生异常 - 停止全部并中止 + error_occurred = True + exception_to_raise = e + print(f"\n[ERROR] Sync aborted due to exception: {e}") + # 关闭 executor 以停止所有待处理任务 + executor.shutdown(wait=False, cancel_futures=True) + raise exception_to_raise + + # 更新进度条 + pbar.update(1) + + except Exception: + error_count = 1 + print("[DailySync] Sync stopped due to exception") + finally: + pbar.close() + + # 批量写入所有数据(仅在无错误时) + if results and not error_occurred: + for ts_code, data in results.items(): + if not data.empty: + self.storage.queue_save("daily", data) + # 一次性刷新所有排队写入 + 
self.storage.flush() + total_rows = sum(len(df) for df in results.values()) + print(f"\n[DailySync] Saved {total_rows} rows to storage") + + # 摘要 + print("\n" + "=" * 60) + print("[DailySync] Sync Summary") + print("=" * 60) + print(f" Total stocks: {len(stock_codes)}") + print(f" Updated: {success_count}") + print(f" Skipped (empty/delisted): {empty_count}") + print( + f" Errors: {error_count} (aborted on first error)" + if error_count + else " Errors: 0" + ) + print(f" Date range: {sync_start_date} to {end_date}") + print("=" * 60) + + return results + + +def sync_daily( + force_full: bool = False, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + max_workers: Optional[int] = None, + dry_run: bool = False, +) -> Dict[str, pd.DataFrame]: + """同步所有股票的日线数据。 + + 这是日线数据同步的主要入口点。 + + Args: + force_full: 若为 True,强制从 20180101 完整重载 + start_date: 手动指定起始日期(YYYYMMDD) + end_date: 手动指定结束日期(默认为今天) + max_workers: 工作线程数(默认: 10) + dry_run: 若为 True,仅预览将要同步的内容,不写入数据 + + Returns: + 映射 ts_code 到 DataFrame 的字典 + + Example: + >>> # 首次同步(从 20180101 全量加载) + >>> result = sync_daily() + >>> + >>> # 后续同步(增量 - 仅新数据) + >>> result = sync_daily() + >>> + >>> # 强制完整重载 + >>> result = sync_daily(force_full=True) + >>> + >>> # 手动指定日期范围 + >>> result = sync_daily(start_date='20240101', end_date='20240131') + >>> + >>> # 自定义线程数 + >>> result = sync_daily(max_workers=20) + >>> + >>> # Dry run(仅预览) + >>> result = sync_daily(dry_run=True) + """ + sync_manager = DailySync(max_workers=max_workers) + return sync_manager.sync_all( + force_full=force_full, + start_date=start_date, + end_date=end_date, + dry_run=dry_run, + ) + + +def preview_daily_sync( + force_full: bool = False, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + sample_size: int = 3, +) -> dict: + """预览日线同步数据量和样本(不实际同步)。 + + 这是推荐的方式,可在实际同步前检查将要同步的内容。 + + Args: + force_full: 若为 True,预览全量同步(从 20180101) + start_date: 手动指定起始日期(覆盖自动检测) + end_date: 手动指定结束日期(默认为今天) + sample_size: 预览用样本股票数量(默认: 3) + + 
Returns: + 包含预览信息的字典: + { + 'sync_needed': bool, + 'stock_count': int, + 'start_date': str, + 'end_date': str, + 'estimated_records': int, + 'sample_data': pd.DataFrame, + 'mode': str, # 'full', 'incremental', 'partial', 或 'none' + } + + Example: + >>> # 预览将要同步的内容 + >>> preview = preview_daily_sync() + >>> + >>> # 预览全量同步 + >>> preview = preview_daily_sync(force_full=True) + >>> + >>> # 预览更多样本 + >>> preview = preview_daily_sync(sample_size=5) + """ + sync_manager = DailySync() + return sync_manager.preview_sync( + force_full=force_full, + start_date=start_date, + end_date=end_date, + sample_size=sample_size, + ) diff --git a/src/data/api_wrappers/api_namechange.py b/src/data/api_wrappers/api_namechange.py new file mode 100644 index 0000000..dbddd83 --- /dev/null +++ b/src/data/api_wrappers/api_namechange.py @@ -0,0 +1,113 @@ +"""Stock name change history interface. + +Fetch historical name change records for stocks. +This interface retrieves all historical name changes including name, dates, and change reasons. +""" + +import pandas as pd +from pathlib import Path +from typing import Optional, List +from src.data.client import TushareClient +from src.data.config import get_config + + +# CSV file path for namechange data +def _get_csv_path() -> Path: + """Get the CSV file path for namechange data.""" + cfg = get_config() + return cfg.data_path_resolved / "namechange.csv" + + +def get_namechange( + ts_code: Optional[str] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + fields: Optional[List[str]] = None, +) -> pd.DataFrame: + """Fetch stock name change history. + + This interface retrieves historical name change records for stocks, + including name, start/end dates, announcement date, and change reason. 
+ + Args: + ts_code: TS stock code (optional, if not provided, returns all stocks) + start_date: Start date for announcement date range (YYYYMMDD) + end_date: End date for announcement date range (YYYYMMDD) + fields: Specific fields to return, None returns all fields + + Returns: + pd.DataFrame with namechange information containing: + - ts_code: TS stock code + - name: Security name + - start_date: Start date of the name + - end_date: End date of the name + - ann_date: Announcement date + - change_reason: Reason for name change + """ + client = TushareClient() + + # Build parameters + params = {} + if ts_code: + params["ts_code"] = ts_code + if start_date: + params["start_date"] = start_date + if end_date: + params["end_date"] = end_date + if fields: + params["fields"] = ",".join(fields) + + # Fetch data + data = client.query("namechange", **params) + + if data.empty: + print("[get_namechange] No data returned") + + return data + + +def sync_namechange(force: bool = False) -> pd.DataFrame: + """Fetch and save all stock name change records to local CSV. + + This is a full load interface - fetches all historical name change records. + Each call fetches all data (no incremental sync). 
+ + Args: + force: If True, force re-fetch even if CSV exists (default: False) + + Returns: + pd.DataFrame with all namechange records + """ + csv_path = _get_csv_path() + + # Check if CSV file already exists + if csv_path.exists() and not force: + print(f"[sync_namechange] namechange.csv already exists at {csv_path}") + print("[sync_namechange] Use force=True to re-fetch") + return pd.read_csv(csv_path, encoding="utf-8-sig") + + print("[sync_namechange] Fetching all stock name changes...") + + # Fetch all namechange data (no parameters = all stocks, all history) + client = TushareClient() + data = client.query("namechange") + + if data.empty: + print("[sync_namechange] No namechange data fetched") + return pd.DataFrame() + + print(f"[sync_namechange] Fetched {len(data)} name change records") + + # Save to CSV + data.to_csv(csv_path, index=False, encoding="utf-8-sig") + print(f"[sync_namechange] Saved {len(data)} records to {csv_path}") + return data + + +if __name__ == "__main__": + # Sync all namechange records to data folder + result = sync_namechange() + print(f"Total records synced: {len(result)}") + if not result.empty: + print("\nSample data:") + print(result.head(10)) diff --git a/src/data/sync.py b/src/data/sync.py index 39fd960..27ecffb 100644 --- a/src/data/sync.py +++ b/src/data/sync.py @@ -1,701 +1,34 @@ -"""Data synchronization module. 
+"""数据同步调度中心模块。 -This module provides data fetching functions with intelligent sync logic: -- If local file doesn't exist: fetch all data (full load from 20180101) -- If local file exists: incremental update (fetch from latest date + 1 day) -- Multi-threaded concurrent fetching for improved performance -- Stop immediately on any exception -- Preview mode: check data volume and samples before actual sync +该模块作为数据同步的调度中心,统一管理各类型数据的同步流程。 +具体的同步逻辑已迁移到对应的 api_xxx.py 文件中: +- api_daily.py: 日线数据同步 (DailySync 类) +- api_bak_basic.py: 历史股票列表同步 +- api_stock_basic.py: 股票基本信息同步 +- api_trade_cal.py: 交易日历同步 -Currently supported data types: -- daily: Daily market data (with turnover rate and volume ratio) +注意:名称变更 (namechange) 已从自动同步中移除, +因为股票名称变更不频繁,建议手动定期同步。 -Usage: - # Preview sync (check data volume and samples without writing) - preview_sync() +使用方式: + # 预览同步(检查数据量,不写入) + from src.data.sync import preview_sync + preview = preview_sync() - # Sync all stocks (full load) - sync_all() + # 同步所有数据(不包括 namechange) + from src.data.sync import sync_all_data + result = sync_all_data() - # Sync all stocks (incremental) - sync_all() - - # Force full reload - sync_all(force_full=True) - - # Dry run (preview only, no write) - sync_all(dry_run=True) + # 强制全量重载 + result = sync_all_data(force_full=True) """ +from typing import Optional, Dict + import pandas as pd -from typing import Optional, Dict, Callable -from datetime import datetime, timedelta -from tqdm import tqdm -from concurrent.futures import ThreadPoolExecutor, as_completed -import threading -import sys -from src.data.client import TushareClient -from src.data.storage import ThreadSafeStorage -from src.data.api_wrappers import get_daily -from src.data.api_wrappers import ( - get_first_trading_day, - get_last_trading_day, - sync_trade_cal_cache, -) - - -# Default full sync start date -DEFAULT_START_DATE = "20180101" - -# Today's date in YYYYMMDD format -TODAY = datetime.now().strftime("%Y%m%d") - - -def get_today_date() -> str: - 
"""Get today's date in YYYYMMDD format.""" - return TODAY - - -def get_next_date(date_str: str) -> str: - """Get the next day after the given date. - - Args: - date_str: Date in YYYYMMDD format - - Returns: - Next date in YYYYMMDD format - """ - dt = datetime.strptime(date_str, "%Y%m%d") - next_dt = dt + timedelta(days=1) - return next_dt.strftime("%Y%m%d") - - -class DataSync: - """Data synchronization manager with full/incremental sync support.""" - - # Default number of worker threads - DEFAULT_MAX_WORKERS = 10 - - def __init__(self, max_workers: Optional[int] = None): - """Initialize sync manager. - - Args: - max_workers: Number of worker threads (default: 10) - """ - self.storage = ThreadSafeStorage() - self.client = TushareClient() - self.max_workers = max_workers or self.DEFAULT_MAX_WORKERS - self._stop_flag = threading.Event() - self._stop_flag.set() # Initially not stopped - self._cached_daily_data: Optional[pd.DataFrame] = None # Cache for daily data - - def _load_daily_data(self) -> pd.DataFrame: - """Load daily data from storage with caching. - - This method caches the daily data in memory to avoid repeated disk reads. - Call clear_cache() to force reload. - - Returns: - DataFrame with daily data (cached or loaded from storage) - """ - if self._cached_daily_data is None: - self._cached_daily_data = self.storage.load("daily") - return self._cached_daily_data - - def clear_cache(self) -> None: - """Clear the cached daily data to force reload on next access.""" - self._cached_daily_data = None - - def get_all_stock_codes(self, only_listed: bool = True) -> list: - """Get all stock codes from local storage. - - This function prioritizes stock_basic.csv to ensure all stocks - are included for backtesting to avoid look-ahead bias. - - Args: - only_listed: If True, only return currently listed stocks (L status). - Set to False to include delisted stocks (for full backtest). 
- - Returns: - List of stock codes - """ - # Import sync_all_stocks here to avoid circular imports - from src.data.api_wrappers import sync_all_stocks - from src.data.api_wrappers.api_stock_basic import _get_csv_path - - # First, ensure stock_basic.csv is up-to-date with all stocks - print("[DataSync] Ensuring stock_basic.csv is up-to-date...") - sync_all_stocks() - - # Get from stock_basic.csv file - stock_csv_path = _get_csv_path() - - if stock_csv_path.exists(): - print(f"[DataSync] Reading stock_basic from CSV: {stock_csv_path}") - try: - stock_df = pd.read_csv(stock_csv_path, encoding="utf-8-sig") - if not stock_df.empty and "ts_code" in stock_df.columns: - # Filter by list_status if only_listed is True - if only_listed and "list_status" in stock_df.columns: - listed_stocks = stock_df[stock_df["list_status"] == "L"] - codes = listed_stocks["ts_code"].unique().tolist() - total = len(stock_df["ts_code"].unique()) - print( - f"[DataSync] Found {len(codes)} listed stocks (filtered from {total} total)" - ) - else: - codes = stock_df["ts_code"].unique().tolist() - print( - f"[DataSync] Found {len(codes)} stock codes from stock_basic.csv" - ) - return codes - else: - print( - f"[DataSync] stock_basic.csv exists but no ts_code column or empty" - ) - except Exception as e: - print(f"[DataSync] Error reading stock_basic.csv: {e}") - - # Fallback: try daily storage if stock_basic not available (using cached data) - print("[DataSync] stock_basic.csv not available, falling back to daily data...") - daily_data = self._load_daily_data() - if not daily_data.empty and "ts_code" in daily_data.columns: - codes = daily_data["ts_code"].unique().tolist() - print(f"[DataSync] Found {len(codes)} stock codes from daily data") - return codes - - print("[DataSync] No stock codes found in local storage") - return [] - - def get_global_last_date(self) -> Optional[str]: - """Get the global last trade date across all stocks. 
- - Returns: - Last trade date string or None - """ - daily_data = self._load_daily_data() - if daily_data.empty or "trade_date" not in daily_data.columns: - return None - return str(daily_data["trade_date"].max()) - - def get_global_first_date(self) -> Optional[str]: - """Get the global first trade date across all stocks. - - Returns: - First trade date string or None - """ - daily_data = self._load_daily_data() - if daily_data.empty or "trade_date" not in daily_data.columns: - return None - return str(daily_data["trade_date"].min()) - - def get_trade_calendar_bounds( - self, start_date: str, end_date: str - ) -> tuple[Optional[str], Optional[str]]: - """Get the first and last trading day from trade calendar. - - Args: - start_date: Start date in YYYYMMDD format - end_date: End date in YYYYMMDD format - - Returns: - Tuple of (first_trading_day, last_trading_day) or (None, None) if error - """ - try: - first_day = get_first_trading_day(start_date, end_date) - last_day = get_last_trading_day(start_date, end_date) - return (first_day, last_day) - except Exception as e: - print(f"[ERROR] Failed to get trade calendar bounds: {e}") - return (None, None) - - def check_sync_needed( - self, force_full: bool = False - ) -> tuple[bool, Optional[str], Optional[str], Optional[str]]: - """Check if sync is needed based on trade calendar. - - This method compares local data date range with trade calendar - to determine if new data needs to be fetched. 
- - Logic: - - If force_full: sync needed, return (True, 20180101, today) - - If no local data: sync needed, return (True, 20180101, today) - - If local data exists: - - Get the last trading day from trade calendar - - If local last date >= calendar last date: NO sync needed - - Otherwise: sync needed from local_last_date + 1 to latest trade day - - Args: - force_full: If True, always return sync needed - - Returns: - Tuple of (sync_needed, start_date, end_date, local_last_date) - - sync_needed: True if sync should proceed, False to skip - - start_date: Sync start date (None if sync not needed) - - end_date: Sync end date (None if sync not needed) - - local_last_date: Local data last date (for incremental sync) - """ - # If force_full, always sync - if force_full: - print("[DataSync] Force full sync requested") - return (True, DEFAULT_START_DATE, get_today_date(), None) - - # Check if local data exists (using cached data) - daily_data = self._load_daily_data() - if daily_data.empty or "trade_date" not in daily_data.columns: - print("[DataSync] No local data found, full sync needed") - return (True, DEFAULT_START_DATE, get_today_date(), None) - - # Get local data last date (we only care about the latest date, not the first) - local_last_date = str(daily_data["trade_date"].max()) - - print(f"[DataSync] Local data last date: {local_last_date}") - - # Get the latest trading day from trade calendar - today = get_today_date() - _, cal_last = self.get_trade_calendar_bounds(DEFAULT_START_DATE, today) - - if cal_last is None: - print("[DataSync] Failed to get trade calendar, proceeding with sync") - return (True, DEFAULT_START_DATE, today, local_last_date) - - print(f"[DataSync] Calendar last trading day: {cal_last}") - - # Compare local last date with calendar last date - # If local data is already up-to-date or newer, no sync needed - print( - f"[DataSync] Comparing: local={local_last_date} (type={type(local_last_date).__name__}), cal={cal_last} 
(type={type(cal_last).__name__})" - ) - try: - local_last_int = int(local_last_date) - cal_last_int = int(cal_last) - print( - f"[DataSync] Comparing integers: local={local_last_int} >= cal={cal_last_int} = {local_last_int >= cal_last_int}" - ) - if local_last_int >= cal_last_int: - print( - "[DataSync] Local data is up-to-date, SKIPPING sync (no tokens consumed)" - ) - return (False, None, None, None) - except (ValueError, TypeError) as e: - print(f"[ERROR] Date comparison failed: {e}") - - # Need to sync from local_last_date + 1 to latest trade day - sync_start = get_next_date(local_last_date) - print(f"[DataSync] Incremental sync needed from {sync_start} to {cal_last}") - return (True, sync_start, cal_last, local_last_date) - - def preview_sync( - self, - force_full: bool = False, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - sample_size: int = 3, - ) -> dict: - """Preview sync data volume and samples without actually syncing. - - This method provides a preview of what would be synced, including: - - Number of stocks to be synced - - Date range for sync - - Estimated total records - - Sample data from first few stocks - - Args: - force_full: If True, preview full sync from 20180101 - start_date: Manual start date (overrides auto-detection) - end_date: Manual end date (defaults to today) - sample_size: Number of sample stocks to fetch for preview (default: 3) - - Returns: - Dictionary with preview information: - { - 'sync_needed': bool, - 'stock_count': int, - 'start_date': str, - 'end_date': str, - 'estimated_records': int, - 'sample_data': pd.DataFrame, - 'mode': str, # 'full' or 'incremental' - } - """ - print("\n" + "=" * 60) - print("[DataSync] Preview Mode - Analyzing sync requirements...") - print("=" * 60) - - # First, ensure trade calendar cache is up-to-date - print("[DataSync] Syncing trade calendar cache...") - sync_trade_cal_cache() - - # Determine date range - if end_date is None: - end_date = get_today_date() - - # Check 
if sync is needed - sync_needed, cal_start, cal_end, local_last = self.check_sync_needed(force_full) - - if not sync_needed: - print("\n" + "=" * 60) - print("[DataSync] Preview Result") - print("=" * 60) - print(" Sync Status: NOT NEEDED") - print(" Reason: Local data is up-to-date with trade calendar") - print("=" * 60) - return { - "sync_needed": False, - "stock_count": 0, - "start_date": None, - "end_date": None, - "estimated_records": 0, - "sample_data": pd.DataFrame(), - "mode": "none", - } - - # Use dates from check_sync_needed - if cal_start and cal_end: - sync_start_date = cal_start - end_date = cal_end - else: - sync_start_date = start_date or DEFAULT_START_DATE - if end_date is None: - end_date = get_today_date() - - # Determine sync mode - if force_full: - mode = "full" - print(f"[DataSync] Mode: FULL SYNC from {sync_start_date} to {end_date}") - elif local_last and cal_start and sync_start_date == get_next_date(local_last): - mode = "incremental" - print(f"[DataSync] Mode: INCREMENTAL SYNC (bandwidth optimized)") - print(f"[DataSync] Sync from: {sync_start_date} to {end_date}") - else: - mode = "partial" - print(f"[DataSync] Mode: SYNC from {sync_start_date} to {end_date}") - - # Get all stock codes - stock_codes = self.get_all_stock_codes() - if not stock_codes: - print("[DataSync] No stocks found to sync") - return { - "sync_needed": False, - "stock_count": 0, - "start_date": None, - "end_date": None, - "estimated_records": 0, - "sample_data": pd.DataFrame(), - "mode": "none", - } - - stock_count = len(stock_codes) - print(f"[DataSync] Total stocks to sync: {stock_count}") - - # Fetch sample data from first few stocks - print(f"[DataSync] Fetching sample data from {sample_size} stocks...") - sample_data_list = [] - sample_codes = stock_codes[:sample_size] - - for ts_code in sample_codes: - try: - data = self.client.query( - "pro_bar", - ts_code=ts_code, - start_date=sync_start_date, - end_date=end_date, - factors="tor,vr", - ) - if not data.empty: - 
sample_data_list.append(data) - print(f" - {ts_code}: {len(data)} records") - except Exception as e: - print(f" - {ts_code}: Error fetching - {e}") - - # Combine sample data - sample_df = ( - pd.concat(sample_data_list, ignore_index=True) - if sample_data_list - else pd.DataFrame() - ) - - # Estimate total records based on sample - if not sample_df.empty: - avg_records_per_stock = len(sample_df) / len(sample_data_list) - estimated_records = int(avg_records_per_stock * stock_count) - else: - estimated_records = 0 - - # Display preview results - print("\n" + "=" * 60) - print("[DataSync] Preview Result") - print("=" * 60) - print(f" Sync Mode: {mode.upper()}") - print(f" Date Range: {sync_start_date} to {end_date}") - print(f" Stocks to Sync: {stock_count}") - print(f" Sample Stocks Checked: {len(sample_data_list)}/{sample_size}") - print(f" Estimated Total Records: ~{estimated_records:,}") - - if not sample_df.empty: - print(f"\n Sample Data Preview (first {len(sample_df)} rows):") - print(" " + "-" * 56) - # Display sample data in a compact format - preview_cols = [ - "ts_code", - "trade_date", - "open", - "high", - "low", - "close", - "vol", - ] - available_cols = [c for c in preview_cols if c in sample_df.columns] - sample_display = sample_df[available_cols].head(10) - for idx, row in sample_display.iterrows(): - print(f" {row.to_dict()}") - print(" " + "-" * 56) - - print("=" * 60) - - return { - "sync_needed": True, - "stock_count": stock_count, - "start_date": sync_start_date, - "end_date": end_date, - "estimated_records": estimated_records, - "sample_data": sample_df, - "mode": mode, - } - - def sync_single_stock( - self, - ts_code: str, - start_date: str, - end_date: str, - ) -> pd.DataFrame: - """Sync daily data for a single stock. 
- - Args: - ts_code: Stock code - start_date: Start date (YYYYMMDD) - end_date: End date (YYYYMMDD) - - Returns: - DataFrame with daily market data - """ - # Check if sync should stop (for exception handling) - if not self._stop_flag.is_set(): - return pd.DataFrame() - - try: - # Use shared client for rate limiting across threads - data = self.client.query( - "pro_bar", - ts_code=ts_code, - start_date=start_date, - end_date=end_date, - factors="tor,vr", - ) - return data - except Exception as e: - # Set stop flag to signal other threads to stop - self._stop_flag.clear() - print(f"[ERROR] Exception syncing {ts_code}: {e}") - raise - - def sync_all( - self, - force_full: bool = False, - start_date: Optional[str] = None, - end_date: Optional[str] = None, - max_workers: Optional[int] = None, - dry_run: bool = False, - ) -> Dict[str, pd.DataFrame]: - """Sync daily data for all stocks in local storage. - - This function: - 1. Reads stock codes from local storage (daily or stock_basic) - 2. Checks trade calendar to determine if sync is needed: - - If local data matches trade calendar bounds, SKIP sync (save tokens) - - Otherwise, sync from local_last_date + 1 to latest trade day (bandwidth optimized) - 3. Uses multi-threaded concurrent fetching with rate limiting - 4. Skips updating stocks that return empty data (delisted/unavailable) - 5. 
Stops immediately on any exception - - Args: - force_full: If True, force full reload from 20180101 - start_date: Manual start date (overrides auto-detection) - end_date: Manual end date (defaults to today) - max_workers: Number of worker threads (default: 10) - dry_run: If True, only preview what would be synced without writing data - - Returns: - Dict mapping ts_code to DataFrame (empty if sync skipped or dry_run) - """ - print("\n" + "=" * 60) - print("[DataSync] Starting daily data sync...") - print("=" * 60) - - # First, ensure trade calendar cache is up-to-date (uses incremental sync) - print("[DataSync] Syncing trade calendar cache...") - sync_trade_cal_cache() - - # Determine date range - if end_date is None: - end_date = get_today_date() - - # Check if sync is needed based on trade calendar - sync_needed, cal_start, cal_end, local_last = self.check_sync_needed(force_full) - - if not sync_needed: - # Sync skipped - no tokens consumed - print("\n" + "=" * 60) - print("[DataSync] Sync Summary") - print("=" * 60) - print(" Sync: SKIPPED (local data up-to-date with trade calendar)") - print(" Tokens saved: 0 consumed") - print("=" * 60) - return {} - - # Use dates from check_sync_needed (which calculates incremental start if needed) - if cal_start and cal_end: - sync_start_date = cal_start - end_date = cal_end - else: - # Fallback to default logic - sync_start_date = start_date or DEFAULT_START_DATE - if end_date is None: - end_date = get_today_date() - - # Determine sync mode - if force_full: - mode = "full" - print(f"[DataSync] Mode: FULL SYNC from {sync_start_date} to {end_date}") - elif local_last and cal_start and sync_start_date == get_next_date(local_last): - mode = "incremental" - print(f"[DataSync] Mode: INCREMENTAL SYNC (bandwidth optimized)") - print(f"[DataSync] Sync from: {sync_start_date} to {end_date}") - else: - mode = "partial" - print(f"[DataSync] Mode: SYNC from {sync_start_date} to {end_date}") - - # Get all stock codes - stock_codes = 
self.get_all_stock_codes() - if not stock_codes: - print("[DataSync] No stocks found to sync") - return {} - - print(f"[DataSync] Total stocks to sync: {len(stock_codes)}") - print(f"[DataSync] Using {max_workers or self.max_workers} worker threads") - - # Handle dry run mode - if dry_run: - print("\n" + "=" * 60) - print("[DataSync] DRY RUN MODE - No data will be written") - print("=" * 60) - print(f" Would sync {len(stock_codes)} stocks") - print(f" Date range: {sync_start_date} to {end_date}") - print(f" Mode: {mode}") - print("=" * 60) - return {} - - # Reset stop flag for new sync - self._stop_flag.set() - - # Multi-threaded concurrent fetching - results: Dict[str, pd.DataFrame] = {} - error_occurred = False - exception_to_raise = None - - def sync_task(ts_code: str) -> tuple[str, pd.DataFrame]: - """Task function for each stock.""" - try: - data = self.sync_single_stock( - ts_code=ts_code, - start_date=sync_start_date, - end_date=end_date, - ) - return (ts_code, data) - except Exception as e: - # Re-raise to be caught by Future - raise - - # Use ThreadPoolExecutor for concurrent fetching - workers = max_workers or self.max_workers - with ThreadPoolExecutor(max_workers=workers) as executor: - # Submit all tasks and track futures with their stock codes - future_to_code = { - executor.submit(sync_task, ts_code): ts_code for ts_code in stock_codes - } - - # Process results using as_completed - error_count = 0 - empty_count = 0 - success_count = 0 - - # Create progress bar - pbar = tqdm(total=len(stock_codes), desc="Syncing stocks") - - try: - # Process futures as they complete - for future in as_completed(future_to_code): - ts_code = future_to_code[future] - - try: - _, data = future.result() - if data is not None and not data.empty: - results[ts_code] = data - success_count += 1 - else: - # Empty data - stock may be delisted or unavailable - empty_count += 1 - print( - f"[DataSync] Stock {ts_code}: empty data (skipped, may be delisted)" - ) - except Exception as 
e: - # Exception occurred - stop all and abort - error_occurred = True - exception_to_raise = e - print(f"\n[ERROR] Sync aborted due to exception: {e}") - # Shutdown executor to stop all pending tasks - executor.shutdown(wait=False, cancel_futures=True) - raise exception_to_raise - - # Update progress bar - pbar.update(1) - - except Exception: - error_count = 1 - print("[DataSync] Sync stopped due to exception") - finally: - pbar.close() - - # Queue all data for batch write (only if no error) - if results and not error_occurred: - for ts_code, data in results.items(): - if not data.empty: - self.storage.queue_save("daily", data) - # Flush all queued writes at once - self.storage.flush() - total_rows = sum(len(df) for df in results.values()) - print(f"\n[DataSync] Saved {total_rows} rows to storage") - - # Summary - print("\n" + "=" * 60) - print("[DataSync] Sync Summary") - print("=" * 60) - print(f" Total stocks: {len(stock_codes)}") - print(f" Updated: {success_count}") - print(f" Skipped (empty/delisted): {empty_count}") - print( - f" Errors: {error_count} (aborted on first error)" - if error_count - else " Errors: 0" - ) - print(f" Date range: {sync_start_date} to {end_date}") - print("=" * 60) - - return results - - -# Convenience functions +from src.data.api_wrappers import sync_all_stocks +from src.data.api_wrappers.api_daily import sync_daily, preview_daily_sync def preview_sync( @@ -705,20 +38,19 @@ def preview_sync( sample_size: int = 3, max_workers: Optional[int] = None, ) -> dict: - """Preview sync data volume and samples without actually syncing. + """预览日线同步数据量和样本(不实际同步)。 - This is the recommended way to check what would be synced before - running the actual synchronization. 
+ 这是推荐的方式,可在实际同步前检查将要同步的内容。 Args: - force_full: If True, preview full sync from 20180101 - start_date: Manual start date (overrides auto-detection) - end_date: Manual end date (defaults to today) - sample_size: Number of sample stocks to fetch for preview (default: 3) - max_workers: Number of worker threads (not used in preview, for API compatibility) + force_full: 若为 True,预览全量同步(从 20180101) + start_date: 手动指定起始日期(覆盖自动检测) + end_date: 手动指定结束日期(默认为今天) + sample_size: 预览用样本股票数量(默认: 3) + max_workers: 工作线程数(默认: 10) Returns: - Dictionary with preview information: + 包含预览信息的字典: { 'sync_needed': bool, 'stock_count': int, @@ -726,21 +58,20 @@ def preview_sync( 'end_date': str, 'estimated_records': int, 'sample_data': pd.DataFrame, - 'mode': str, # 'full', 'incremental', 'partial', or 'none' + 'mode': str, # 'full', 'incremental', 'partial', 或 'none' } Example: - >>> # Preview what would be synced + >>> # 预览将要同步的内容 >>> preview = preview_sync() >>> - >>> # Preview full sync + >>> # 预览全量同步 >>> preview = preview_sync(force_full=True) >>> - >>> # Preview with more samples + >>> # 预览更多样本 >>> preview = preview_sync(sample_size=5) """ - sync_manager = DataSync(max_workers=max_workers) - return sync_manager.preview_sync( + return preview_daily_sync( force_full=force_full, start_date=start_date, end_date=end_date, @@ -755,54 +86,168 @@ def sync_all( max_workers: Optional[int] = None, dry_run: bool = False, ) -> Dict[str, pd.DataFrame]: - """Sync daily data for all stocks. + """同步所有股票的日线数据。 - This is the main entry point for data synchronization. 
+ 这是日线数据同步的主要入口点。 Args: - force_full: If True, force full reload from 20180101 - start_date: Manual start date (YYYYMMDD) - end_date: Manual end date (defaults to today) - max_workers: Number of worker threads (default: 10) - dry_run: If True, only preview what would be synced without writing data + force_full: 若为 True,强制从 20180101 完整重载 + start_date: 手动指定起始日期(YYYYMMDD) + end_date: 手动指定结束日期(默认为今天) + max_workers: 工作线程数(默认: 10) + dry_run: 若为 True,仅预览将要同步的内容,不写入数据 Returns: - Dict mapping ts_code to DataFrame + 映射 ts_code 到 DataFrame 的字典 Example: - >>> # First time sync (full load from 20180101) + >>> # 首次同步(从 20180101 全量加载) >>> result = sync_all() >>> - >>> # Subsequent sync (incremental - only new data) + >>> # 后续同步(增量 - 仅新数据) >>> result = sync_all() >>> - >>> # Force full reload + >>> # 强制完整重载 >>> result = sync_all(force_full=True) >>> - >>> # Manual date range + >>> # 手动指定日期范围 >>> result = sync_all(start_date='20240101', end_date='20240131') >>> - >>> # Custom thread count + >>> # 自定义线程数 >>> result = sync_all(max_workers=20) >>> - >>> # Dry run (preview only) + >>> # Dry run(仅预览) >>> result = sync_all(dry_run=True) """ - sync_manager = DataSync(max_workers=max_workers) - return sync_manager.sync_all( + return sync_daily( force_full=force_full, start_date=start_date, end_date=end_date, + max_workers=max_workers, dry_run=dry_run, ) +def sync_all_data( + force_full: bool = False, + max_workers: Optional[int] = None, + dry_run: bool = False, +) -> Dict[str, pd.DataFrame]: + """同步所有数据类型(每日同步)。 + + 该函数按顺序同步所有可用的数据类型: + 1. 交易日历 (sync_trade_cal_cache) + 2. 股票基本信息 (sync_all_stocks) + 3. 日线市场数据 (sync_all) + 4. 
历史股票列表 (sync_bak_basic) + + 注意:名称变更 (namechange) 不在自动同步中,如需同步请手动调用。 + + Args: + force_full: 若为 True,强制所有数据类型完整重载 + max_workers: 日线数据同步的工作线程数(默认: 10) + dry_run: 若为 True,仅显示将要同步的内容,不写入数据 + + Returns: + 映射数据类型到同步结果的字典 + + Example: + >>> # 同步所有数据(增量) + >>> result = sync_all_data() + >>> + >>> # 强制完整重载 + >>> result = sync_all_data(force_full=True) + >>> + >>> # Dry run + >>> result = sync_all_data(dry_run=True) + """ + results: Dict[str, pd.DataFrame] = {} + + print("\n" + "=" * 60) + print("[sync_all_data] Starting full data synchronization...") + print("=" * 60) + + # 1. Sync trade calendar (always needed first) + print("\n[1/5] Syncing trade calendar cache...") + try: + from src.data.api_wrappers import sync_trade_cal_cache + + sync_trade_cal_cache() + results["trade_cal"] = pd.DataFrame() + print("[1/5] Trade calendar: OK") + except Exception as e: + print(f"[1/5] Trade calendar: FAILED - {e}") + results["trade_cal"] = pd.DataFrame() + + # 2. Sync stock basic info + print("\n[2/5] Syncing stock basic info...") + try: + sync_all_stocks() + results["stock_basic"] = pd.DataFrame() + print("[2/5] Stock basic: OK") + except Exception as e: + print(f"[2/5] Stock basic: FAILED - {e}") + results["stock_basic"] = pd.DataFrame() + + # 3. Sync daily market data + print("\n[3/5] Syncing daily market data...") + try: + daily_result = sync_daily( + force_full=force_full, + max_workers=max_workers, + dry_run=dry_run, + ) + results["daily"] = ( + pd.concat(daily_result.values(), ignore_index=True) + if daily_result + else pd.DataFrame() + ) + print("[3/5] Daily data: OK") + except Exception as e: + print(f"[3/5] Daily data: FAILED - {e}") + results["daily"] = pd.DataFrame() + + # 4. 
Sync stock historical list (bak_basic) + print("\n[4/5] Syncing stock historical list (bak_basic)...") + try: + bak_basic_result = sync_bak_basic(force_full=force_full) + results["bak_basic"] = bak_basic_result + print(f"[4/5] Bak basic: OK ({len(bak_basic_result)} records)") + except Exception as e: + print(f"[4/5] Bak basic: FAILED - {e}") + results["bak_basic"] = pd.DataFrame() + + # Summary + print("\n" + "=" * 60) + print("[sync_all_data] Sync Summary") + print("=" * 60) + for data_type, df in results.items(): + print(f" {data_type}: {len(df)} records") + print("=" * 60) + print("\nNote: namechange is NOT in auto-sync. To sync manually:") + print(" from src.data.api_wrappers import sync_namechange") + print(" sync_namechange(force=True)") + + return results + + +# 保留向后兼容的导入 +from src.data.api_wrappers import sync_bak_basic + + if __name__ == "__main__": print("=" * 60) print("Data Sync Module") print("=" * 60) print("\nUsage:") + print(" # Sync all data types at once (RECOMMENDED)") + print(" from src.data.sync import sync_all_data") + print(" result = sync_all_data() # Incremental sync all") + print(" result = sync_all_data(force_full=True) # Full reload") + print("") + print(" # Or sync individual data types:") print(" from src.data.sync import sync_all, preview_sync") + print(" from src.data.sync import sync_bak_basic") print("") print(" # Preview before sync (recommended)") print(" preview = preview_sync()") @@ -813,21 +258,14 @@ if __name__ == "__main__": print(" # Actual sync") print(" result = sync_all() # Incremental sync") print(" result = sync_all(force_full=True) # Full reload") + print("") + print(" # bak_basic sync") + print(" result = sync_bak_basic() # Incremental sync") + print(" result = sync_bak_basic(force_full=True) # Full reload") print("\n" + "=" * 60) - # Run preview first - print("\n[Main] Running preview first...") - preview = preview_sync() - - if preview["sync_needed"]: - # Ask for confirmation - print("\n" + "=" * 60) - response = 
input("Proceed with sync? (y/n): ").strip().lower() - if response in ("y", "yes"): - print("\n[Main] Starting actual sync...") - result = sync_all() - print(f"\nSynced {len(result)} stocks") - else: - print("\n[Main] Sync cancelled by user") - else: - print("\n[Main] No sync needed - data is up to date") + # Run sync_all_data by default + print("\n[Main] Running sync_all_data()...") + result = sync_all_data() + print("\n[Main] Sync completed!") + print(f"Total data types synced: {len(result)}") diff --git a/src/data/utils.py b/src/data/utils.py new file mode 100644 index 0000000..350fc76 --- /dev/null +++ b/src/data/utils.py @@ -0,0 +1,75 @@ +"""Data module utility functions. + +集中管理数据模块中常用的工具函数,避免重复定义。 +""" + +from datetime import datetime, timedelta +from typing import Optional + + +# 默认全量同步开始日期 +DEFAULT_START_DATE = "20180101" + +# 今日日期 (YYYYMMDD 格式) +TODAY: str = datetime.now().strftime("%Y%m%d") + + +def get_today_date() -> str: + """获取今日日期(YYYYMMDD 格式)。 + + Returns: + 今日日期字符串,格式为 YYYYMMDD + """ + return TODAY + + +def get_next_date(date_str: str) -> str: + """获取给定日期的下一天。 + + Args: + date_str: YYYYMMDD 格式的日期 + + Returns: + YYYYMMDD 格式的下一天日期 + """ + dt = datetime.strptime(date_str, "%Y%m%d") + next_dt = dt + timedelta(days=1) + return next_dt.strftime("%Y%m%d") + + +def get_prev_date(date_str: str) -> str: + """获取给定日期的前一天。 + + Args: + date_str: YYYYMMDD 格式的日期 + + Returns: + YYYYMMDD 格式的前一天日期 + """ + dt = datetime.strptime(date_str, "%Y%m%d") + prev_dt = dt - timedelta(days=1) + return prev_dt.strftime("%Y%m%d") + + +def parse_date(date_str: str) -> datetime: + """解析 YYYYMMDD 格式的日期字符串。 + + Args: + date_str: YYYYMMDD 格式的日期 + + Returns: + datetime 对象 + """ + return datetime.strptime(date_str, "%Y%m%d") + + +def format_date(dt: datetime) -> str: + """将 datetime 对象格式化为 YYYYMMDD 字符串。 + + Args: + dt: datetime 对象 + + Returns: + YYYYMMDD 格式的日期字符串 + """ + return dt.strftime("%Y%m%d") diff --git a/src/factors/__init__.py b/src/factors/__init__.py index ff7d7ce..df8edf8 
100644 --- a/src/factors/__init__.py +++ b/src/factors/__init__.py @@ -18,28 +18,29 @@ - CompositeFactor: 组合因子 - ScalarFactor: 标量运算因子 +动量因子(momentum/): +- MovingAverageFactor: 移动平均线(时序因子) +- ReturnRankFactor: 收益率排名(截面因子) + +财务因子(financial/): +- (待添加) + 数据加载和执行(Phase 3-4): - DataLoader: 数据加载器 - FactorEngine: 因子执行引擎 使用示例: - from src.factors import DataSpec, FactorContext, FactorData - from src.factors import CrossSectionalFactor, TimeSeriesFactor + # 使用通用因子(参数化) + from src.factors import MovingAverageFactor, ReturnRankFactor from src.factors import DataLoader, FactorEngine - # 定义数据需求 - spec = DataSpec( - source="daily", - columns=["ts_code", "trade_date", "close"], - lookback_days=20 - ) + ma5 = MovingAverageFactor(period=5) # 5日MA + ma10 = MovingAverageFactor(period=10) # 10日MA + ret5 = ReturnRankFactor(period=5) # 5日收益率排名 - # 初始化引擎 loader = DataLoader(data_dir="data") engine = FactorEngine(loader) - - # 计算因子 - result = engine.compute(factor, start_date="20240101", end_date="20240131") + result = engine.compute(ma5, stock_codes=["000001.SZ"], start_date="20240101", end_date="20240131") """ from src.factors.data_spec import DataSpec, FactorContext, FactorData @@ -48,6 +49,9 @@ from src.factors.composite import CompositeFactor, ScalarFactor from src.factors.data_loader import DataLoader from src.factors.engine import FactorEngine +# 动量因子 +from src.factors.momentum import MovingAverageFactor, ReturnRankFactor + __all__ = [ # Phase 1: 数据类型定义 "DataSpec", @@ -62,4 +66,7 @@ __all__ = [ # Phase 3-4: 数据加载和执行引擎 "DataLoader", "FactorEngine", + # 动量因子 + "MovingAverageFactor", + "ReturnRankFactor", ] diff --git a/src/factors/financial/__init__.py b/src/factors/financial/__init__.py new file mode 100644 index 0000000..9c34516 --- /dev/null +++ b/src/factors/financial/__init__.py @@ -0,0 +1,15 @@ +"""财务因子模块 + +本模块提供财务类型的因子: + +因子分类: +- financial: 财务因子 + - (待添加) + +待添加因子: +- PERankFactor: 市盈率排名 +- PBFactor: 市净率因子 +- DividendFactor: 股息率因子 +""" + +__all__ = [] diff --git 
a/src/factors/momentum/__init__.py b/src/factors/momentum/__init__.py new file mode 100644 index 0000000..4b8b401 --- /dev/null +++ b/src/factors/momentum/__init__.py @@ -0,0 +1,19 @@ +"""动量因子模块 + +本模块提供动量类型的因子: +- MovingAverageFactor: 移动平均线(时序因子) +- ReturnRankFactor: 收益率排名(截面因子) + +因子分类: +- momentum: 动量因子 + - ma: 移动平均线 + - return_rank: 收益率排名 +""" + +from src.factors.momentum.ma import MovingAverageFactor +from src.factors.momentum.return_rank import ReturnRankFactor + +__all__ = [ + "MovingAverageFactor", + "ReturnRankFactor", +] diff --git a/src/factors/momentum/ma.py b/src/factors/momentum/ma.py new file mode 100644 index 0000000..05ea234 --- /dev/null +++ b/src/factors/momentum/ma.py @@ -0,0 +1,78 @@ +"""动量因子 - 移动平均线 + +本模块提供通用移动平均线因子,支持参数化配置: +- MovingAverageFactor: 移动平均线(时序因子) + +使用示例: + >>> from src.factors.momentum import MovingAverageFactor + >>> ma5 = MovingAverageFactor(period=5) # 5日MA + >>> ma10 = MovingAverageFactor(period=10) # 10日MA + >>> ma20 = MovingAverageFactor(period=20) # 20日MA +""" + +from typing import List + +import polars as pl + +from src.factors.base import TimeSeriesFactor +from src.factors.data_spec import DataSpec, FactorData + + +class MovingAverageFactor(TimeSeriesFactor): + """移动平均线因子 + + 计算逻辑:对每只股票,计算其过去n日收盘价的移动平均值。 + + 特点: + - 参数化因子:训练时通过 period 参数指定计算窗口 + - 时序因子:每只股票单独计算,防止股票间数据泄露 + + Attributes: + period: MA计算期(天数),默认5 + + Example: + >>> ma5 = MovingAverageFactor(period=5) + >>> # 计算过去5日的收盘价均值 + """ + + name: str = "ma" + factor_type: str = "time_series" + category: str = "momentum" + description: str = "移动平均线因子,计算过去n日收盘价的均值" + data_specs: List[DataSpec] = [ + DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=5) + ] + + def __init__(self, period: int = 5): + """初始化因子 + + Args: + period: MA计算期(天数),默认5日 + """ + super().__init__(period=period) + # 重新创建 DataSpec 以设置正确的 lookback_days(DataSpec 是 frozen 的) + self.data_specs = [ + DataSpec( + "daily", + ["ts_code", "trade_date", "close"], + lookback_days=period, + ) + 
] + self.name = f"ma_{period}" + + def compute(self, data: FactorData) -> pl.Series: + """计算移动平均线 + + Args: + data: FactorData,包含单只股票的完整时间序列 + + Returns: + 移动平均值序列 + """ + # 获取收盘价序列 + close_prices = data.get_column("close") + + # 计算移动平均 + ma = close_prices.rolling_mean(window_size=self.params["period"]) + + return ma diff --git a/src/factors/momentum/return_rank.py b/src/factors/momentum/return_rank.py new file mode 100644 index 0000000..ee7514b --- /dev/null +++ b/src/factors/momentum/return_rank.py @@ -0,0 +1,100 @@ +"""动量因子 - 收益率排名 + +本模块提供收益率排名因子: +- ReturnRankFactor: 过去n日收益率的rank因子(截面因子) + +使用示例: + >>> from src.factors.momentum import ReturnRankFactor + >>> ret5 = ReturnRankFactor(period=5) # 5日收益率排名 + >>> ret10 = ReturnRankFactor(period=10) # 10日收益率排名 +""" + +from typing import List + +import polars as pl + +from src.factors.base import CrossSectionalFactor +from src.factors.data_spec import DataSpec, FactorData + + +class ReturnRankFactor(CrossSectionalFactor): + """过去n日收益率排名因子 + + 计算逻辑:每个交易日,计算所有股票过去n日的收益率,然后进行截面排名。 + + 特点: + - 参数化因子:训练时通过 period 参数指定计算窗口 + - 截面因子:每天对所有股票进行横向排名,防止日期泄露 + + Attributes: + period: 收益率计算期(默认5日) + + Example: + >>> ret5 = ReturnRankFactor(period=5) + >>> # 每个交易日,返回所有股票过去5日收益率的排名 + """ + + name: str = "return_rank" + factor_type: str = "cross_sectional" + category: str = "momentum" + description: str = "过去n日收益率的截面排名因子" + data_specs: List[DataSpec] = [ + DataSpec("daily", ["ts_code", "trade_date", "close"], lookback_days=5) + ] + + def __init__(self, period: int = 5): + """初始化因子 + + Args: + period: 收益率计算期(天数) + """ + super().__init__(period=period) + # 重新创建 DataSpec 以设置正确的 lookback_days(DataSpec 是 frozen 的) + self.data_specs = [ + DataSpec( + "daily", + ["ts_code", "trade_date", "close"], + lookback_days=period + 1, + ) + ] + self.name = f"return_{period}_rank" + + def compute(self, data: FactorData) -> pl.Series: + """计算过去n日收益率排名 + + Args: + data: FactorData,包含过去n+1天的截面数据 + + Returns: + 过去n日收益率的截面排名(0-1之间) + """ + # 获取当前日期的截面数据 + 
cs = data.to_polars() + + # 获取所有交易日期(已按日期排序) + trade_dates = cs["trade_date"].unique().sort() + + if len(trade_dates) < 2: + # 数据不足,返回空排名 + return pl.Series(name=self.name, values=[]) + + # 获取最新日期的数据 + latest_date = trade_dates[-1] + current_data = cs.filter(pl.col("trade_date") == latest_date) + + # 获取n天前的日期 + n_days_ago = trade_dates[-(self.params["period"] + 1)] + past_data = cs.filter(pl.col("trade_date") == n_days_ago) + + # 通过 ts_code join 计算收益率 + merged = current_data.select(["ts_code", "close"]).join( + past_data.select(["ts_code", "close"]).rename({"close": "close_past"}), + on="ts_code", + how="inner", + ) + + # 计算收益率 + returns = (merged["close"] - merged["close_past"]) / merged["close_past"] + + # 返回排名(0-1之间) + return returns.rank(method="average") / len(returns) diff --git a/src/models/__init__.py b/src/pipeline/__init__.py similarity index 78% rename from src/models/__init__.py rename to src/pipeline/__init__.py index 789dcac..9187292 100644 --- a/src/models/__init__.py +++ b/src/pipeline/__init__.py @@ -1,9 +1,10 @@ -"""ProStock 模型训练框架 +"""ProStock ML Pipeline 组件库 -组件化、低耦合、插件式的机器学习训练框架。 +提供组件化、低耦合、插件式的机器学习流水线组件。 +包括处理器、模型、划分策略等可复用组件。 示例: - >>> from src.models import ( + >>> from src.pipeline import ( ... PluginRegistry, ProcessingPipeline, ... PipelineStage, BaseProcessor ... 
) @@ -21,7 +22,7 @@ """ # 导入核心抽象类和划分策略 -from src.models.core import ( +from src.pipeline.core import ( PipelineStage, TaskType, BaseProcessor, @@ -34,13 +35,13 @@ from src.models.core import ( ) # 导入注册中心 -from src.models.registry import PluginRegistry +from src.pipeline.registry import PluginRegistry # 导入处理流水线 -from src.models.pipeline import ProcessingPipeline +from src.pipeline.pipeline import ProcessingPipeline # 导入并注册内置处理器 -from src.models.processors.processors import ( +from src.pipeline.processors.processors import ( DropNAProcessor, FillNAProcessor, Winsorizer, @@ -51,7 +52,7 @@ from src.models.processors.processors import ( ) # 导入并注册内置模型 -from src.models.models.models import ( +from src.pipeline.models.models import ( LightGBMModel, CatBoostModel, ) diff --git a/src/models/core/__init__.py b/src/pipeline/core/__init__.py similarity index 85% rename from src/models/core/__init__.py rename to src/pipeline/core/__init__.py index 7369ced..70f6cf7 100644 --- a/src/models/core/__init__.py +++ b/src/pipeline/core/__init__.py @@ -1,6 +1,6 @@ """核心模块导出""" -from src.models.core.base import ( +from src.pipeline.core.base import ( PipelineStage, TaskType, BaseProcessor, @@ -9,7 +9,7 @@ from src.models.core.base import ( BaseMetric, ) -from src.models.core.splitter import ( +from src.pipeline.core.splitter import ( TimeSeriesSplit, WalkForwardSplit, ExpandingWindowSplit, diff --git a/src/models/core/base.py b/src/pipeline/core/base.py similarity index 100% rename from src/models/core/base.py rename to src/pipeline/core/base.py diff --git a/src/models/core/splitter.py b/src/pipeline/core/splitter.py similarity index 99% rename from src/models/core/splitter.py rename to src/pipeline/core/splitter.py index d2734a6..8d63f4a 100644 --- a/src/models/core/splitter.py +++ b/src/pipeline/core/splitter.py @@ -6,7 +6,7 @@ from typing import Iterator, List, Tuple import polars as pl -from src.models.core.base import BaseSplitter +from src.pipeline.core.base import BaseSplitter 
class TimeSeriesSplit(BaseSplitter): diff --git a/src/models/models/__init__.py b/src/pipeline/models/__init__.py similarity index 74% rename from src/models/models/__init__.py rename to src/pipeline/models/__init__.py index 9618f58..cc2cdfc 100644 --- a/src/models/models/__init__.py +++ b/src/pipeline/models/__init__.py @@ -1,6 +1,6 @@ """模型模块""" -from src.models.models.models import ( +from src.pipeline.models.models import ( LightGBMModel, CatBoostModel, ) diff --git a/src/models/models/models.py b/src/pipeline/models/models.py similarity index 98% rename from src/models/models/models.py rename to src/pipeline/models/models.py index e50e179..40278b1 100644 --- a/src/models/models/models.py +++ b/src/pipeline/models/models.py @@ -7,8 +7,8 @@ from typing import Optional, Dict, Any import polars as pl import numpy as np -from src.models.core import BaseModel, TaskType -from src.models.registry import PluginRegistry +from src.pipeline.core import BaseModel, TaskType +from src.pipeline.registry import PluginRegistry @PluginRegistry.register_model("lightgbm") diff --git a/src/models/pipeline.py b/src/pipeline/pipeline.py similarity index 97% rename from src/models/pipeline.py rename to src/pipeline/pipeline.py index 09be2ee..9a7407c 100644 --- a/src/models/pipeline.py +++ b/src/pipeline/pipeline.py @@ -6,7 +6,7 @@ from typing import List, Dict import polars as pl -from src.models.core import BaseProcessor, PipelineStage +from src.pipeline.core import BaseProcessor, PipelineStage class ProcessingPipeline: diff --git a/src/models/processors/__init__.py b/src/pipeline/processors/__init__.py similarity index 86% rename from src/models/processors/__init__.py rename to src/pipeline/processors/__init__.py index f68eb14..f33779e 100644 --- a/src/models/processors/__init__.py +++ b/src/pipeline/processors/__init__.py @@ -1,6 +1,6 @@ """处理器模块""" -from src.models.processors.processors import ( +from src.pipeline.processors.processors import ( DropNAProcessor, FillNAProcessor, 
Winsorizer, diff --git a/src/models/processors/processors.py b/src/pipeline/processors/processors.py similarity index 98% rename from src/models/processors/processors.py rename to src/pipeline/processors/processors.py index c38a6a3..31985c2 100644 --- a/src/models/processors/processors.py +++ b/src/pipeline/processors/processors.py @@ -7,8 +7,8 @@ from typing import List, Optional, Dict, Any import polars as pl import numpy as np -from src.models.core import BaseProcessor, PipelineStage -from src.models.registry import PluginRegistry +from src.pipeline.core import BaseProcessor, PipelineStage +from src.pipeline.registry import PluginRegistry # 数值类型列表 FLOAT_TYPES = [pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64] diff --git a/src/models/registry.py b/src/pipeline/registry.py similarity index 99% rename from src/models/registry.py rename to src/pipeline/registry.py index b65767e..9ff8ecb 100644 --- a/src/models/registry.py +++ b/src/pipeline/registry.py @@ -17,7 +17,7 @@ from functools import wraps from weakref import WeakValueDictionary import contextlib -from src.models.core import BaseProcessor, BaseModel, BaseSplitter, BaseMetric +from src.pipeline.core import BaseProcessor, BaseModel, BaseSplitter, BaseMetric T = TypeVar("T") diff --git a/src/training/__init__.py b/src/training/__init__.py new file mode 100644 index 0000000..2615cd4 --- /dev/null +++ b/src/training/__init__.py @@ -0,0 +1,46 @@ +"""ProStock 训练流程模块 + +本模块提供完整的模型训练流程: +1. 数据处理:Fillna(0) -> Dropna +2. 模型训练:LightGBM分类模型 +3. 
预测选股:每日top5股票池 + +使用示例: + from src.training import run_training + + # 运行完整训练流程 + result = run_training( + train_start="20180101", + train_end="20230101", + test_start="20230101", + test_end="20240101", + top_n=5, + output_path="output/top_stocks.tsv" + ) + +因子使用: + from src.factors import MovingAverageFactor, ReturnRankFactor + + ma5 = MovingAverageFactor(period=5) # 5日移动平均 + ma10 = MovingAverageFactor(period=10) # 10日移动平均 + ret5 = ReturnRankFactor(period=5) # 5日收益率排名 +""" + +from src.training.pipeline import ( + create_pipeline, + predict_top_stocks, + prepare_data, + run_training, + save_top_stocks, + train_model, +) + +__all__ = [ + # 管道函数 + "prepare_data", + "create_pipeline", + "train_model", + "predict_top_stocks", + "save_top_stocks", + "run_training", +] diff --git a/src/training/main.py b/src/training/main.py new file mode 100644 index 0000000..97cd868 --- /dev/null +++ b/src/training/main.py @@ -0,0 +1,27 @@ +"""训练流程入口脚本 + +运行方式: + uv run python -m src.training.main + +或: + uv run python src/training/main.py +""" + +from src.training.pipeline import run_training + + +if __name__ == "__main__": + # 运行完整训练流程 + # 训练集:20180101 - 20230101 + # 测试集:20230101 - 20240101 + result = run_training( + train_start="20190101", + train_end="20250101", + test_start="20250101", + test_end="20260101", + top_n=5, + output_path="output/top_stocks.tsv", + ) + + print("\n[Result] Top stocks selection:") + print(result) diff --git a/src/training/output/top_stocks.tsv b/src/training/output/top_stocks.tsv new file mode 100644 index 0000000..fc7e297 --- /dev/null +++ b/src/training/output/top_stocks.tsv @@ -0,0 +1,1216 @@ +trade_date score ts_code +2025-01-02 0.7051179656547538 000759.SZ +2025-01-02 0.7051179656547538 600778.SH +2025-01-02 0.7051179656547538 002582.SZ +2025-01-02 0.7051179656547538 603068.SH +2025-01-02 0.7051179656547538 605299.SH +2025-01-03 0.7051179656547538 603016.SH +2025-01-03 0.7051179656547538 600071.SH +2025-01-03 0.7051179656547538 002952.SZ +2025-01-03 
0.7051179656547538 600693.SH +2025-01-03 0.7051179656547538 600618.SH +2025-01-06 0.7051179656547538 000756.SZ +2025-01-06 0.7051179656547538 605378.SH +2025-01-06 0.7051179656547538 000597.SZ +2025-01-06 0.7051179656547538 600618.SH +2025-01-06 0.7051179656547538 002552.SZ +2025-01-07 0.7051179656547538 002620.SZ +2025-01-07 0.7051179656547538 600218.SH +2025-01-07 0.7051179656547538 000533.SZ +2025-01-07 0.7051179656547538 600105.SH +2025-01-07 0.7051179656547538 003003.SZ +2025-01-08 0.733000910769149 603070.SH +2025-01-08 0.7285396253124238 600693.SH +2025-01-08 0.7269798929107358 603683.SH +2025-01-08 0.7263330514684178 002364.SZ +2025-01-08 0.7250024174886921 603990.SH +2025-01-09 0.7254659521341397 603118.SH +2025-01-09 0.7250673670188915 002400.SZ +2025-01-09 0.7244838367978005 002536.SZ +2025-01-09 0.723226593227657 603777.SH +2025-01-09 0.7142213237842978 603305.SH +2025-01-10 0.733000910769149 600053.SH +2025-01-10 0.728364923591949 600650.SH +2025-01-10 0.7278727136705941 000561.SZ +2025-01-10 0.7269798929107358 002660.SZ +2025-01-10 0.7250024174886921 002917.SZ +2025-01-13 0.728999426321122 603166.SH +2025-01-13 0.728999426321122 603106.SH +2025-01-13 0.7274413474305255 002313.SZ +2025-01-13 0.7269798929107358 600128.SH +2025-01-13 0.7250024174886921 002725.SZ +2025-01-14 0.733000910769149 002583.SZ +2025-01-14 0.733000910769149 603825.SH +2025-01-14 0.728999426321122 002137.SZ +2025-01-14 0.7254659521341397 003003.SZ +2025-01-14 0.7254659521341397 002103.SZ +2025-01-15 0.6387075433643968 600289.SH +2025-01-15 0.6374385511426223 002656.SZ +2025-01-15 0.5410429736449522 601279.SH +2025-01-15 0.5393223279940808 002397.SZ +2025-01-15 0.5340821600522587 002793.SZ +2025-01-16 0.6781212885667098 002123.SZ +2025-01-16 0.6775871821434284 000016.SZ +2025-01-16 0.6775871821434284 600933.SH +2025-01-16 0.6387075433643968 600289.SH +2025-01-16 0.6159681320421684 002379.SZ +2025-01-17 0.6826622225079328 002656.SZ +2025-01-17 0.6826622225079328 600933.SH +2025-01-17 
0.6387075433643968 600289.SH +2025-01-17 0.544416099620109 000408.SZ +2025-01-17 0.5361316966516099 002793.SZ +2025-01-20 0.5313247850374478 002872.SZ +2025-01-20 0.5270902024087297 603500.SH +2025-01-20 0.521655179277378 002218.SZ +2025-01-20 0.5146433933122077 000802.SZ +2025-01-20 0.5099224225735371 600664.SH +2025-01-21 0.6159124153292242 600289.SH +2025-01-21 0.521566979605915 603500.SH +2025-01-21 0.5192867029883689 003021.SZ +2025-01-21 0.5121463182849544 002218.SZ +2025-01-21 0.5120253271434904 603236.SH +2025-01-22 0.6620934004606905 002656.SZ +2025-01-22 0.661169148249026 600289.SH +2025-01-22 0.5351771629779175 002418.SZ +2025-01-22 0.5233214191633075 600589.SH +2025-01-22 0.5188882944862926 600243.SH +2025-01-23 0.6669403824458782 001356.SZ +2025-01-23 0.6620934004606905 002656.SZ +2025-01-23 0.661169148249026 600289.SH +2025-01-23 0.5823267181327698 002418.SZ +2025-01-23 0.5208278859839193 002686.SZ +2025-01-24 0.6474199399258881 002494.SZ +2025-01-24 0.5331050665207507 000615.SZ +2025-01-24 0.5295780449034047 600421.SH +2025-01-24 0.5289199536142031 600130.SH +2025-01-24 0.5256292685122648 002569.SZ +2025-01-27 0.6669403824458782 001395.SZ +2025-01-27 0.631159350200206 600130.SH +2025-01-27 0.5720072117966968 600421.SH +2025-01-27 0.5584407718655982 600358.SH +2025-01-27 0.5380300868300264 002542.SZ +2025-02-05 0.7244838367978005 002123.SZ +2025-02-05 0.6550780484687123 000972.SZ +2025-02-05 0.5890724385352274 600358.SH +2025-02-05 0.5560990328127002 000820.SZ +2025-02-05 0.5427844132995285 603682.SH +2025-02-06 0.6310880416420657 600130.SH +2025-02-06 0.6025512918092927 600358.SH +2025-02-06 0.5685222827863671 000736.SZ +2025-02-06 0.5572457637916861 600778.SH +2025-02-06 0.5513423924464075 601515.SH +2025-02-07 0.7027342936600967 001395.SZ +2025-02-07 0.6224057426565484 600358.SH +2025-02-07 0.5598843527597499 002767.SZ +2025-02-07 0.5542198914019132 605162.SH +2025-02-07 0.5450747783984903 600858.SH +2025-02-10 0.5585540848911058 000736.SZ 
+2025-02-10 0.5543271073999997 605162.SH +2025-02-10 0.5451710325592884 002767.SZ +2025-02-10 0.5433425776971076 603682.SH +2025-02-10 0.5392073380823367 002636.SZ +2025-02-11 0.543296731052936 603719.SH +2025-02-11 0.5323121465042445 601156.SH +2025-02-11 0.526231221718517 002636.SZ +2025-02-11 0.5136339707518657 603687.SH +2025-02-11 0.5131110240773495 002820.SZ +2025-02-12 0.5467464517435845 002052.SZ +2025-02-12 0.5150190047321577 603296.SH +2025-02-12 0.5108097318020007 603501.SH +2025-02-12 0.5072040838683077 600115.SH +2025-02-12 0.5062407534441172 600929.SH +2025-02-13 0.5260729456251799 002822.SZ +2025-02-13 0.5249034646482712 001356.SZ +2025-02-13 0.5225603550790333 002718.SZ +2025-02-13 0.5062880387830124 002207.SZ +2025-02-13 0.5062880387830124 600517.SH +2025-02-14 0.5129142393802463 002718.SZ +2025-02-14 0.5062880387830124 603323.SH +2025-02-14 0.5062407534441172 600929.SH +2025-02-14 0.5062407534441172 000930.SZ +2025-02-14 0.5059432414859574 600222.SH +2025-02-17 0.5404208504398543 002718.SZ +2025-02-17 0.5313441956279837 600381.SH +2025-02-17 0.511722434421143 000530.SZ +2025-02-17 0.5062786073876784 002697.SZ +2025-02-17 0.5062407534441172 601988.SH +2025-02-18 0.5345120731092181 002718.SZ +2025-02-18 0.516756626959392 000626.SZ +2025-02-18 0.5096590250388462 603036.SH +2025-02-18 0.5069762768566503 002059.SZ +2025-02-18 0.5069065869847293 600740.SH +2025-02-19 0.5326002636013797 600261.SH +2025-02-19 0.5221685385870379 600381.SH +2025-02-19 0.5205308388365613 601500.SH +2025-02-19 0.519151342531204 000530.SZ +2025-02-19 0.5109344127753974 600255.SH +2025-02-20 0.5195782215053089 600261.SH +2025-02-20 0.5173744533696785 600121.SH +2025-02-20 0.5166753540849746 000530.SZ +2025-02-20 0.5129988424429641 603116.SH +2025-02-20 0.5109344127753974 601113.SH +2025-02-21 0.516756626959392 600698.SH +2025-02-21 0.5158730698526023 000802.SZ +2025-02-21 0.5150190047321577 603728.SH +2025-02-21 0.5148248849090167 601111.SH +2025-02-21 0.5141087911649017 
600320.SH +2025-02-24 0.5417239869858262 000802.SZ +2025-02-24 0.5171224060142442 002905.SZ +2025-02-24 0.5167878247389599 600630.SH +2025-02-24 0.5157554031130761 002739.SZ +2025-02-24 0.5117426671618643 000530.SZ +2025-02-25 0.5137333740114639 603103.SH +2025-02-25 0.5132838589586456 600664.SH +2025-02-25 0.5123761070094214 600630.SH +2025-02-25 0.5084389075155574 000600.SZ +2025-02-25 0.5072040838683077 600121.SH +2025-02-26 0.5178887396475126 603825.SH +2025-02-26 0.5176583632438975 000863.SZ +2025-02-26 0.5161890103955529 600609.SH +2025-02-26 0.5148248849090167 603993.SH +2025-02-26 0.5131783435030605 000802.SZ +2025-02-27 0.5234880337204921 600863.SH +2025-02-27 0.5192867029883689 001306.SZ +2025-02-27 0.5110428546997954 601599.SH +2025-02-27 0.5108097318020007 002896.SZ +2025-02-27 0.5072040838683077 600500.SH +2025-02-28 0.5302693298584756 603825.SH +2025-02-28 0.528551727159285 603848.SH +2025-02-28 0.5135320827288788 000488.SZ +2025-02-28 0.5099124664959997 600421.SH +2025-02-28 0.508592800689105 002550.SZ +2025-03-03 0.5388641146940151 603848.SH +2025-03-03 0.5284306205881467 600633.SH +2025-03-03 0.5189972724604688 002716.SZ +2025-03-03 0.5179075316852415 600863.SH +2025-03-03 0.5174473288476508 601000.SH +2025-03-04 0.6669403824458782 603409.SH +2025-03-04 0.5388421266082344 600986.SH +2025-03-04 0.5296286995283527 002175.SZ +2025-03-04 0.5296286995283527 002398.SZ +2025-03-04 0.5192811146990993 000903.SZ +2025-03-05 0.5713532860246767 002398.SZ +2025-03-05 0.5439725581069169 600633.SH +2025-03-05 0.5207195789387543 600556.SH +2025-03-05 0.5152424033193611 002238.SZ +2025-03-05 0.5148248849090167 601801.SH +2025-03-06 0.5440709142221977 600302.SH +2025-03-06 0.5350674079018686 600892.SH +2025-03-06 0.531155872343103 000816.SZ +2025-03-06 0.5255755545116174 002522.SZ +2025-03-06 0.5221685385870379 600130.SH +2025-03-07 0.5303266869959796 600825.SH +2025-03-07 0.5294564461378244 000892.SZ +2025-03-07 0.517257618824714 002191.SZ +2025-03-07 
0.5169986507482853 000506.SZ +2025-03-07 0.5150190047321577 601100.SH +2025-03-10 0.5455320858284268 000679.SZ +2025-03-10 0.5179555419687653 002191.SZ +2025-03-10 0.5155746895304034 605300.SH +2025-03-10 0.5147422040064019 600611.SH +2025-03-10 0.5131569088886956 000042.SZ +2025-03-11 0.6669403824458782 603271.SH +2025-03-11 0.5195959454077144 000042.SZ +2025-03-11 0.5188882944862926 600892.SH +2025-03-11 0.5156039866864774 002742.SZ +2025-03-11 0.5104362501652622 002133.SZ +2025-03-12 0.5312791351753321 600622.SH +2025-03-12 0.5207195789387543 000506.SZ +2025-03-12 0.5192867029883689 603297.SH +2025-03-12 0.5124833570753022 002490.SZ +2025-03-12 0.5086182804866152 000560.SZ +2025-03-13 0.5306198355428968 605300.SH +2025-03-13 0.5237245769000003 002490.SZ +2025-03-13 0.5123724033923982 000042.SZ +2025-03-13 0.5105185694504328 002208.SZ +2025-03-13 0.5092585322658638 600525.SH +2025-03-14 0.5153716885780952 605066.SH +2025-03-14 0.5092800421012358 603700.SH +2025-03-14 0.5086237851754711 000016.SZ +2025-03-14 0.5065054255860063 600115.SH +2025-03-14 0.5062786073876784 601500.SH +2025-03-17 0.5249712719291497 603700.SH +2025-03-17 0.5216259044236606 605066.SH +2025-03-17 0.5192867029883689 002896.SZ +2025-03-17 0.511187246369438 603086.SH +2025-03-17 0.507539169599309 603278.SH +2025-03-18 0.5347903147450458 600216.SH +2025-03-18 0.5069762768566503 000822.SZ +2025-03-18 0.5066006223596465 600115.SH +2025-03-18 0.5062880387830124 600510.SH +2025-03-18 0.5062407534441172 601988.SH +2025-03-19 0.5246154292130808 600216.SH +2025-03-19 0.5173744533696785 000761.SZ +2025-03-19 0.5067526778780671 600115.SH +2025-03-19 0.5062880387830124 002360.SZ +2025-03-19 0.5062880387830124 002492.SZ +2025-03-20 0.6669403824458782 603124.SH +2025-03-20 0.5192867029883689 601799.SH +2025-03-20 0.506799811492834 601988.SH +2025-03-20 0.5062786073876784 600929.SH +2025-03-20 0.5059432414859574 002592.SZ +2025-03-21 0.6669403824458782 001382.SZ +2025-03-21 0.5125122512066596 601010.SH 
+2025-03-21 0.5102833212875136 002542.SZ +2025-03-21 0.5095087997335653 603388.SH +2025-03-21 0.5063152599497193 002898.SZ +2025-03-24 0.5287115738052487 600539.SH +2025-03-24 0.516756626959392 002175.SZ +2025-03-24 0.5159073332822095 002369.SZ +2025-03-24 0.5140870467512906 002348.SZ +2025-03-24 0.5127436997551269 002542.SZ +2025-03-25 0.5338833435320202 002418.SZ +2025-03-25 0.5288583540672106 000536.SZ +2025-03-25 0.5220201124248427 002542.SZ +2025-03-25 0.5210227197807852 600193.SH +2025-03-25 0.5208278859839193 600478.SH +2025-03-26 0.5471895490430322 603626.SH +2025-03-26 0.5454494909023954 600193.SH +2025-03-26 0.5319661551585267 600984.SH +2025-03-26 0.5317817863838529 000665.SZ +2025-03-26 0.5313866754947477 000892.SZ +2025-03-27 0.5503862351081134 603626.SH +2025-03-27 0.5448849775405112 002795.SZ +2025-03-27 0.5433145940449446 600552.SH +2025-03-27 0.5352209692776468 600355.SH +2025-03-27 0.533842022464007 000813.SZ +2025-03-28 0.5661570857190846 001382.SZ +2025-03-28 0.5585868058140843 002076.SZ +2025-03-28 0.5576433813007622 603315.SH +2025-03-28 0.5552114736653683 002490.SZ +2025-03-28 0.5456866035054126 002580.SZ +2025-03-31 0.553764702793686 000890.SZ +2025-03-31 0.5532818452700384 000736.SZ +2025-03-31 0.5528182733194853 600439.SH +2025-03-31 0.5422846765613857 605208.SH +2025-03-31 0.5411407392151744 603421.SH +2025-04-01 0.574923880807869 002713.SZ +2025-04-01 0.5708170878939591 601798.SH +2025-04-01 0.5674920738410727 600355.SH +2025-04-01 0.5674920738410727 000890.SZ +2025-04-01 0.5512737728527647 603789.SH +2025-04-02 0.5638354497475407 002164.SZ +2025-04-02 0.5552114736653683 000890.SZ +2025-04-02 0.5432957621006483 002724.SZ +2025-04-02 0.5413800214539841 603359.SH +2025-04-02 0.5411760159205538 603081.SH +2025-04-03 0.5584407718655982 002523.SZ +2025-04-03 0.5445463206684408 603081.SH +2025-04-03 0.5388926502267588 600758.SH +2025-04-03 0.538322659938494 603359.SH +2025-04-03 0.5373019343225885 600255.SH +2025-04-07 0.5858977654072869 
002278.SZ +2025-04-07 0.5831732967328205 600758.SH +2025-04-07 0.5683367621802544 000852.SZ +2025-04-07 0.5683367621802544 000541.SZ +2025-04-07 0.5683049519792355 603359.SH +2025-04-08 0.6669403824458782 603257.SH +2025-04-08 0.5746354992094956 000541.SZ +2025-04-08 0.5713532860246767 000595.SZ +2025-04-08 0.5700320948798235 600758.SH +2025-04-08 0.5582398821422095 002523.SZ +2025-04-09 0.6669403824458782 603210.SH +2025-04-09 0.5977846147659792 600758.SH +2025-04-09 0.5747415726474464 002363.SZ +2025-04-09 0.5714595675545813 603002.SH +2025-04-09 0.5693345798065312 002523.SZ +2025-04-10 0.5826618787963105 002662.SZ +2025-04-10 0.5681804385990817 600579.SH +2025-04-10 0.5676768464120937 002686.SZ +2025-04-10 0.5673064474063121 002689.SZ +2025-04-10 0.566065683809122 600403.SH +2025-04-11 0.6199863617598848 600595.SH +2025-04-11 0.6097986210699184 600403.SH +2025-04-11 0.5825563460455111 000757.SZ +2025-04-11 0.575622310153987 000533.SZ +2025-04-11 0.575622310153987 002420.SZ +2025-04-14 0.6072536670709681 600595.SH +2025-04-14 0.5681804385990817 002579.SZ +2025-04-14 0.5647134283078418 603353.SH +2025-04-14 0.5604553606074223 605255.SH +2025-04-14 0.5543020950336787 002448.SZ +2025-04-15 0.6669403824458782 001335.SZ +2025-04-15 0.5604663668984082 002735.SZ +2025-04-15 0.5389745862910662 603353.SH +2025-04-15 0.5350674079018686 600403.SH +2025-04-15 0.5349437299046987 002526.SZ +2025-04-16 0.6669403824458782 603120.SH +2025-04-16 0.5116020755128445 600595.SH +2025-04-16 0.5104362501652622 600477.SH +2025-04-16 0.5098676844433959 002091.SZ +2025-04-16 0.5088902658690911 600152.SH +2025-04-17 0.5119422408566764 603330.SH +2025-04-17 0.5067526778780671 000725.SZ +2025-04-17 0.5062935365059151 002535.SZ +2025-04-17 0.5059432414859574 603817.SH +2025-04-17 0.5059432414859574 002022.SZ +2025-04-18 0.5553688921775346 603330.SH +2025-04-18 0.514635987194688 002688.SZ +2025-04-18 0.5110428546997954 002535.SZ +2025-04-18 0.5056110543461024 002752.SZ +2025-04-18 
0.5056110543461024 000650.SZ +2025-04-21 0.5819957556599779 002084.SZ +2025-04-21 0.545414969408834 603336.SH +2025-04-21 0.5172847175023007 603330.SH +2025-04-21 0.516612009471722 000713.SZ +2025-04-21 0.5149332831592093 002688.SZ +2025-04-22 0.5843211073067424 002084.SZ +2025-04-22 0.5433478030227075 002900.SZ +2025-04-22 0.5153784216354333 603709.SH +2025-04-22 0.5100486966304032 000958.SZ +2025-04-22 0.5098912342080154 600195.SH +2025-04-23 0.5600621372160365 000785.SZ +2025-04-23 0.5518254167458194 603709.SH +2025-04-23 0.5377852469767108 002900.SZ +2025-04-23 0.5314798446365324 002084.SZ +2025-04-23 0.5284541045338316 003021.SZ +2025-04-24 0.6669403824458782 603202.SH +2025-04-24 0.6669403824458782 001400.SZ +2025-04-24 0.5977176355354952 000785.SZ +2025-04-24 0.5677858234677825 002084.SZ +2025-04-24 0.5259647450920883 600033.SH +2025-04-25 0.5622165720306567 000785.SZ +2025-04-25 0.5332605242520999 600965.SH +2025-04-25 0.5230859175308072 002234.SZ +2025-04-25 0.5219065617709089 002713.SZ +2025-04-25 0.5215694612086733 002529.SZ +2025-04-28 0.5842493402723935 000785.SZ +2025-04-28 0.5370901276046655 600381.SH +2025-04-28 0.5336039591479221 603605.SH +2025-04-28 0.5294515299872153 600965.SH +2025-04-28 0.519813501069505 600172.SH +2025-04-29 0.5786351360102432 600187.SH +2025-04-29 0.5296286995283527 600712.SH +2025-04-29 0.5237719024466865 600965.SH +2025-04-29 0.5215694612086733 600249.SH +2025-04-29 0.5194484553193683 000753.SZ +2025-04-30 0.5755387753130181 000504.SZ +2025-04-30 0.5538023549703315 000638.SZ +2025-04-30 0.5514808808570097 002910.SZ +2025-04-30 0.5513285701188216 002549.SZ +2025-04-30 0.5485578275612335 603535.SH +2025-05-06 0.5789643829047145 600300.SH +2025-05-06 0.5515931343933391 002529.SZ +2025-05-06 0.545227460940995 600249.SH +2025-05-06 0.5393720691752084 600250.SH +2025-05-06 0.5380380761759416 603021.SH +2025-05-07 0.5935722209684432 002231.SZ +2025-05-07 0.5555488333612659 600228.SH +2025-05-07 0.5533538665235384 600300.SH 
+2025-05-07 0.5320260837500719 603535.SH +2025-05-07 0.5313084167778416 002529.SZ +2025-05-08 0.592069575966614 000638.SZ +2025-05-08 0.5908651580077275 002231.SZ +2025-05-08 0.5512490630072319 600525.SH +2025-05-08 0.5495867905803188 002693.SZ +2025-05-08 0.5493178829982284 600308.SH +2025-05-09 0.6058279120586914 600200.SH +2025-05-09 0.5323391688950498 600107.SH +2025-05-09 0.524017663470185 600051.SH +2025-05-09 0.5205308388365613 002627.SZ +2025-05-09 0.5195782215053089 600308.SH +2025-05-12 0.5680157000763866 600107.SH +2025-05-12 0.5513472361064857 603580.SH +2025-05-12 0.5361465148203206 600599.SH +2025-05-12 0.5284541045338316 002779.SZ +2025-05-12 0.5261729504169856 003003.SZ +2025-05-13 0.5210012376735503 603696.SH +2025-05-13 0.5180067312425508 003003.SZ +2025-05-13 0.5172511366202515 002042.SZ +2025-05-13 0.5109344127753974 600248.SH +2025-05-13 0.5090820621498142 000796.SZ +2025-05-14 0.518624752638074 600107.SH +2025-05-14 0.510745439828272 600248.SH +2025-05-14 0.5064574868683113 600657.SH +2025-05-14 0.5057195347445462 002478.SZ +2025-05-14 0.5057195347445462 603336.SH +2025-05-15 0.5515931343933391 002042.SZ +2025-05-15 0.5466435753827111 603398.SH +2025-05-15 0.5201338233515974 601188.SH +2025-05-15 0.5062880387830124 600399.SH +2025-05-15 0.5058911839972406 000796.SZ +2025-05-16 0.5124106139335408 002571.SZ +2025-05-16 0.5077156767050429 002251.SZ +2025-05-16 0.5070956117261788 002131.SZ +2025-05-16 0.5062407534441172 000875.SZ +2025-05-16 0.5062407534441172 601789.SH +2025-05-19 0.6669403824458782 603014.SH +2025-05-19 0.5259647450920883 601188.SH +2025-05-19 0.5131783435030605 002042.SZ +2025-05-19 0.5094818860270377 600828.SH +2025-05-19 0.5094818860270377 002490.SZ +2025-05-20 0.5105453191187367 002490.SZ +2025-05-20 0.5077631117447703 601616.SH +2025-05-20 0.5074774556059805 002392.SZ +2025-05-20 0.5069065869847293 002131.SZ +2025-05-20 0.5069065869847293 002526.SZ +2025-05-21 0.5161866297338369 600615.SH +2025-05-21 0.5146014429165642 
600396.SH +2025-05-21 0.5127621070520616 002987.SZ +2025-05-21 0.5076436730811701 600855.SH +2025-05-21 0.5062786073876784 603878.SH +2025-05-22 0.5602275718550119 002987.SZ +2025-05-22 0.5149332831592093 000617.SZ +2025-05-22 0.5128946932546128 600787.SH +2025-05-22 0.5123376364999294 002551.SZ +2025-05-22 0.5064438479509945 000812.SZ +2025-05-23 0.5607595502258559 603429.SH +2025-05-23 0.5485581030353742 002987.SZ +2025-05-23 0.5115221694101192 601127.SH +2025-05-23 0.5066006223596465 002110.SZ +2025-05-23 0.5062407534441172 002225.SZ +2025-05-26 0.5323121465042445 002809.SZ +2025-05-26 0.5307120448488225 605138.SH +2025-05-26 0.5242624770812656 002987.SZ +2025-05-26 0.5203142580785934 000601.SZ +2025-05-26 0.5179555419687653 603363.SH +2025-05-27 0.5515055057635462 605138.SH +2025-05-27 0.5386995229214847 603429.SH +2025-05-27 0.5299128239702332 002987.SZ +2025-05-27 0.5220201124248427 601018.SH +2025-05-27 0.5194698926054124 601188.SH +2025-05-28 0.5369275719717898 003036.SZ +2025-05-28 0.5290764404714322 600575.SH +2025-05-28 0.5279184941908557 000863.SZ +2025-05-28 0.5257761894248653 600595.SH +2025-05-28 0.5208278859839193 002419.SZ +2025-05-29 0.6669403824458782 001390.SZ +2025-05-29 0.5452589553634714 600448.SH +2025-05-29 0.5290764404714322 002397.SZ +2025-05-29 0.5290764404714322 603363.SH +2025-05-29 0.5290764404714322 600575.SH +2025-05-30 0.535116652342961 002613.SZ +2025-05-30 0.5289323063323099 000507.SZ +2025-05-30 0.5240797839653469 002596.SZ +2025-05-30 0.5162322534143614 002575.SZ +2025-05-30 0.5149332831592093 601008.SH +2025-06-03 0.5454823420038027 605055.SH +2025-06-03 0.5450992816935564 600698.SH +2025-06-03 0.5238461451726196 002811.SZ +2025-06-03 0.5213807481656725 600770.SH +2025-06-03 0.5177545512881411 600149.SH +2025-06-04 0.5538808999940212 600698.SH +2025-06-04 0.5281781513402405 600770.SH +2025-06-04 0.5281533861052373 600448.SH +2025-06-04 0.5271158989738494 000096.SZ +2025-06-04 0.5262771214083012 600403.SH +2025-06-05 
0.6669403824458782 603049.SH +2025-06-05 0.542229301349679 600770.SH +2025-06-05 0.5310888011611477 000096.SZ +2025-06-05 0.5302922673899837 000715.SZ +2025-06-05 0.5281955222483072 002599.SZ +2025-06-06 0.5498718697458119 000096.SZ +2025-06-06 0.5467593142714535 603730.SH +2025-06-06 0.5428251312746337 000715.SZ +2025-06-06 0.5294768540665556 002889.SZ +2025-06-06 0.5168008602085614 605055.SH +2025-06-09 0.549005230154962 000096.SZ +2025-06-09 0.537538445742131 603730.SH +2025-06-09 0.5364841799206382 002889.SZ +2025-06-09 0.530842115323555 600936.SH +2025-06-09 0.5169332559702563 603269.SH +2025-06-10 0.55464361080565 603390.SH +2025-06-10 0.5366354494501869 002889.SZ +2025-06-10 0.5330131266030339 605377.SH +2025-06-10 0.5303443337374748 603730.SH +2025-06-10 0.5259305404171137 600546.SH +2025-06-11 0.5387561851960709 603003.SH +2025-06-11 0.5362268294017861 603329.SH +2025-06-11 0.529831272598744 605377.SH +2025-06-11 0.5265615066520655 000677.SZ +2025-06-11 0.5260729456251799 600936.SH +2025-06-12 0.6669403824458782 603382.SH +2025-06-12 0.5593320323165342 000677.SZ +2025-06-12 0.5588942462777563 603329.SH +2025-06-12 0.5324612012654403 001210.SZ +2025-06-12 0.5289369856185699 605377.SH +2025-06-13 0.5697318013954993 603003.SH +2025-06-13 0.5436378826363994 000677.SZ +2025-06-13 0.543296731052936 003025.SZ +2025-06-13 0.5249929388659182 605377.SH +2025-06-13 0.5142350880651031 002685.SZ +2025-06-16 0.5358732302918909 000859.SZ +2025-06-16 0.5353864503535868 003025.SZ +2025-06-16 0.5226174689038874 000695.SZ +2025-06-16 0.5182994300515202 600829.SH +2025-06-16 0.517730370195514 600082.SH +2025-06-17 0.5397963149123955 600082.SH +2025-06-17 0.5195782215053089 002131.SZ +2025-06-17 0.5179886611866528 603359.SH +2025-06-17 0.5178471867134444 002375.SZ +2025-06-17 0.5176797545742033 002431.SZ +2025-06-18 0.5710023471497209 002298.SZ +2025-06-18 0.5314522621191695 600082.SH +2025-06-18 0.5289323063323099 002546.SZ +2025-06-18 0.5203771586183269 000510.SZ +2025-06-18 
0.5192867029883689 002916.SZ +2025-06-19 0.5383401964784511 603519.SH +2025-06-19 0.5344757618074522 002303.SZ +2025-06-19 0.5313441956279837 600107.SH +2025-06-19 0.5281863100024857 603681.SH +2025-06-19 0.5257761894248653 600500.SH +2025-06-20 0.6669403824458782 603400.SH +2025-06-20 0.5351753669128975 600439.SH +2025-06-20 0.533842022464007 600448.SH +2025-06-20 0.5317887439092939 600107.SH +2025-06-20 0.5292280061167861 000632.SZ +2025-06-23 0.5711086503758361 002181.SZ +2025-06-23 0.5458513047520547 605388.SH +2025-06-23 0.5379239101616969 002172.SZ +2025-06-23 0.5372498529358267 603871.SH +2025-06-23 0.5371815102808061 600232.SH +2025-06-24 0.6387075433643968 603400.SH +2025-06-24 0.5563046499206835 002181.SZ +2025-06-24 0.5508935566412092 601279.SH +2025-06-24 0.5473929425745812 600232.SH +2025-06-24 0.5427432402972987 002172.SZ +2025-06-25 0.7027342936600967 603400.SH +2025-06-25 0.5396084451598557 601279.SH +2025-06-25 0.5310909201864787 002172.SZ +2025-06-25 0.5294564461378244 600960.SH +2025-06-25 0.5287042731979068 603037.SH +2025-06-26 0.529156294049306 002172.SZ +2025-06-26 0.5272791776397703 600480.SH +2025-06-26 0.5179555419687653 600303.SH +2025-06-26 0.517352717899504 601279.SH +2025-06-26 0.5162912092682229 002735.SZ +2025-06-27 0.5516561587455301 600480.SH +2025-06-27 0.5464583932314039 603037.SH +2025-06-27 0.5257978492925677 002892.SZ +2025-06-27 0.5100252249571439 603587.SH +2025-06-27 0.5064483525900161 603697.SH +2025-06-30 0.5179555419687653 000698.SZ +2025-06-30 0.5174081762086218 002892.SZ +2025-06-30 0.5160731169272524 600295.SH +2025-06-30 0.5129787285127705 600722.SH +2025-06-30 0.5121103836753264 600617.SH +2025-07-01 0.6669403824458782 001388.SZ +2025-07-01 0.5453557949215517 002892.SZ +2025-07-01 0.5212102176264158 600617.SH +2025-07-01 0.5144821798859033 600722.SH +2025-07-01 0.5077228295768573 000554.SZ +2025-07-02 0.5393720691752084 002207.SZ +2025-07-02 0.5158085932031415 605189.SH +2025-07-02 0.5142289246458795 000637.SZ 
+2025-07-02 0.5126734764286909 002700.SZ +2025-07-02 0.5124016528312786 000159.SZ +2025-07-03 0.5210012376735503 605189.SH +2025-07-03 0.5138861961464867 000637.SZ +2025-07-03 0.5108097318020007 000661.SZ +2025-07-03 0.5058816487197313 000554.SZ +2025-07-03 0.5057893248482864 603866.SH +2025-07-04 0.5137342295321232 002141.SZ +2025-07-04 0.5080808860352188 600110.SH +2025-07-04 0.5064438479509945 002687.SZ +2025-07-04 0.5064438479509945 601500.SH +2025-07-04 0.5064438479509945 002224.SZ +2025-07-07 0.5316698039923937 002109.SZ +2025-07-07 0.5140400139221889 002141.SZ +2025-07-07 0.5110428546997954 600115.SH +2025-07-07 0.5080961531869804 001330.SZ +2025-07-07 0.5069762768566503 600050.SH +2025-07-08 0.5318581027868793 002108.SZ +2025-07-08 0.5295116119546419 002397.SZ +2025-07-08 0.5259647450920883 002109.SZ +2025-07-08 0.5169085372339255 002762.SZ +2025-07-08 0.5164582399968147 002207.SZ +2025-07-09 0.551178085508194 600241.SH +2025-07-09 0.535116652342961 002397.SZ +2025-07-09 0.528066744613198 002108.SZ +2025-07-09 0.5078461078321801 600738.SH +2025-07-09 0.5073539612208279 002455.SZ +2025-07-10 0.5290764404714322 002108.SZ +2025-07-10 0.5157013807857321 002769.SZ +2025-07-10 0.5137735386488207 002397.SZ +2025-07-10 0.5130491869366306 600738.SH +2025-07-10 0.5090675271420955 600241.SH +2025-07-11 0.5294086009072477 603325.SH +2025-07-11 0.5260729456251799 002397.SZ +2025-07-11 0.5207195789387543 600738.SH +2025-07-11 0.5203188769486073 600735.SH +2025-07-11 0.5192867029883689 002821.SZ +2025-07-14 0.5220825162760148 600738.SH +2025-07-14 0.5140400139221889 601218.SH +2025-07-14 0.5091816942372954 000551.SZ +2025-07-14 0.5080065165404044 002658.SZ +2025-07-14 0.5073582226595034 000859.SZ +2025-07-15 0.5259344753158761 603803.SH +2025-07-15 0.5203771586183269 002424.SZ +2025-07-15 0.5150190047321577 603296.SH +2025-07-15 0.5083307455772172 603316.SH +2025-07-15 0.5081916801506979 002012.SZ +2025-07-16 0.6669403824458782 600930.SH +2025-07-16 0.5637525947434893 
002309.SZ +2025-07-16 0.5482190028602671 002581.SZ +2025-07-16 0.529349578877612 600039.SH +2025-07-16 0.5237245769000003 000889.SZ +2025-07-17 0.5286406736082913 002263.SZ +2025-07-17 0.5259344753158761 600039.SH +2025-07-17 0.5172578456633914 601101.SH +2025-07-17 0.5121666186315809 601010.SH +2025-07-17 0.5113139305546494 002177.SZ +2025-07-18 0.5208278859839193 600027.SH +2025-07-18 0.516612009471722 605006.SH +2025-07-18 0.5148248849090167 603113.SH +2025-07-18 0.5129787285127705 600229.SH +2025-07-18 0.5122750481447931 601010.SH +2025-07-21 0.7027342936600967 600930.SH +2025-07-21 0.5602677619779618 000903.SZ +2025-07-21 0.5256292685122648 002366.SZ +2025-07-21 0.5195817785669419 600403.SH +2025-07-21 0.5169092312460223 605006.SH +2025-07-22 0.61486236957925 000903.SZ +2025-07-22 0.529027199610689 603959.SH +2025-07-22 0.5188882944862926 002356.SZ +2025-07-22 0.5178471867134444 000720.SZ +2025-07-22 0.517019735866071 002218.SZ +2025-07-23 0.6669403824458782 603262.SH +2025-07-23 0.5386217893521608 002356.SZ +2025-07-23 0.5176583632438975 002141.SZ +2025-07-23 0.515669568888107 002822.SZ +2025-07-23 0.5101143450789728 600250.SH +2025-07-24 0.5295645625272489 603778.SH +2025-07-24 0.5248312020607117 002235.SZ +2025-07-24 0.5203750341785283 000903.SZ +2025-07-24 0.5150908046416858 002356.SZ +2025-07-24 0.5120818023990829 605366.SH +2025-07-25 0.5214304599725706 603778.SH +2025-07-25 0.518624752638074 601916.SH +2025-07-25 0.5074974609211786 600027.SH +2025-07-25 0.5074974609211786 002479.SZ +2025-07-25 0.5070956117261788 002141.SZ +2025-07-28 0.5334375570360811 002916.SZ +2025-07-28 0.5156157233044605 600822.SH +2025-07-28 0.5115300689823145 601107.SH +2025-07-28 0.5115239567079181 605162.SH +2025-07-28 0.5107791472497838 002141.SZ +2025-07-29 0.5170622974146505 600256.SH +2025-07-29 0.5142289246458795 000638.SZ +2025-07-29 0.5108097318020007 603259.SH +2025-07-29 0.5088902658690911 000601.SZ +2025-07-29 0.5084717010767641 600421.SH +2025-07-30 
0.6669403824458782 001221.SZ +2025-07-30 0.5149332831592093 002107.SZ +2025-07-30 0.5148248849090167 600930.SH +2025-07-30 0.5117540751866493 600439.SH +2025-07-30 0.5094515226902939 600826.SH +2025-07-31 0.5291066892495949 002266.SZ +2025-07-31 0.5202253266995881 000877.SZ +2025-07-31 0.5195782215053089 600396.SH +2025-07-31 0.5164534160576106 600822.SH +2025-07-31 0.5149332831592093 600753.SH +2025-08-01 0.5336017119045484 002266.SZ +2025-08-01 0.5257761894248653 601015.SH +2025-08-01 0.5247117033289409 600439.SH +2025-08-01 0.5233769415422445 002542.SZ +2025-08-01 0.5195782215053089 601188.SH +2025-08-04 0.5569407228959676 002266.SZ +2025-08-04 0.5336039591479221 001306.SZ +2025-08-04 0.5254970680391339 600439.SH +2025-08-04 0.5235577174355214 002096.SZ +2025-08-04 0.5195782215053089 601188.SH +2025-08-05 0.5257818250477888 002512.SZ +2025-08-05 0.5203036402538412 603819.SH +2025-08-05 0.5192738960079799 600744.SH +2025-08-05 0.5178471867134444 600115.SH +2025-08-05 0.5178038522073544 601015.SH +2025-08-06 0.5296496642223002 002542.SZ +2025-08-06 0.5288681588434305 002512.SZ +2025-08-06 0.5234910779589691 002896.SZ +2025-08-06 0.51870263095642 600777.SH +2025-08-06 0.5172823322968993 600663.SH +2025-08-07 0.5291146231953325 002512.SZ +2025-08-07 0.5234910779589691 603290.SH +2025-08-07 0.5148248849090167 002208.SZ +2025-08-07 0.5144798838507378 002037.SZ +2025-08-07 0.5140870467512906 600537.SH +2025-08-08 0.6669403824458782 603406.SH +2025-08-08 0.5421215789673404 603669.SH +2025-08-08 0.5300403289774189 001216.SZ +2025-08-08 0.5173389658079611 600107.SH +2025-08-08 0.5169963705486088 002512.SZ +2025-08-11 0.5276630558210795 001318.SZ +2025-08-11 0.5169581185947476 001216.SZ +2025-08-11 0.512851095429037 603616.SH +2025-08-11 0.5094492309413429 000516.SZ +2025-08-11 0.506799811492834 601216.SH +2025-08-12 0.5315222086193077 000679.SZ +2025-08-12 0.519582801282802 001318.SZ +2025-08-12 0.5110352388373823 000597.SZ +2025-08-12 0.506799811492834 600027.SH 
+2025-08-12 0.506799811492834 600998.SH +2025-08-13 0.538757178989356 600117.SH +2025-08-13 0.5259305404171137 600579.SH +2025-08-13 0.5235577174355214 001318.SZ +2025-08-13 0.5149332831592093 000525.SZ +2025-08-13 0.5116385050716121 000597.SZ +2025-08-14 0.5148248849090167 600238.SH +2025-08-14 0.5100463982075332 600539.SH +2025-08-14 0.5062407534441172 601216.SH +2025-08-14 0.5061322757265252 002538.SZ +2025-08-14 0.5061322757265252 600998.SH +2025-08-15 0.5548900952321907 600117.SH +2025-08-15 0.5398898015532435 002795.SZ +2025-08-15 0.508592800689105 002742.SZ +2025-08-15 0.5081916801506979 002398.SZ +2025-08-15 0.5070956117261788 600792.SH +2025-08-18 0.5738782734368069 600117.SH +2025-08-18 0.5309645980537814 601718.SH +2025-08-18 0.5192867029883689 603516.SH +2025-08-18 0.511722434421143 000558.SZ +2025-08-18 0.5093068224471816 002775.SZ +2025-08-19 0.5207195789387543 603398.SH +2025-08-19 0.5192738960079799 002480.SZ +2025-08-19 0.5173595009824935 603878.SH +2025-08-19 0.5169798402349087 600792.SH +2025-08-19 0.5163178922289325 000802.SZ +2025-08-20 0.5259647450920883 000761.SZ +2025-08-20 0.5205308388365613 603398.SH +2025-08-20 0.5147161662005285 601718.SH +2025-08-20 0.5146901234462072 002596.SZ +2025-08-20 0.5120818023990829 002480.SZ +2025-08-21 0.5228829187684927 601718.SH +2025-08-21 0.5221685385870379 000761.SZ +2025-08-21 0.5108097318020007 603986.SH +2025-08-21 0.5059432414859574 002360.SZ +2025-08-21 0.5059329787807083 002330.SZ +2025-08-22 0.5192867029883689 603986.SH +2025-08-22 0.5192107504177909 600808.SH +2025-08-22 0.506799811492834 603398.SH +2025-08-22 0.506799811492834 000950.SZ +2025-08-22 0.506799811492834 600256.SH +2025-08-25 0.5156039866864774 600808.SH +2025-08-25 0.5150190047321577 603019.SH +2025-08-25 0.5115300689823145 601665.SH +2025-08-25 0.5074974609211786 600256.SH +2025-08-25 0.5062880387830124 600063.SH +2025-08-26 0.5078046064828842 603221.SH +2025-08-26 0.5061322757265252 600738.SH +2025-08-26 0.5061322757265252 
601816.SH +2025-08-26 0.5052050973634397 002666.SZ +2025-08-26 0.5051870732773048 601096.SH +2025-08-27 0.5978889550464905 002305.SZ +2025-08-27 0.5158791642933552 603221.SH +2025-08-27 0.5115221694101192 603893.SH +2025-08-27 0.5108097318020007 603236.SH +2025-08-27 0.5064472297039752 603083.SH +2025-08-28 0.5357831647776657 603893.SH +2025-08-28 0.5336039591479221 603124.SH +2025-08-28 0.5192867029883689 603290.SH +2025-08-28 0.5162911266132231 603221.SH +2025-08-28 0.5143302235016333 002225.SZ +2025-08-29 0.5229758732518588 603221.SH +2025-08-29 0.5202253266995881 603980.SH +2025-08-29 0.5195782215053089 002437.SZ +2025-08-29 0.5179555419687653 600664.SH +2025-08-29 0.5163489167484762 002858.SZ +2025-09-01 0.5579671919498944 600658.SH +2025-09-01 0.5257036413547017 000889.SZ +2025-09-01 0.5248393833059358 603617.SH +2025-09-01 0.5178471867134444 600664.SH +2025-09-01 0.5177545512881411 603980.SH +2025-09-02 0.5317572447435938 002534.SZ +2025-09-02 0.5284541045338316 000661.SZ +2025-09-02 0.5275523725439486 600664.SH +2025-09-02 0.5266305456247979 600537.SH +2025-09-02 0.5227889876095065 603336.SH +2025-09-03 0.535116652342961 601599.SH +2025-09-03 0.5301462333144298 601869.SH +2025-09-03 0.526410328569605 603326.SH +2025-09-03 0.5257148174645001 603158.SH +2025-09-03 0.5221694020644978 600360.SH +2025-09-04 0.5333373315105012 603458.SH +2025-09-04 0.5324612012654403 002123.SZ +2025-09-04 0.5259647450920883 600664.SH +2025-09-04 0.5259344753158761 600234.SH +2025-09-04 0.5210986637536168 600271.SH +2025-09-05 0.6669403824458782 603370.SH +2025-09-05 0.5373956149682387 600728.SH +2025-09-05 0.5295205847835255 002640.SZ +2025-09-05 0.5291787670010305 603601.SH +2025-09-05 0.5290764404714322 601599.SH +2025-09-08 0.5517752644936849 600986.SH +2025-09-08 0.5417239869858262 603601.SH +2025-09-08 0.5336039591479221 603119.SH +2025-09-08 0.5301445991229797 002649.SZ +2025-09-08 0.526231221718517 002297.SZ +2025-09-09 0.549747924470271 600435.SH +2025-09-09 
0.5343258195125252 002277.SZ +2025-09-09 0.5286617661336549 002815.SZ +2025-09-09 0.526231221718517 000670.SZ +2025-09-09 0.5208278859839193 603601.SH +2025-09-10 0.5537305785096447 600744.SH +2025-09-10 0.5301308361482894 600435.SH +2025-09-10 0.5284541045338316 603516.SH +2025-09-10 0.522871366052352 002277.SZ +2025-09-10 0.5206268768675039 002413.SZ +2025-09-11 0.5825563460455111 002210.SZ +2025-09-11 0.5392698727103348 600435.SH +2025-09-11 0.5336039591479221 603124.SH +2025-09-11 0.5301335488378848 603177.SH +2025-09-11 0.5234910779589691 603083.SH +2025-09-12 0.5257761894248653 002217.SZ +2025-09-12 0.5207195789387543 002413.SZ +2025-09-12 0.5199835318621187 002210.SZ +2025-09-12 0.5192867029883689 603083.SH +2025-09-12 0.5192867029883689 603516.SH +2025-09-15 0.5472970747630104 603839.SH +2025-09-15 0.5440495302362878 603626.SH +2025-09-15 0.5330472282985935 002210.SZ +2025-09-15 0.5290764404714322 000407.SZ +2025-09-15 0.5192867029883689 002896.SZ +2025-09-16 0.5392073380823367 603839.SH +2025-09-16 0.522977653412587 002691.SZ +2025-09-16 0.5192867029883689 603083.SH +2025-09-16 0.5192867029883689 601869.SH +2025-09-16 0.5108097318020007 603516.SH +2025-09-17 0.522839363247788 603709.SH +2025-09-17 0.5192867029883689 603297.SH +2025-09-17 0.5172075410270989 002174.SZ +2025-09-17 0.5150190047321577 001283.SZ +2025-09-17 0.5138861961464867 002168.SZ +2025-09-18 0.5280642376241884 603709.SH +2025-09-18 0.5192867029883689 601869.SH +2025-09-18 0.5192867029883689 605288.SH +2025-09-18 0.5192867029883689 603297.SH +2025-09-18 0.5173595009824935 600358.SH +2025-09-19 0.5292280061167861 002721.SZ +2025-09-19 0.5220825162760148 002316.SZ +2025-09-19 0.5202253266995881 600507.SH +2025-09-19 0.5192867029883689 601869.SH +2025-09-19 0.5192867029883689 001309.SZ +2025-09-22 0.5290764404714322 002721.SZ +2025-09-22 0.5209796446328382 601778.SH +2025-09-22 0.5207195789387543 601003.SH +2025-09-22 0.5192867029883689 603019.SH +2025-09-22 0.5192867029883689 001309.SZ 
+2025-09-23 0.6669403824458782 603418.SH +2025-09-23 0.5411662091594641 001309.SZ +2025-09-23 0.5290764404714322 002397.SZ +2025-09-23 0.5254926778536082 603778.SH +2025-09-23 0.5207195789387543 002333.SZ +2025-09-24 0.5392059286348174 000972.SZ +2025-09-24 0.5346476148255608 600503.SH +2025-09-24 0.5308810420188524 002692.SZ +2025-09-24 0.5290764404714322 601599.SH +2025-09-24 0.5262182185763995 600130.SH +2025-09-25 0.5294086009072477 000988.SZ +2025-09-25 0.5259647450920883 600525.SH +2025-09-25 0.5244785263384821 603778.SH +2025-09-25 0.524186490228001 002692.SZ +2025-09-25 0.5230859175308072 002719.SZ +2025-09-26 0.5503662271537297 603778.SH +2025-09-26 0.5468004475511966 000428.SZ +2025-09-26 0.5308727072969106 600716.SH +2025-09-26 0.5301217237062711 000959.SZ +2025-09-26 0.5263936646664212 000718.SZ +2025-09-29 0.5932557593164053 000428.SZ +2025-09-29 0.5670273182706154 600448.SH +2025-09-29 0.5549175264141686 002217.SZ +2025-09-29 0.5505530618759115 002263.SZ +2025-09-29 0.5445932653852826 000718.SZ +2025-09-30 0.6669403824458782 001285.SZ +2025-09-30 0.5786351360102432 000428.SZ +2025-09-30 0.5527109872312292 002263.SZ +2025-09-30 0.5357831647776657 001309.SZ +2025-09-30 0.5352209692776468 002622.SZ +2025-10-09 0.5288735076238339 002622.SZ +2025-10-09 0.5262051259218314 000545.SZ +2025-10-09 0.5260729456251799 002263.SZ +2025-10-09 0.5206296858612343 002097.SZ +2025-10-09 0.5149332831592093 603956.SH +2025-10-10 0.606757468204688 002622.SZ +2025-10-10 0.5669848068365473 600624.SH +2025-10-10 0.5421215789673404 600658.SH +2025-10-10 0.5226353674128292 000721.SZ +2025-10-10 0.5208278859839193 002042.SZ +2025-10-13 0.5334375570360811 603516.SH +2025-10-13 0.5267596517862191 600624.SH +2025-10-13 0.5189666077848415 002356.SZ +2025-10-13 0.5121632067888446 000721.SZ +2025-10-13 0.5115300689823145 002659.SZ +2025-10-14 0.5258952914012429 600165.SH +2025-10-14 0.5220569777779845 600624.SH +2025-10-14 0.5210752793150646 002059.SZ +2025-10-14 0.5203750341785283 
002622.SZ +2025-10-14 0.5198560721779237 600977.SH +2025-10-15 0.5416299693968815 001330.SZ +2025-10-15 0.5318581027868793 000698.SZ +2025-10-15 0.5192867029883689 002779.SZ +2025-10-15 0.5170622974146505 002059.SZ +2025-10-15 0.5161866297338369 603429.SH +2025-10-16 0.5357831647776657 001309.SZ +2025-10-16 0.5260729456251799 002122.SZ +2025-10-16 0.5254307041644566 000698.SZ +2025-10-16 0.5215775357230317 603103.SH +2025-10-16 0.5173595009824935 002059.SZ +2025-10-17 0.6669403824458782 601026.SH +2025-10-17 0.5436378826363994 603335.SH +2025-10-17 0.5318208651273958 002193.SZ +2025-10-17 0.5232111562602314 600376.SH +2025-10-17 0.5227275460675327 605050.SH +2025-10-20 0.5542198914019132 002515.SZ +2025-10-20 0.5274715810404275 603809.SH +2025-10-20 0.5257357396563849 601519.SH +2025-10-20 0.5243850230152007 605050.SH +2025-10-20 0.5214694606436803 603618.SH +2025-10-21 0.5399718459160422 603618.SH +2025-10-21 0.528551727159285 002182.SZ +2025-10-21 0.5179555419687653 601618.SH +2025-10-21 0.5147102019586222 600376.SH +2025-10-21 0.5113139305546494 000967.SZ +2025-10-22 0.6669403824458782 001386.SZ +2025-10-22 0.5293305264816156 600468.SH +2025-10-22 0.5276711651816922 601618.SH +2025-10-22 0.526231221718517 002998.SZ +2025-10-22 0.521469242438557 603305.SH +2025-10-23 0.535116652342961 601618.SH +2025-10-23 0.5224835943742379 002196.SZ +2025-10-23 0.519700750715856 600255.SH +2025-10-23 0.5154701409400779 002998.SZ +2025-10-23 0.5149332831592093 000758.SZ +2025-10-24 0.6669403824458782 603175.SH +2025-10-24 0.5661570857190846 601026.SH +2025-10-24 0.5336039591479221 603124.SH +2025-10-24 0.5234910779589691 605288.SH +2025-10-24 0.5234910779589691 603160.SH +2025-10-27 0.5386364033186966 600857.SH +2025-10-27 0.5357831647776657 603986.SH +2025-10-27 0.5219351242774325 601212.SH +2025-10-27 0.514635987194688 002716.SZ +2025-10-27 0.5115221694101192 001309.SZ +2025-10-28 0.514635987194688 600333.SH +2025-10-28 0.5133335084224743 000010.SZ +2025-10-28 
0.5130347572908087 000889.SZ +2025-10-28 0.5112284398631035 000782.SZ +2025-10-28 0.5105915720835598 600617.SH +2025-10-29 0.7027342936600967 603175.SH +2025-10-29 0.5455562348041093 000626.SZ +2025-10-29 0.5397963149123955 600617.SH +2025-10-29 0.5365722351193363 603027.SH +2025-10-29 0.5120818023990829 600333.SH +2025-10-30 0.5727564714989891 600617.SH +2025-10-30 0.5308131309654015 603027.SH +2025-10-30 0.5217135378764437 000626.SZ +2025-10-30 0.5208278859839193 600168.SH +2025-10-30 0.5207195789387543 601212.SH +2025-10-31 0.5787228086014843 600617.SH +2025-10-31 0.5207195789387543 600168.SH +2025-10-31 0.5207195789387543 000868.SZ +2025-10-31 0.5183163360678433 002251.SZ +2025-10-31 0.5179555419687653 000692.SZ +2025-11-03 0.534924877254506 600617.SH +2025-11-03 0.5243608302115037 002678.SZ +2025-11-03 0.5233797777110492 002542.SZ +2025-11-03 0.5208278859839193 002585.SZ +2025-11-03 0.5208278859839193 600168.SH +2025-11-04 0.6334108285080842 603175.SH +2025-11-04 0.5239557735459798 002678.SZ +2025-11-04 0.5207195789387543 000868.SZ +2025-11-04 0.5205617801737676 600168.SH +2025-11-04 0.5203771586183269 002766.SZ +2025-11-05 0.6669403824458782 603334.SH +2025-11-05 0.5205308388365613 601339.SH +2025-11-05 0.5201302686329659 002766.SZ +2025-11-05 0.5192107504177909 000090.SZ +2025-11-05 0.5176583632438975 002542.SZ +2025-11-06 0.6669403824458782 603376.SH +2025-11-06 0.5349437299046987 601618.SH +2025-11-06 0.5259305404171137 000600.SZ +2025-11-06 0.517352717899504 600248.SH +2025-11-06 0.5150190047321577 603040.SH +2025-11-07 0.6669403824458782 603092.SH +2025-11-07 0.5227889876095065 000600.SZ +2025-11-07 0.5194698926054124 601618.SH +2025-11-07 0.513454254303403 002337.SZ +2025-11-07 0.5122750481447931 600169.SH +2025-11-10 0.5234910779589691 601888.SH +2025-11-10 0.5233214191633075 000981.SZ +2025-11-10 0.5168008602085614 603378.SH +2025-11-10 0.5165434716045726 601618.SH +2025-11-10 0.5118573629020219 000679.SZ +2025-11-11 0.5114452710217089 000757.SZ 
+2025-11-11 0.5057893248482864 002766.SZ +2025-11-11 0.5055903451732487 601216.SH +2025-11-11 0.5053462265996771 002044.SZ +2025-11-11 0.5051870732773048 600963.SH +2025-11-12 0.5294123016954966 000981.SZ +2025-11-12 0.5116342321245925 000757.SZ +2025-11-12 0.5064438479509945 000802.SZ +2025-11-12 0.5064438479509945 002042.SZ +2025-11-12 0.5064438479509945 000088.SZ +2025-11-13 0.5129787285127705 600120.SH +2025-11-13 0.510685819260391 600416.SH +2025-11-13 0.5094954603413556 002211.SZ +2025-11-13 0.5062407534441172 600515.SH +2025-11-13 0.5057195347445462 000931.SZ +2025-11-14 0.5226681338608951 600416.SH +2025-11-14 0.513082107892262 002480.SZ +2025-11-14 0.5098171825913262 600375.SH +2025-11-14 0.509575707799716 000524.SZ +2025-11-14 0.5086804480442645 002164.SZ +2025-11-17 0.5434544826661134 603759.SH +2025-11-17 0.5253186667209742 600581.SH +2025-11-17 0.5097395009973196 002181.SZ +2025-11-17 0.5093884986165883 000524.SZ +2025-11-17 0.5063330752943594 600375.SH +2025-11-18 0.5294086009072477 605255.SH +2025-11-18 0.5255109699121789 603092.SH +2025-11-18 0.5241190636672188 600292.SH +2025-11-18 0.5196292030484984 603759.SH +2025-11-18 0.5159647339612334 002492.SZ +2025-11-19 0.5357831647776657 603129.SH +2025-11-19 0.5214600555100268 603759.SH +2025-11-19 0.5147101098480353 600094.SH +2025-11-19 0.514635987194688 000027.SZ +2025-11-19 0.5130347572908087 600939.SH +2025-11-20 0.5318208651273958 002522.SZ +2025-11-20 0.5298249062594883 600667.SH +2025-11-20 0.5292280061167861 000720.SZ +2025-11-20 0.5276630558210795 002112.SZ +2025-11-20 0.5184728793030473 601399.SH +2025-11-21 0.5557953906852645 600939.SH +2025-11-21 0.5463342922136836 000809.SZ +2025-11-21 0.5296286995283527 000514.SZ +2025-11-21 0.5277364647770626 002102.SZ +2025-11-21 0.5257761894248653 600708.SH +2025-11-24 0.5708682177963273 002160.SZ +2025-11-24 0.5684724381631168 600802.SH +2025-11-24 0.5632244428348601 002309.SZ +2025-11-24 0.5591187020609228 000809.SZ +2025-11-24 0.5524793574711755 
000631.SZ +2025-11-25 0.6669403824458782 001233.SZ +2025-11-25 0.574307667182978 601011.SH +2025-11-25 0.5669124419405613 000631.SZ +2025-11-25 0.5588469864728527 002374.SZ +2025-11-25 0.5552114736653683 603030.SH +2025-11-26 0.5752947449644089 601011.SH +2025-11-26 0.5656926249079262 600807.SH +2025-11-26 0.5601931195838576 600617.SH +2025-11-26 0.559715459117151 002256.SZ +2025-11-26 0.5540608597415699 600798.SH +2025-11-27 0.5826618787963105 600617.SH +2025-11-27 0.5809430149117796 601908.SH +2025-11-27 0.561305310075055 601011.SH +2025-11-27 0.5595889780873284 600303.SH +2025-11-27 0.5546366793786385 600807.SH +2025-11-28 0.5509003340456048 600617.SH +2025-11-28 0.5410583063546074 601908.SH +2025-11-28 0.5376307909297392 000797.SZ +2025-11-28 0.535116652342961 002132.SZ +2025-11-28 0.5311380079716314 002691.SZ +2025-12-01 0.5454842829756713 601566.SH +2025-12-01 0.5286904261151575 002691.SZ +2025-12-01 0.5259647450920883 002132.SZ +2025-12-01 0.5192294534036106 002404.SZ +2025-12-01 0.51914944010669 600338.SH +2025-12-02 0.5351753669128975 600383.SH +2025-12-02 0.5293321778178374 000002.SZ +2025-12-02 0.5150119387056977 002205.SZ +2025-12-02 0.514635987194688 000968.SZ +2025-12-02 0.5088944832379734 002691.SZ +2025-12-03 0.6669403824458782 001280.SZ +2025-12-03 0.5200277459621825 002205.SZ +2025-12-03 0.5059329787807083 600738.SH +2025-12-03 0.5059329787807083 002145.SZ +2025-12-03 0.5059329787807083 000723.SZ +2025-12-04 0.5243608302115037 000002.SZ +2025-12-04 0.5086255830848415 000892.SZ +2025-12-04 0.5061322757265252 600050.SH +2025-12-04 0.5061322757265252 601669.SH +2025-12-04 0.505974230757272 600658.SH +2025-12-05 0.5260729456251799 601929.SH +2025-12-05 0.5194698926054124 002127.SZ +2025-12-05 0.5192867029883689 002779.SZ +2025-12-05 0.5172884903900697 002205.SZ +2025-12-05 0.514635987194688 000567.SZ +2025-12-08 0.5376131262635266 603398.SH +2025-12-08 0.5335642996732787 002811.SZ +2025-12-08 0.5307120448488225 002291.SZ +2025-12-08 0.5178038522073544 
601929.SH +2025-12-08 0.5172823322968993 000676.SZ +2025-12-09 0.5404894018708571 002291.SZ +2025-12-09 0.5265064083159695 002811.SZ +2025-12-09 0.5260729456251799 002168.SZ +2025-12-09 0.5158900518232388 002141.SZ +2025-12-09 0.5152424033193611 600986.SH +2025-12-10 0.5869553596904462 002689.SZ +2025-12-10 0.5294086009072477 601869.SH +2025-12-10 0.5284174266960939 000695.SZ +2025-12-10 0.5277364647770626 002168.SZ +2025-12-10 0.5260729456251799 000889.SZ +2025-12-11 0.5541392269101565 001280.SZ +2025-12-11 0.5466581511209122 000638.SZ +2025-12-11 0.5417505813506529 000695.SZ +2025-12-11 0.5338886826939419 603176.SH +2025-12-11 0.5307460824444339 002689.SZ +2025-12-12 0.600272759621709 002689.SZ +2025-12-12 0.5208278859839193 000789.SZ +2025-12-12 0.5203771586183269 600616.SH +2025-12-12 0.5173595009824935 000937.SZ +2025-12-12 0.5173595009824935 002563.SZ +2025-12-15 0.5535428869099189 001280.SZ +2025-12-15 0.5360280914291116 603176.SH +2025-12-15 0.5270577999669818 002551.SZ +2025-12-15 0.5260729456251799 002060.SZ +2025-12-15 0.524017663470185 002686.SZ +2025-12-16 0.572690393905651 000536.SZ +2025-12-16 0.5627205484266725 603815.SH +2025-12-16 0.5410429736449522 000068.SZ +2025-12-16 0.5326635614877283 002634.SZ +2025-12-16 0.529027199610689 002193.SZ +2025-12-17 0.5951304562201396 002589.SZ +2025-12-17 0.5606871732446959 002390.SZ +2025-12-17 0.5517174366399105 000509.SZ +2025-12-17 0.5436378826363994 002193.SZ +2025-12-17 0.5375693919463156 600599.SH +2025-12-18 0.5567503731149535 000701.SZ +2025-12-18 0.5558441068891651 600696.SH +2025-12-18 0.5404570196520365 603815.SH +2025-12-18 0.5384676192562836 605299.SH +2025-12-18 0.5352009523422904 000020.SZ +2025-12-19 0.5550086601660389 600846.SH +2025-12-19 0.5539777914846407 000020.SZ +2025-12-19 0.5517196665917564 600815.SH +2025-12-19 0.5468004475511966 002589.SZ +2025-12-19 0.5361095563091214 600734.SH +2025-12-22 0.576437350624106 600815.SH +2025-12-22 0.5708170878939591 600802.SH +2025-12-22 
0.5582571013351643 600696.SH +2025-12-22 0.5405691530605988 002589.SZ +2025-12-22 0.5293145743852032 603879.SH +2025-12-23 0.5360150094122866 600846.SH +2025-12-23 0.5192867029883689 603929.SH +2025-12-23 0.5192867029883689 002837.SZ +2025-12-23 0.514956054042036 000685.SZ +2025-12-23 0.5127101187507722 600939.SH +2025-12-24 0.5330131266030339 002682.SZ +2025-12-24 0.5314281167618613 000838.SZ +2025-12-24 0.5207195789387543 600624.SH +2025-12-24 0.5170622974146505 000035.SZ +2025-12-24 0.5160528360618574 002238.SZ +2025-12-25 0.5208278859839193 600624.SH +2025-12-25 0.5174997854530934 000838.SZ +2025-12-25 0.5169085372339255 000035.SZ +2025-12-25 0.5108097318020007 605123.SH +2025-12-25 0.5071739242869363 002343.SZ +2025-12-26 0.518589360677003 000078.SZ +2025-12-26 0.5141650961836706 600180.SH +2025-12-26 0.5115221694101192 001309.SZ +2025-12-26 0.5089666327619147 002697.SZ +2025-12-26 0.5082868674245398 600624.SH +2025-12-29 0.5399583041498672 000078.SZ +2025-12-29 0.5179555419687653 002694.SZ +2025-12-29 0.5116385050716121 002016.SZ +2025-12-29 0.5115221694101192 605255.SH +2025-12-29 0.5108097318020007 605123.SH +2025-12-30 0.5806552000864679 600800.SH +2025-12-30 0.5713532860246767 000890.SZ +2025-12-30 0.5308261919014939 000078.SZ +2025-12-30 0.5192867029883689 603119.SH +2025-12-30 0.5177545512881411 002044.SZ +2025-12-31 0.5764823074649972 600800.SH +2025-12-31 0.5323336836032145 000078.SZ +2025-12-31 0.5295773547573392 600730.SH +2025-12-31 0.5221685385870379 002047.SZ +2025-12-31 0.5208278859839193 002251.SZ diff --git a/src/training/pipeline.py b/src/training/pipeline.py new file mode 100644 index 0000000..8d93577 --- /dev/null +++ b/src/training/pipeline.py @@ -0,0 +1,448 @@ +"""训练管道 - 包含数据处理、模型训练和预测功能 + +本模块提供: +1. 数据准备:从因子计算结果中准备训练/测试数据 +2. 数据处理:Fillna(0) -> Dropna +3. 模型训练:使用LightGBM训练分类模型 +4. 
预测和选股:输出每日top5股票池 +""" + +from datetime import datetime +from pathlib import Path +from typing import List, Optional, Tuple + +import numpy as np +import polars as pl + +from src.factors import DataLoader, FactorEngine +from src.factors.data_spec import DataSpec +from src.pipeline import ( + DropNAProcessor, + FillNAProcessor, + LightGBMModel, + PipelineStage, + ProcessingPipeline, + TaskType, +) + + +def prepare_data( + data_dir: str = "data", + train_start: str = "20180101", + train_end: str = "20230101", + test_start: str = "20230101", + test_end: str = "20240101", +) -> Tuple[pl.DataFrame, pl.DataFrame]: + """准备训练和测试数据 + + 从DuckDB加载原始日线数据,计算所需因子并生成标签。 + + Args: + data_dir: 数据目录 + train_start: 训练集开始日期 + train_end: 训练集结束日期 + test_start: 测试集开始日期 + test_end: 测试集结束日期 + + Returns: + (train_data, test_data): 训练集和测试集的DataFrame + """ + from src.data.storage import Storage + + storage = Storage() + + # 加载日线数据(需要更多历史数据用于计算因子) + # 训练集需要更多历史数据(用于计算因子lookback) + lookback_days = 20 # 足够计算MA10和5日收益率 + start_with_lookback = str(int(train_start) - 10000) # 往前取一年 + + # 查询训练集数据 + # 注意:DuckDB 中 trade_date 是 DATE 类型,需要转换 + start_dt = f"{start_with_lookback[:4]}-{start_with_lookback[4:6]}-{start_with_lookback[6:8]}" + end_dt = f"{train_end[:4]}-{train_end[4:6]}-{train_end[6:8]}" + train_query = f""" + SELECT ts_code, trade_date, close, pre_close + FROM daily + WHERE trade_date >= '{start_dt}' AND trade_date <= '{end_dt}' + ORDER BY ts_code, trade_date + """ + train_raw = storage._connection.sql(train_query).pl() + # 转换 trade_date 为字符串格式 + train_raw = train_raw.with_columns( + pl.col("trade_date").dt.strftime("%Y-%m-%d").alias("trade_date") + ) + + # 查询测试集数据(也需要历史数据计算因子) + test_start_dt = f"{test_start[:4]}-{test_start[4:6]}-{test_start[6:8]}" + test_end_dt = f"{test_end[:4]}-{test_end[4:6]}-{test_end[6:8]}" + test_query = f""" + SELECT ts_code, trade_date, close, pre_close + FROM daily + WHERE trade_date >= '{test_start_dt}' AND trade_date <= '{test_end_dt}' + ORDER BY ts_code, 
trade_date + """ + test_raw = storage._connection.sql(test_query).pl() + # 转换 trade_date 为字符串格式 + test_raw = test_raw.with_columns( + pl.col("trade_date").dt.strftime("%Y-%m-%d").alias("trade_date") + ) + + # 过滤不符合条件的股票 + train_raw = _filter_invalid_stocks(train_raw) + test_raw = _filter_invalid_stocks(test_raw) + print(f"[PrepareData] After filtering: train={len(train_raw)}, test={len(test_raw)}") + + # 计算因子和标签 + train_data = _compute_features_and_label(train_raw, train_start, train_end) + test_data = _compute_features_and_label(test_raw, test_start, test_end) + + return train_data, test_data + + +def _filter_invalid_stocks(df: pl.DataFrame) -> pl.DataFrame: + """过滤不符合条件的股票 + + 过滤规则: + 1. 过滤北交所股票(ts_code 以 BJ 结尾) + 2. 过滤创业板股票(ts_code 以 30 开头) + 3. 过滤科创板股票(ts_code 以 68 开头) + 4. 过滤退市/风险股票(ts_code 以 8 开头) + + Args: + df: 原始数据 + + Returns: + 过滤后的数据 + """ + ts_code_col = pl.col("ts_code") + + return df.filter( + ~ts_code_col.str.ends_with("BJ") + & ~ts_code_col.str.starts_with("30") + & ~ts_code_col.str.starts_with("68") + & ~ts_code_col.str.starts_with("8") + ) + + +def _compute_features_and_label( + raw_data: pl.DataFrame, + start_date: str, + end_date: str, +) -> pl.DataFrame: + """计算因子和标签 + + 因子: + 1. return_5_rank: 5日收益率截面排名 + 2. ma_5: 5日移动平均 + 3. 
ma_10: 10日移动平均 + + 标签:未来5日收益率大于0为1,否则为0 + + Args: + raw_data: 原始日线数据 + start_date: 开始日期 + end_date: 结束日期 + + Returns: + 包含因子和标签的DataFrame + """ + # 确保按日期排序 + raw_data = raw_data.sort(["ts_code", "trade_date"]) + + # 计算收益率(未来5日) + raw_data = raw_data.with_columns( + [ + # 当日收益率 + ((pl.col("close") - pl.col("pre_close")) / pl.col("pre_close")).alias( + "daily_return" + ), + ] + ) + + # 按股票分组计算 + result_list = [] + + for ts_code in raw_data["ts_code"].unique(): + stock_data = raw_data.filter(pl.col("ts_code") == ts_code).sort("trade_date") + + if len(stock_data) < 20: + continue + + # 计算MA5和MA10 + stock_data = stock_data.with_columns( + [ + pl.col("close").rolling_mean(5).alias("ma_5"), + pl.col("close").rolling_mean(10).alias("ma_10"), + ] + ) + + # 计算未来5日收益率(用于标签) + future_return = stock_data["close"].shift(-5) - stock_data["close"] + future_return_pct = future_return / stock_data["close"] + stock_data = stock_data.with_columns( + [ + future_return_pct.alias("future_return_5"), + ] + ) + + # 生成标签:收益率>0为1,否则为0 + stock_data = stock_data.with_columns( + [ + (pl.col("future_return_5") > 0).cast(pl.Int8).alias("label"), + ] + ) + + result_list.append(stock_data) + + if not result_list: + return pl.DataFrame() + + result = pl.concat(result_list) + + # 转换日期格式:YYYYMMDD -> YYYY-MM-DD + start_date_formatted = f"{start_date[:4]}-{start_date[4:6]}-{start_date[6:8]}" + end_date_formatted = f"{end_date[:4]}-{end_date[4:6]}-{end_date[6:8]}" + + # 过滤有效日期范围 + result = result.filter( + (pl.col("trade_date") >= start_date_formatted) & (pl.col("trade_date") <= end_date_formatted) + ) + + # 计算5日收益率排名(截面) + result = result.with_columns( + [ + pl.col("daily_return") + .rank(method="average") + .over("trade_date") + .alias("return_5_rank") + ] + ) + + # 归一化排名到0-1 + result = result.with_columns( + [ + ( + pl.col("return_5_rank") + / pl.col("return_5_rank").max().over("trade_date") + ).alias("return_5_rank") + ] + ) + + # 选择需要的列 + feature_cols = ["trade_date", "ts_code", "return_5_rank", 
"ma_5", "ma_10", "label"] + result = result.select(feature_cols) + + return result + + +def create_pipeline() -> ProcessingPipeline: + """创建数据处理流水线 + + 处理流程: + 1. FillNA(0): 将缺失值填充为0 + + 注意:不使用 Dropna,因为会导致训练和预测时的行数不匹配 + + Returns: + 配置好的ProcessingPipeline + """ + processors = [ + FillNAProcessor(method="zero"), # 缺失值填充为0 + ] + return ProcessingPipeline(processors) + + +def train_model( + train_data: pl.DataFrame, + feature_cols: List[str], + label_col: str = "label", + model_params: Optional[dict] = None, +) -> Tuple[LightGBMModel, ProcessingPipeline]: + """训练LightGBM分类模型 + + Args: + train_data: 训练数据 + feature_cols: 特征列名列表 + label_col: 标签列名 + model_params: 模型参数字典 + + Returns: + (训练好的模型, 处理流水线) + """ + # 创建处理流水线 + pipeline = create_pipeline() + print("[TrainModel] Pipeline created: FillNA(0)") + + # 准备特征和标签 + X_train = train_data.select(feature_cols) + y_train = train_data[label_col] + print(f"[TrainModel] Raw samples: {len(X_train)}, features: {feature_cols}") + + # 处理数据 + X_train_processed = pipeline.fit_transform(X_train, stage=PipelineStage.TRAIN) + print(f"[TrainModel] After processing: {len(X_train_processed)} samples") + + # 过滤有效标签(排除-1等无效值) + valid_mask = y_train.is_in([0, 1]) + X_train_processed = X_train_processed.filter(valid_mask) + y_train = y_train.filter(valid_mask) + print(f"[TrainModel] After filtering valid labels: {len(X_train_processed)} samples") + print(f"[TrainModel] Label distribution: {dict(y_train.value_counts().sort('label').iter_rows())}") + + # 创建模型 + params = model_params or { + "n_estimators": 100, + "learning_rate": 0.05, + "max_depth": 5, + "num_leaves": 31, + } + print(f"[TrainModel] Model params: {params}") + model = LightGBMModel( + task_type="classification", + params=params, + ) + + # 训练模型 + print("[TrainModel] Training LightGBM...") + model.fit(X_train_processed, y_train) + print("[TrainModel] Training completed!") + + return model, pipeline + + +def predict_top_stocks( + model: LightGBMModel, + pipeline: ProcessingPipeline, + 
test_data: pl.DataFrame, + feature_cols: List[str], + top_n: int = 5, +) -> pl.DataFrame: + """预测并选出每日top N股票 + + Args: + model: 训练好的模型 + pipeline: 数据处理流水线 + test_data: 测试数据 + feature_cols: 特征列名 + top_n: 每日选出的股票数量 + + Returns: + 包含日期和股票代码的DataFrame + """ + # 准备特征和必要列 + X_test = test_data.select(feature_cols) + key_cols = ["trade_date", "ts_code"] + key_data = test_data.select(key_cols) + + print(f"[Predict] Test samples: {len(X_test)}, top_n: {top_n}") + + # 处理数据(使用训练阶段的参数) + X_test_processed = pipeline.transform(X_test, stage=PipelineStage.TEST) + print(f"[Predict] Data processed, shape: {X_test_processed.shape}") + + # 预测概率 + probs = model.predict_proba(X_test_processed) + print(f"[Predict] Predictions generated, probability shape: {probs.shape}") + + # 使用 key_data 添加预测结果,保持行数一致 + result = key_data.with_columns( + pl.Series( + name="pred_prob", values=probs[:, 1] if len(probs.shape) > 1 and probs.shape[1] > 1 else probs.flatten() + ), + ) + + # 每日选出top N + top_stocks = [] + for date in result["trade_date"].unique().sort(): + day_data = result.filter(pl.col("trade_date") == date) + + # 按概率降序排序,选出top N + day_top = day_data.sort("pred_prob", descending=True).head(top_n) + + top_stocks.append(day_top.select(["trade_date", "pred_prob", "ts_code"]).rename({"pred_prob": "score"})) + + return pl.concat(top_stocks) + + +def save_top_stocks(top_stocks: pl.DataFrame, output_path: str) -> None: + """保存选股结果到TSV文件 + + Args: + top_stocks: 选股结果 + output_path: 输出文件路径 + """ + # 转换为pandas并保存为TSV + df = top_stocks.to_pandas() + df.to_csv(output_path, sep="\t", index=False) + print(f"[Training] Top stocks saved to: {output_path}") + + +def run_training( + data_dir: str = "data", + output_path: str = "output/top_stocks.tsv", + train_start: str = "20180101", + train_end: str = "20230101", + test_start: str = "20230101", + test_end: str = "20240101", + top_n: int = 5, +) -> pl.DataFrame: + """运行完整训练流程 + + Args: + data_dir: 数据目录 + output_path: 输出文件路径 + train_start: 训练集开始日期 + train_end: 
训练集结束日期 + test_start: 测试集开始日期 + test_end: 测试集结束日期 + top_n: 每日选股数量 + + Returns: + 选股结果DataFrame + """ + print(f"[Training] Starting training pipeline...") + print(f"[Training] Train period: {train_start} -> {train_end}") + print(f"[Training] Test period: {test_start} -> {test_end}") + + # 1. 准备数据 + print("[Training] Preparing data...") + train_data, test_data = prepare_data( + data_dir=data_dir, + train_start=train_start, + train_end=train_end, + test_start=test_start, + test_end=test_end, + ) + print(f"[Training] Train samples: {len(train_data)}") + print(f"[Training] Test samples: {len(test_data)}") + + # 2. 定义特征列 + feature_cols = ["return_5_rank", "ma_5", "ma_10"] + label_col = "label" + + # 3. 训练模型 + print("[Training] Training model...") + model, pipeline = train_model( + train_data=train_data, + feature_cols=feature_cols, + label_col=label_col, + ) + + # 4. 测试集预测 + print("[Training] Predicting on test set...") + top_stocks = predict_top_stocks( + model=model, + pipeline=pipeline, + test_data=test_data, + feature_cols=feature_cols, + top_n=top_n, + ) + + # 5. 
保存结果 + print(f"[Training] Saving results to {output_path}...") + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + save_top_stocks(top_stocks, output_path) + + print("[Training] Training completed!") + + return top_stocks diff --git a/tests/models/test_core.py b/tests/pipeline/test_core.py similarity index 94% rename from tests/models/test_core.py rename to tests/pipeline/test_core.py index 23442b4..4b0173b 100644 --- a/tests/models/test_core.py +++ b/tests/pipeline/test_core.py @@ -1,4 +1,4 @@ -"""模型框架核心测试 +"""Pipeline 组件库核心测试 测试核心抽象类、插件注册中心、处理器、模型和划分策略。 """ @@ -9,7 +9,7 @@ import numpy as np from typing import List, Optional # 确保导入时注册所有组件 -from src.models import ( +from src.pipeline import ( PluginRegistry, PipelineStage, BaseProcessor, @@ -17,7 +17,7 @@ from src.models import ( BaseSplitter, ProcessingPipeline, ) -from src.models.core import TaskType +from src.pipeline.core import TaskType # ========== 测试核心抽象类 ========== @@ -232,7 +232,7 @@ class TestBuiltInProcessors: def test_dropna_processor(self): """测试缺失值删除处理器""" - from src.models.processors import DropNAProcessor + from src.pipeline.processors import DropNAProcessor processor = DropNAProcessor(columns=["a", "b"]) df = pl.DataFrame({"a": [1, None, 3], "b": [4, 5, None], "c": [7, 8, 9]}) @@ -246,7 +246,7 @@ class TestBuiltInProcessors: def test_fillna_processor(self): """测试缺失值填充处理器""" - from src.models.processors import FillNAProcessor + from src.pipeline.processors import FillNAProcessor processor = FillNAProcessor(columns=["a"], method="mean") df = pl.DataFrame({"a": [1.0, 2.0, None, 4.0]}) @@ -258,7 +258,7 @@ class TestBuiltInProcessors: def test_standard_scaler(self): """测试标准化处理器""" - from src.models.processors import StandardScaler + from src.pipeline.processors import StandardScaler processor = StandardScaler(columns=["value"]) df = pl.DataFrame({"value": [1.0, 2.0, 3.0, 4.0, 5.0]}) @@ -271,7 +271,7 @@ class TestBuiltInProcessors: def test_winsorizer(self): """测试缩尾处理器""" - from 
src.models.processors import Winsorizer + from src.pipeline.processors import Winsorizer processor = Winsorizer(columns=["value"], lower=0.1, upper=0.9) df = pl.DataFrame( @@ -288,7 +288,7 @@ class TestBuiltInProcessors: def test_rank_transformer(self): """测试排名转换处理器""" - from src.models.processors import RankTransformer + from src.pipeline.processors import RankTransformer processor = RankTransformer(columns=["value"]) df = pl.DataFrame( @@ -302,7 +302,7 @@ class TestBuiltInProcessors: def test_neutralizer(self): """测试中性化处理器""" - from src.models.processors import Neutralizer + from src.pipeline.processors import Neutralizer processor = Neutralizer(columns=["value"], group_col="industry") df = pl.DataFrame( @@ -331,7 +331,7 @@ class TestProcessingPipeline: def test_pipeline_fit_transform(self): """测试流水线的 fit_transform""" - from src.models.processors import StandardScaler + from src.pipeline.processors import StandardScaler scaler1 = StandardScaler(columns=["a"]) scaler2 = StandardScaler(columns=["b"]) @@ -348,7 +348,7 @@ class TestProcessingPipeline: def test_pipeline_transform_uses_fitted_params(self): """测试 transform 使用已 fit 的参数""" - from src.models.processors import StandardScaler + from src.pipeline.processors import StandardScaler scaler = StandardScaler(columns=["value"]) pipeline = ProcessingPipeline([scaler]) @@ -383,7 +383,7 @@ class TestSplitters: def test_time_series_split(self): """测试时间序列划分""" - from src.models.core import TimeSeriesSplit + from src.pipeline.core import TimeSeriesSplit splitter = TimeSeriesSplit(n_splits=2, gap=1, min_train_size=3) @@ -406,7 +406,7 @@ class TestSplitters: def test_walk_forward_split(self): """测试滚动前向划分""" - from src.models.core import WalkForwardSplit + from src.pipeline.core import WalkForwardSplit splitter = WalkForwardSplit(train_window=5, test_window=2, gap=1) @@ -426,7 +426,7 @@ class TestSplitters: def test_expanding_window_split(self): """测试扩展窗口划分""" - from src.models.core import ExpandingWindowSplit + from 
src.pipeline.core import ExpandingWindowSplit splitter = ExpandingWindowSplit(initial_train_size=3, test_window=2, gap=1) @@ -455,7 +455,7 @@ class TestModels: @pytest.mark.skip(reason="需要安装 lightgbm") def test_lightgbm_model(self): """测试 LightGBM 模型""" - from src.models.models import LightGBMModel + from src.pipeline.models import LightGBMModel model = LightGBMModel(task_type="regression", params={"n_estimators": 10}) diff --git a/tests/test_sync.py b/tests/test_sync.py index c342791..caf78c7 100644 --- a/tests/test_sync.py +++ b/tests/test_sync.py @@ -1,277 +1,163 @@ -"""Tests for data synchronization module. +"""Sync 接口测试规范与实现。 -Tests the sync module's full/incremental sync logic for daily data: -- Full sync when local data doesn't exist (from 20180101) -- Incremental sync when local data exists (from last_date + 1) -- Data integrity validation +【测试规范】 +1. 所有 sync 测试只使用 2018-01-01 到 2018-04-01 的数据 +2. 只测试接口是否能正常返回数据,不测试落库逻辑 +3. 对于按股票查询的接口,只测试 000001.SZ、000002.SZ 两支股票 +4. 使用真实 API 调用,确保接口可用性 + +【测试范围】 +- get_daily: 日线数据接口(按股票) +- sync_all_stocks: 股票基础信息接口 +- sync_trade_cal_cache: 交易日历接口 +- sync_namechange: 名称变更接口 +- sync_bak_basic: 备用股票基础信息接口 """ import pytest import pandas as pd -from unittest.mock import Mock, patch, MagicMock -from datetime import datetime, timedelta +from datetime import datetime -from src.data.sync import ( - DataSync, - sync_all, - get_today_date, - get_next_date, - DEFAULT_START_DATE, -) -from src.data.storage import ThreadSafeStorage -from src.data.client import TushareClient +# 测试用常量 +TEST_START_DATE = "20180101" +TEST_END_DATE = "20180401" +TEST_STOCK_CODES = ["000001.SZ", "000002.SZ"] -@pytest.fixture -def mock_storage(): - """Create a mock storage instance.""" - storage = Mock(spec=ThreadSafeStorage) - storage.exists = Mock(return_value=False) - storage.load = Mock(return_value=pd.DataFrame()) - storage.save = Mock(return_value={"status": "success", "rows": 0}) - return storage +class TestGetDaily: + """测试日线数据 get 接口(按股票查询).""" + def 
test_get_daily_single_stock(self): + """测试 get_daily 获取单只股票数据.""" + from src.data.api_wrappers.api_daily import get_daily -@pytest.fixture -def mock_client(): - """Create a mock client instance.""" - return Mock(spec=TushareClient) + result = get_daily( + ts_code=TEST_STOCK_CODES[0], + start_date=TEST_START_DATE, + end_date=TEST_END_DATE, + ) + # 验证返回了数据 + assert isinstance(result, pd.DataFrame), "get_daily 应返回 DataFrame" + assert not result.empty, "get_daily 应返回非空数据" -class TestDateUtilities: - """Test date utility functions.""" + def test_get_daily_has_required_columns(self): + """测试 get_daily 返回的数据包含必要字段.""" + from src.data.api_wrappers.api_daily import get_daily - def test_get_today_date_format(self): - """Test today date is in YYYYMMDD format.""" - result = get_today_date() - assert len(result) == 8 - assert result.isdigit() + result = get_daily( + ts_code=TEST_STOCK_CODES[0], + start_date=TEST_START_DATE, + end_date=TEST_END_DATE, + ) - def test_get_next_date(self): - """Test getting next date.""" - result = get_next_date("20240101") - assert result == "20240102" + # 验证必要的列存在 + required_columns = ["ts_code", "trade_date", "open", "high", "low", "close"] + for col in required_columns: + assert col in result.columns, f"get_daily 返回应包含 {col} 列" - def test_get_next_date_year_end(self): - """Test getting next date across year boundary.""" - result = get_next_date("20241231") - assert result == "20250101" + def test_get_daily_multiple_stocks(self): + """测试 get_daily 获取多只股票数据.""" + from src.data.api_wrappers.api_daily import get_daily - def test_get_next_date_month_end(self): - """Test getting next date across month boundary.""" - result = get_next_date("20240131") - assert result == "20240201" - - -class TestDataSync: - """Test DataSync class functionality.""" - - def test_get_all_stock_codes_from_daily(self, mock_storage): - """Test getting stock codes from daily data.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - sync = 
DataSync() - sync.storage = mock_storage - - mock_storage.load.return_value = pd.DataFrame( - { - "ts_code": ["000001.SZ", "000001.SZ", "600000.SH"], - } + results = {} + for code in TEST_STOCK_CODES: + result = get_daily( + ts_code=code, + start_date=TEST_START_DATE, + end_date=TEST_END_DATE, ) - - codes = sync.get_all_stock_codes() - - assert len(codes) == 2 - assert "000001.SZ" in codes - assert "600000.SH" in codes - - def test_get_all_stock_codes_fallback(self, mock_storage): - """Test fallback to stock_basic when daily is empty.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - sync = DataSync() - sync.storage = mock_storage - - # First call (daily) returns empty, second call (stock_basic) returns data - mock_storage.load.side_effect = [ - pd.DataFrame(), # daily empty - pd.DataFrame({"ts_code": ["000001.SZ", "600000.SH"]}), # stock_basic - ] - - codes = sync.get_all_stock_codes() - - assert len(codes) == 2 - - def test_get_global_last_date(self, mock_storage): - """Test getting global last date.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - sync = DataSync() - sync.storage = mock_storage - - mock_storage.load.return_value = pd.DataFrame( - { - "ts_code": ["000001.SZ", "600000.SH"], - "trade_date": ["20240102", "20240103"], - } + results[code] = result + assert isinstance(result, pd.DataFrame), ( + f"get_daily({code}) 应返回 DataFrame" ) - - last_date = sync.get_global_last_date() - assert last_date == "20240103" - - def test_get_global_last_date_empty(self, mock_storage): - """Test getting last date from empty storage.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - sync = DataSync() - sync.storage = mock_storage - - mock_storage.load.return_value = pd.DataFrame() - - last_date = sync.get_global_last_date() - assert last_date is None - - def test_sync_single_stock(self, mock_storage): - """Test syncing a single stock.""" - with 
patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - with patch( - "src.data.sync.get_daily", - return_value=pd.DataFrame( - { - "ts_code": ["000001.SZ"], - "trade_date": ["20240102"], - } - ), - ): - sync = DataSync() - sync.storage = mock_storage - - result = sync.sync_single_stock( - ts_code="000001.SZ", - start_date="20240101", - end_date="20240102", - ) - - assert isinstance(result, pd.DataFrame) - assert len(result) == 1 - - def test_sync_single_stock_empty(self, mock_storage): - """Test syncing a stock with no data.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - with patch("src.data.sync.get_daily", return_value=pd.DataFrame()): - sync = DataSync() - sync.storage = mock_storage - - result = sync.sync_single_stock( - ts_code="INVALID.SZ", - start_date="20240101", - end_date="20240102", - ) - - assert result.empty + assert not result.empty, f"get_daily({code}) 应返回非空数据" -class TestSyncAll: - """Test sync_all function.""" +class TestSyncStockBasic: + """测试股票基础信息 sync 接口.""" - def test_full_sync_mode(self, mock_storage): - """Test full sync mode when force_full=True.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - with patch("src.data.sync.get_daily", return_value=pd.DataFrame()): - sync = DataSync() - sync.storage = mock_storage - sync.sync_single_stock = Mock(return_value=pd.DataFrame()) + def test_sync_all_stocks_returns_data(self): + """测试 sync_all_stocks 是否能正常返回数据.""" + from src.data.api_wrappers.api_stock_basic import sync_all_stocks - mock_storage.load.return_value = pd.DataFrame( - { - "ts_code": ["000001.SZ"], - } - ) + result = sync_all_stocks() - result = sync.sync_all(force_full=True) + # 验证返回了数据 + assert isinstance(result, pd.DataFrame), "sync_all_stocks 应返回 DataFrame" + assert not result.empty, "sync_all_stocks 应返回非空数据" - # Verify sync_single_stock was called with default start date - sync.sync_single_stock.assert_called_once() - call_args = 
sync.sync_single_stock.call_args - assert call_args[1]["start_date"] == DEFAULT_START_DATE + def test_sync_all_stocks_has_required_columns(self): + """测试 sync_all_stocks 返回的数据包含必要字段.""" + from src.data.api_wrappers.api_stock_basic import sync_all_stocks - def test_incremental_sync_mode(self, mock_storage): - """Test incremental sync mode when data exists.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - sync = DataSync() - sync.storage = mock_storage - sync.sync_single_stock = Mock(return_value=pd.DataFrame()) + result = sync_all_stocks() - # Mock existing data with last date - mock_storage.load.side_effect = [ - pd.DataFrame( - { - "ts_code": ["000001.SZ"], - "trade_date": ["20240102"], - } - ), # get_all_stock_codes - pd.DataFrame( - { - "ts_code": ["000001.SZ"], - "trade_date": ["20240102"], - } - ), # get_global_last_date - ] - - result = sync.sync_all(force_full=False) - - # Verify sync_single_stock was called with next date - sync.sync_single_stock.assert_called_once() - call_args = sync.sync_single_stock.call_args - assert call_args[1]["start_date"] == "20240103" - - def test_manual_start_date(self, mock_storage): - """Test sync with manual start date.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - sync = DataSync() - sync.storage = mock_storage - sync.sync_single_stock = Mock(return_value=pd.DataFrame()) - - mock_storage.load.return_value = pd.DataFrame( - { - "ts_code": ["000001.SZ"], - } - ) - - result = sync.sync_all(force_full=False, start_date="20230601") - - sync.sync_single_stock.assert_called_once() - call_args = sync.sync_single_stock.call_args - assert call_args[1]["start_date"] == "20230601" - - def test_no_stocks_found(self, mock_storage): - """Test sync when no stocks are found.""" - with patch("src.data.sync.ThreadSafeStorage", return_value=mock_storage): - sync = DataSync() - sync.storage = mock_storage - - mock_storage.load.return_value = pd.DataFrame() - - result = 
sync.sync_all() - - assert result == {} + # 验证必要的列存在 + required_columns = ["ts_code"] + for col in required_columns: + assert col in result.columns, f"sync_all_stocks 返回应包含 {col} 列" -class TestSyncAllConvenienceFunction: - """Test sync_all convenience function.""" +class TestSyncTradeCal: + """测试交易日历 sync 接口.""" - def test_sync_all_function(self): - """Test sync_all convenience function.""" - with patch("src.data.sync.DataSync") as MockSync: - mock_instance = Mock() - mock_instance.sync_all.return_value = {} - MockSync.return_value = mock_instance + def test_sync_trade_cal_cache_returns_data(self): + """测试 sync_trade_cal_cache 是否能正常返回数据.""" + from src.data.api_wrappers.api_trade_cal import sync_trade_cal_cache - result = sync_all(force_full=True) + result = sync_trade_cal_cache( + start_date=TEST_START_DATE, + end_date=TEST_END_DATE, + ) - MockSync.assert_called_once() - mock_instance.sync_all.assert_called_once_with( - force_full=True, - start_date=None, - end_date=None, - dry_run=False, - ) + # 验证返回了数据 + assert isinstance(result, pd.DataFrame), "sync_trade_cal_cache 应返回 DataFrame" + assert not result.empty, "sync_trade_cal_cache 应返回非空数据" + + def test_sync_trade_cal_cache_has_required_columns(self): + """测试 sync_trade_cal_cache 返回的数据包含必要字段.""" + from src.data.api_wrappers.api_trade_cal import sync_trade_cal_cache + + result = sync_trade_cal_cache( + start_date=TEST_START_DATE, + end_date=TEST_END_DATE, + ) + + # 验证必要的列存在 + required_columns = ["cal_date", "is_open"] + for col in required_columns: + assert col in result.columns, f"sync_trade_cal_cache 返回应包含 {col} 列" + + +class TestSyncNamechange: + """测试名称变更 sync 接口.""" + + def test_sync_namechange_returns_data(self): + """测试 sync_namechange 是否能正常返回数据.""" + from src.data.api_wrappers.api_namechange import sync_namechange + + result = sync_namechange() + + # 验证返回了数据(可能是空 DataFrame,因为是历史变更) + assert isinstance(result, pd.DataFrame), "sync_namechange 应返回 DataFrame" + + +class TestSyncBakBasic: + """测试备用股票基础信息 sync 
接口.""" + + def test_sync_bak_basic_returns_data(self): + """测试 sync_bak_basic 是否能正常返回数据.""" + from src.data.api_wrappers.api_bak_basic import sync_bak_basic + + result = sync_bak_basic( + start_date=TEST_START_DATE, + end_date=TEST_END_DATE, + ) + + # 验证返回了数据 + assert isinstance(result, pd.DataFrame), "sync_bak_basic 应返回 DataFrame" + # 注意:bak_basic 可能返回空数据,这是正常的 if __name__ == "__main__": diff --git a/tests/test_sync_real.py b/tests/test_sync_real.py deleted file mode 100644 index eacaf3f..0000000 --- a/tests/test_sync_real.py +++ /dev/null @@ -1,256 +0,0 @@ -"""Tests for data sync with REAL data (read-only). - -Tests verify: -1. get_global_last_date() correctly reads local data's max date -2. Incremental sync date calculation (local_last_date + 1) -3. Full sync date calculation (20180101) -4. Multi-stock scenario with real data - -⚠️ IMPORTANT: These tests ONLY read data, no write operations. -- NO sync_all() calls (writes daily.h5) -- NO check_sync_needed() calls (writes trade_cal.h5) -""" - -import pytest -import pandas as pd -from pathlib import Path - -from src.data.sync import ( - DataSync, - get_next_date, - DEFAULT_START_DATE, -) -from src.data.storage import Storage - - -class TestDataSyncReadOnly: - """Read-only tests for data sync - verify date calculation logic.""" - - @pytest.fixture - def storage(self): - """Create storage instance.""" - return Storage() - - @pytest.fixture - def data_sync(self): - """Create DataSync instance.""" - return DataSync() - - @pytest.fixture - def daily_exists(self, storage): - """Check if daily.h5 exists.""" - return storage.exists("daily") - - def test_daily_h5_exists(self, storage): - """Verify daily.h5 data file exists before running tests.""" - assert storage.exists("daily"), ( - "daily.h5 not found. 
Please run full sync first: " - "uv run python -c 'from src.data.sync import sync_all; sync_all(force_full=True)'" - ) - - def test_get_global_last_date(self, data_sync, daily_exists): - """Test get_global_last_date returns correct max date from local data.""" - if not daily_exists: - pytest.skip("daily.h5 not found") - - last_date = data_sync.get_global_last_date() - - # Verify it's a valid date string - assert last_date is not None, "get_global_last_date returned None" - assert isinstance(last_date, str), f"Expected str, got {type(last_date)}" - assert len(last_date) == 8, f"Expected 8-digit date, got {last_date}" - assert last_date.isdigit(), f"Expected numeric date, got {last_date}" - - # Verify by reading storage directly - daily_data = data_sync.storage.load("daily") - expected_max = str(daily_data["trade_date"].max()) - - assert last_date == expected_max, ( - f"get_global_last_date returned {last_date}, " - f"but actual max date is {expected_max}" - ) - - print(f"[TEST] Local data last date: {last_date}") - - def test_incremental_sync_date_calculation(self, data_sync, daily_exists): - """Test incremental sync: start_date = local_last_date + 1. - - This verifies that when local data exists, incremental sync should - fetch data from (local_last_date + 1), not from 20180101. 
- """ - if not daily_exists: - pytest.skip("daily.h5 not found") - - # Get local last date - local_last_date = data_sync.get_global_last_date() - assert local_last_date is not None, "No local data found" - - # Calculate expected incremental start date - expected_start_date = get_next_date(local_last_date) - - # Verify the calculation is correct - local_last_int = int(local_last_date) - expected_int = local_last_int + 1 - actual_int = int(expected_start_date) - - assert actual_int == expected_int, ( - f"Incremental start date calculation error: " - f"expected {expected_int}, got {actual_int}" - ) - - print( - f"[TEST] Incremental sync: local_last={local_last_date}, " - f"start_date should be {expected_start_date}" - ) - - # Verify this is NOT 20180101 (would be full sync) - assert expected_start_date != DEFAULT_START_DATE, ( - f"Incremental sync should NOT start from {DEFAULT_START_DATE}" - ) - - def test_full_sync_date_calculation(self): - """Test full sync: start_date = 20180101 when force_full=True. - - This verifies that force_full=True always starts from 20180101. - """ - # Full sync should always use DEFAULT_START_DATE - full_sync_start = DEFAULT_START_DATE - - assert full_sync_start == "20180101", ( - f"Full sync should start from 20180101, got {full_sync_start}" - ) - - print(f"[TEST] Full sync start date: {full_sync_start}") - - def test_date_comparison_logic(self, data_sync, daily_exists): - """Test date comparison: incremental vs full sync selection logic. 
- - Verify that: - - If local_last_date < today: incremental sync needed - - If local_last_date >= today: no sync needed - """ - if not daily_exists: - pytest.skip("daily.h5 not found") - - from datetime import datetime - - local_last_date = data_sync.get_global_last_date() - today = datetime.now().strftime("%Y%m%d") - - local_last_int = int(local_last_date) - today_int = int(today) - - # Log the comparison - print( - f"[TEST] Date comparison: local_last={local_last_date} ({local_last_int}), " - f"today={today} ({today_int})" - ) - - # This test just verifies the comparison logic works - if local_last_int < today_int: - print("[TEST] Local data is older than today - sync needed") - # Incremental sync should fetch from local_last_date + 1 - sync_start = get_next_date(local_last_date) - assert int(sync_start) > local_last_int, ( - "Sync start should be after local last" - ) - else: - print("[TEST] Local data is up-to-date - no sync needed") - - def test_get_all_stock_codes_real_data(self, data_sync, daily_exists): - """Test get_all_stock_codes returns multiple real stock codes.""" - if not daily_exists: - pytest.skip("daily.h5 not found") - - codes = data_sync.get_all_stock_codes() - - # Verify it's a list - assert isinstance(codes, list), f"Expected list, got {type(codes)}" - assert len(codes) > 0, "No stock codes found" - - # Verify multiple stocks - assert len(codes) >= 10, ( - f"Expected at least 10 stocks for multi-stock test, got {len(codes)}" - ) - - # Verify format (should be like 000001.SZ, 600000.SH) - sample_codes = codes[:5] - for code in sample_codes: - assert "." in code, f"Invalid stock code format: {code}" - suffix = code.split(".")[-1] - assert suffix in ["SZ", "SH"], f"Invalid exchange suffix: {suffix}" - - print(f"[TEST] Found {len(codes)} stock codes (sample: {sample_codes})") - - def test_multi_stock_date_range(self, data_sync, daily_exists): - """Test that multiple stocks share the same date range in local data. 
- - This verifies that local data has consistent date coverage across stocks. - """ - if not daily_exists: - pytest.skip("daily.h5 not found") - - daily_data = data_sync.storage.load("daily") - - # Get date range for each stock - stock_dates = daily_data.groupby("ts_code")["trade_date"].agg(["min", "max"]) - - # Get global min and max - global_min = str(daily_data["trade_date"].min()) - global_max = str(daily_data["trade_date"].max()) - - print(f"[TEST] Global date range: {global_min} to {global_max}") - print(f"[TEST] Total stocks: {len(stock_dates)}") - - # Verify we have data for multiple stocks - assert len(stock_dates) >= 10, ( - f"Expected at least 10 stocks, got {len(stock_dates)}" - ) - - # Verify date range is reasonable (at least 1 year of data) - global_min_int = int(global_min) - global_max_int = int(global_max) - days_span = global_max_int - global_min_int - - assert days_span > 100, ( - f"Date range too small: {days_span} days. " - f"Expected at least 100 days of data." - ) - - print(f"[TEST] Date span: {days_span} days") - - -class TestDateUtilities: - """Test date utility functions.""" - - def test_get_next_date(self): - """Test get_next_date correctly calculates next day.""" - # Test normal cases - assert get_next_date("20240101") == "20240102" - assert get_next_date("20240131") == "20240201" # Month boundary - assert get_next_date("20241231") == "20250101" # Year boundary - - def test_incremental_vs_full_sync_logic(self): - """Test the logic difference between incremental and full sync. 
- - Incremental: start_date = local_last_date + 1 - Full: start_date = 20180101 - """ - # Scenario 1: Local data exists - local_last_date = "20240115" - incremental_start = get_next_date(local_last_date) - - assert incremental_start == "20240116" - assert incremental_start != DEFAULT_START_DATE - - # Scenario 2: Force full sync - full_sync_start = DEFAULT_START_DATE # "20180101" - - assert full_sync_start == "20180101" - assert incremental_start != full_sync_start - - print("[TEST] Incremental vs Full sync logic verified") - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "-s"])