refactor: 调整项目结构,新增数据同步和交易日历模块
- 移除 pyproject.toml,改用 uv 管理项目 - 新增 data/* 忽略规则 - 新增数据同步模块 sync.py - 新增交易日历模块 trade_cal.py - 新增相关测试用例 - 更新 API 文档
This commit is contained in:
190
tests/test_daily_storage.py
Normal file
190
tests/test_daily_storage.py
Normal file
@@ -0,0 +1,190 @@
|
||||
"""Tests for data/daily.h5 storage validation.
|
||||
|
||||
Validates two key points:
|
||||
1. All stocks from stock_basic.csv are saved in daily.h5
|
||||
2. No abnormal data with very few data points (< 10 rows per stock)
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from src.data.storage import Storage
|
||||
from src.data.stock_basic import _get_csv_path
|
||||
|
||||
|
||||
class TestDailyStorageValidation:
    """Test daily.h5 storage integrity and completeness."""

    @pytest.fixture
    def storage(self):
        """Create the Storage instance used to locate and read daily.h5."""
        return Storage()

    @pytest.fixture
    def stock_basic_df(self):
        """Load the stock universe from stock_basic.csv, skipping if absent."""
        csv_path = _get_csv_path()
        if not csv_path.exists():
            pytest.skip(f"stock_basic.csv not found at {csv_path}")
        return pd.read_csv(csv_path)

    @pytest.fixture
    def daily_df(self, storage):
        """Load daily bar data from HDF5, skipping if missing or unreadable."""
        if not storage.exists("daily"):
            pytest.skip("daily.h5 not found")
        # HDF5 stores keys with a leading slash, so handle both '/daily' and 'daily'.
        file_path = storage._get_file_path("daily")
        try:
            with pd.HDFStore(file_path, mode="r") as store:
                keys = store.keys()  # fetch once instead of re-scanning per check
                if "/daily" in keys:
                    return store["/daily"]
                if "daily" in keys:
                    return store["daily"]
                return pd.DataFrame()
        except Exception as e:
            pytest.skip(f"Error loading daily.h5: {e}")

    def test_all_stocks_saved(self, stock_basic_df, daily_df):
        """Verify all stocks from stock_basic are saved in daily.h5.

        This test ensures data completeness - every stock in stock_basic
        should have corresponding data in daily.h5.
        """
        if daily_df.empty:
            pytest.fail("daily.h5 is empty")

        # Get unique stock codes from both sources
        expected_codes = set(stock_basic_df["ts_code"].dropna().unique())
        actual_codes = set(daily_df["ts_code"].dropna().unique())

        # Check for missing stocks
        missing_codes = expected_codes - actual_codes

        if missing_codes:
            missing_list = sorted(missing_codes)
            # Show first 20 missing stocks as a sample
            sample = missing_list[:20]
            msg = f"Found {len(missing_codes)} stocks missing from daily.h5:\n"
            msg += f"Sample missing: {sample}\n"
            if len(missing_list) > 20:
                msg += f"... and {len(missing_list) - 20} more"
            pytest.fail(msg)

        # All stocks present
        assert len(actual_codes) > 0, "No stocks found in daily.h5"
        print(
            f"[TEST] All {len(expected_codes)} stocks from stock_basic are present in daily.h5"
        )

    def test_no_stock_with_insufficient_data(self, daily_df):
        """Verify no stock has abnormally few data points (< 10 rows).

        Stocks with very few data points may indicate sync failures,
        delisted stocks not properly handled, or data corruption.
        """
        if daily_df.empty:
            pytest.fail("daily.h5 is empty")

        # Count rows per stock. Note: groupby().size() only yields groups that
        # actually occur in the data, so every count is >= 1 - a separate
        # "0 rows" bucket is impossible here and is intentionally not reported.
        stock_counts = daily_df.groupby("ts_code").size()

        # Find stocks with fewer than 10 data points
        insufficient_stocks = stock_counts[stock_counts < 10]

        if not insufficient_stocks.empty:
            msg = f"Found {len(insufficient_stocks)} stocks with insufficient data (< 10 rows):\n"
            # Show the worst offenders with their row counts
            sample = insufficient_stocks.sort_values().head(20)
            msg += "Sample (ts_code: count):\n"
            for code, count in sample.items():
                msg += f"  {code}: {count} rows\n"
            pytest.fail(msg)

        print("[TEST] All stocks have sufficient data (>= 10 rows)")

    def test_data_integrity_basic(self, daily_df):
        """Basic data integrity checks for daily.h5."""
        if daily_df.empty:
            pytest.fail("daily.h5 is empty")

        # Check required columns exist
        required_columns = ["ts_code", "trade_date"]
        missing_columns = [
            col for col in required_columns if col not in daily_df.columns
        ]

        if missing_columns:
            pytest.fail(f"Missing required columns: {missing_columns}")

        # Check for null values in key columns
        null_ts_code = daily_df["ts_code"].isna().sum()
        null_trade_date = daily_df["trade_date"].isna().sum()

        if null_ts_code > 0:
            pytest.fail(f"Found {null_ts_code} rows with null ts_code")
        if null_trade_date > 0:
            pytest.fail(f"Found {null_trade_date} rows with null trade_date")

        print("[TEST] Data integrity check passed")

    def test_stock_data_coverage_report(self, daily_df):
        """Generate a summary report of stock data coverage.

        This test provides visibility into data distribution without failing.
        """
        if daily_df.empty:
            pytest.skip("daily.h5 is empty - cannot generate report")

        stock_counts = daily_df.groupby("ts_code").size()

        # Calculate statistics
        total_stocks = len(stock_counts)
        min_count = stock_counts.min()
        max_count = stock_counts.max()
        median_count = stock_counts.median()
        mean_count = stock_counts.mean()

        # Distribution buckets
        very_low = (stock_counts < 10).sum()
        low = ((stock_counts >= 10) & (stock_counts < 100)).sum()
        medium = ((stock_counts >= 100) & (stock_counts < 500)).sum()
        high = (stock_counts >= 500).sum()

        report = f"""
=== Stock Data Coverage Report ===
Total stocks: {total_stocks}
Data points per stock:
  Min: {min_count}
  Max: {max_count}
  Median: {median_count:.0f}
  Mean: {mean_count:.1f}

Distribution:
  < 10 rows: {very_low} stocks ({very_low / total_stocks * 100:.1f}%)
  10-99: {low} stocks ({low / total_stocks * 100:.1f}%)
  100-499: {medium} stocks ({medium / total_stocks * 100:.1f}%)
  >= 500: {high} stocks ({high / total_stocks * 100:.1f}%)
"""
        print(report)

        # This is an informational test - it should not fail.
        # We assert only that data exists so the test is marked as passed.
        assert total_stocks > 0
if __name__ == "__main__":
    # Allow running this test module directly: -v for verbose test names,
    # -s so the coverage-report print() output is not captured by pytest.
    pytest.main([__file__, "-v", "-s"])
20
tests/test_tushare_api.py
Normal file
20
tests/test_tushare_api.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Tushare API 验证脚本 - 快速生成 pro 对象用于调试。"""
|
||||
|
||||
import os
|
||||
|
||||
os.environ.setdefault("DATA_PATH", "data")
|
||||
|
||||
from src.data.config import get_config
|
||||
import tushare as ts
|
||||
|
||||
config = get_config()
|
||||
token = config.tushare_token
|
||||
|
||||
if not token:
|
||||
raise ValueError("请在 config/.env.local 中配置 TUSHARE_TOKEN")
|
||||
|
||||
pro = ts.pro_api(token)
|
||||
print(f"pro_api 对象已创建,token: {token[:10]}...")
|
||||
|
||||
df = pro.query('daily', ts_code='000001.SZ', start_date='20180702', end_date='20180718')
|
||||
print(df)
|
||||
Reference in New Issue
Block a user