refactor: 重构 API 接口模块，整合为 api_wrappers 目录结构

- 将独立 API 模块 (daily, stock_basic, trade_cal) 整合至 api_wrappers/ - 重写 sync.py 使用新的 wrapper 结构，支持更多同步功能 - 更新测试文件适配新的模块结构 - 添加 pytest.ini 配置文件
2026-02-21 03:43:30 +08:00
parent e81d39ae0d
commit 9965ce5706
15 changed files with 1042 additions and 952 deletions
--- a/src/data/sync.py
+++ b/src/data/sync.py
@@ -5,11 +5,15 @@ This module provides data fetching functions with intelligent sync logic:
 - If local file exists: incremental update (fetch from latest date + 1 day)
 - Multi-threaded concurrent fetching for improved performance
 - Stop immediately on any exception
+- Preview mode: check data volume and samples before actual sync

 Currently supported data types:
 - daily: Daily market data (with turnover rate and volume ratio)

 Usage:
+    # Preview sync (check data volume and samples without writing)
+    preview_sync()
+
    # Sync all stocks (full load)
    sync_all()

@@ -18,6 +22,9 @@ Usage:

    # Force full reload
    sync_all(force_full=True)
+
+    # Dry run (preview only, no write)
+    sync_all(dry_run=True)
 """

 import pandas as pd
@@ -30,8 +37,8 @@ import sys

 from src.data.client import TushareClient
 from src.data.storage import Storage
-from src.data.daily import get_daily
-from src.data.trade_cal import (
+from src.data.api_wrappers import get_daily
+from src.data.api_wrappers import (
    get_first_trading_day,
    get_last_trading_day,
    sync_trade_cal_cache,
@@ -114,7 +121,8 @@ class DataSync:
            List of stock codes
        """
        # Import sync_all_stocks here to avoid circular imports
-        from src.data.stock_basic import sync_all_stocks, _get_csv_path
+        from src.data.api_wrappers import sync_all_stocks
+        from src.data.api_wrappers.api_stock_basic import _get_csv_path

        # First, ensure stock_basic.csv is up-to-date with all stocks
        print("[DataSync] Ensuring stock_basic.csv is up-to-date...")
@@ -278,6 +286,184 @@ class DataSync:
        print(f"[DataSync] Incremental sync needed from {sync_start} to {cal_last}")
        return (True, sync_start, cal_last, local_last_date)

+    def preview_sync(
+        self,
+        force_full: bool = False,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        sample_size: int = 3,
+    ) -> dict:
+        """Preview sync data volume and samples without actually syncing.
+
+        This method provides a preview of what would be synced, including:
+        - Number of stocks to be synced
+        - Date range for sync
+        - Estimated total records
+        - Sample data from first few stocks
+
+        Args:
+            force_full: If True, preview full sync from 20180101
+            start_date: Manual start date (overrides auto-detection)
+            end_date: Manual end date (defaults to today)
+            sample_size: Number of sample stocks to fetch for preview (default: 3)
+
+        Returns:
+            Dictionary with preview information:
+            {
+                'sync_needed': bool,
+                'stock_count': int,
+                'start_date': str,
+                'end_date': str,
+                'estimated_records': int,
+                'sample_data': pd.DataFrame,
+                'mode': str,  # 'full' or 'incremental'
+            }
+        """
+        print("\n" + "=" * 60)
+        print("[DataSync] Preview Mode - Analyzing sync requirements...")
+        print("=" * 60)
+
+        # First, ensure trade calendar cache is up-to-date
+        print("[DataSync] Syncing trade calendar cache...")
+        sync_trade_cal_cache()
+
+        # Determine date range
+        if end_date is None:
+            end_date = get_today_date()
+
+        # Check if sync is needed
+        sync_needed, cal_start, cal_end, local_last = self.check_sync_needed(force_full)
+
+        if not sync_needed:
+            print("\n" + "=" * 60)
+            print("[DataSync] Preview Result")
+            print("=" * 60)
+            print("  Sync Status: NOT NEEDED")
+            print("  Reason: Local data is up-to-date with trade calendar")
+            print("=" * 60)
+            return {
+                "sync_needed": False,
+                "stock_count": 0,
+                "start_date": None,
+                "end_date": None,
+                "estimated_records": 0,
+                "sample_data": pd.DataFrame(),
+                "mode": "none",
+            }
+
+        # Use dates from check_sync_needed
+        if cal_start and cal_end:
+            sync_start_date = cal_start
+            end_date = cal_end
+        else:
+            sync_start_date = start_date or DEFAULT_START_DATE
+            if end_date is None:
+                end_date = get_today_date()
+
+        # Determine sync mode
+        if force_full:
+            mode = "full"
+            print(f"[DataSync] Mode: FULL SYNC from {sync_start_date} to {end_date}")
+        elif local_last and cal_start and sync_start_date == get_next_date(local_last):
+            mode = "incremental"
+            print(f"[DataSync] Mode: INCREMENTAL SYNC (bandwidth optimized)")
+            print(f"[DataSync] Sync from: {sync_start_date} to {end_date}")
+        else:
+            mode = "partial"
+            print(f"[DataSync] Mode: SYNC from {sync_start_date} to {end_date}")
+
+        # Get all stock codes
+        stock_codes = self.get_all_stock_codes()
+        if not stock_codes:
+            print("[DataSync] No stocks found to sync")
+            return {
+                "sync_needed": False,
+                "stock_count": 0,
+                "start_date": None,
+                "end_date": None,
+                "estimated_records": 0,
+                "sample_data": pd.DataFrame(),
+                "mode": "none",
+            }
+
+        stock_count = len(stock_codes)
+        print(f"[DataSync] Total stocks to sync: {stock_count}")
+
+        # Fetch sample data from first few stocks
+        print(f"[DataSync] Fetching sample data from {sample_size} stocks...")
+        sample_data_list = []
+        sample_codes = stock_codes[:sample_size]
+
+        for ts_code in sample_codes:
+            try:
+                data = self.client.query(
+                    "pro_bar",
+                    ts_code=ts_code,
+                    start_date=sync_start_date,
+                    end_date=end_date,
+                    factors="tor,vr",
+                )
+                if not data.empty:
+                    sample_data_list.append(data)
+                    print(f"  - {ts_code}: {len(data)} records")
+            except Exception as e:
+                print(f"  - {ts_code}: Error fetching - {e}")
+
+        # Combine sample data
+        sample_df = (
+            pd.concat(sample_data_list, ignore_index=True)
+            if sample_data_list
+            else pd.DataFrame()
+        )
+
+        # Estimate total records based on sample
+        if not sample_df.empty:
+            avg_records_per_stock = len(sample_df) / len(sample_data_list)
+            estimated_records = int(avg_records_per_stock * stock_count)
+        else:
+            estimated_records = 0
+
+        # Display preview results
+        print("\n" + "=" * 60)
+        print("[DataSync] Preview Result")
+        print("=" * 60)
+        print(f"  Sync Mode: {mode.upper()}")
+        print(f"  Date Range: {sync_start_date} to {end_date}")
+        print(f"  Stocks to Sync: {stock_count}")
+        print(f"  Sample Stocks Checked: {len(sample_data_list)}/{sample_size}")
+        print(f"  Estimated Total Records: ~{estimated_records:,}")
+
+        if not sample_df.empty:
+            print(f"\n  Sample Data Preview (first {len(sample_df)} rows):")
+            print("  " + "-" * 56)
+            # Display sample data in a compact format
+            preview_cols = [
+                "ts_code",
+                "trade_date",
+                "open",
+                "high",
+                "low",
+                "close",
+                "vol",
+            ]
+            available_cols = [c for c in preview_cols if c in sample_df.columns]
+            sample_display = sample_df[available_cols].head(10)
+            for idx, row in sample_display.iterrows():
+                print(f"    {row.to_dict()}")
+            print("  " + "-" * 56)
+
+        print("=" * 60)
+
+        return {
+            "sync_needed": True,
+            "stock_count": stock_count,
+            "start_date": sync_start_date,
+            "end_date": end_date,
+            "estimated_records": estimated_records,
+            "sample_data": sample_df,
+            "mode": mode,
+        }
+
    def sync_single_stock(
        self,
        ts_code: str,
@@ -320,6 +506,7 @@ class DataSync:
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        max_workers: Optional[int] = None,
+        dry_run: bool = False,
    ) -> Dict[str, pd.DataFrame]:
        """Sync daily data for all stocks in local storage.

@@ -337,9 +524,10 @@ class DataSync:
            start_date: Manual start date (overrides auto-detection)
            end_date: Manual end date (defaults to today)
            max_workers: Number of worker threads (default: 10)
+            dry_run: If True, only preview what would be synced without writing data

        Returns:
-            Dict mapping ts_code to DataFrame (empty if sync skipped)
+            Dict mapping ts_code to DataFrame (empty if sync skipped or dry_run)
        """
        print("\n" + "=" * 60)
        print("[DataSync] Starting daily data sync...")
@@ -378,11 +566,14 @@ class DataSync:

        # Determine sync mode
        if force_full:
+            mode = "full"
            print(f"[DataSync] Mode: FULL SYNC from {sync_start_date} to {end_date}")
        elif local_last and cal_start and sync_start_date == get_next_date(local_last):
+            mode = "incremental"
            print(f"[DataSync] Mode: INCREMENTAL SYNC (bandwidth optimized)")
            print(f"[DataSync] Sync from: {sync_start_date} to {end_date}")
        else:
+            mode = "partial"
            print(f"[DataSync] Mode: SYNC from {sync_start_date} to {end_date}")

        # Get all stock codes
@@ -394,6 +585,17 @@ class DataSync:
        print(f"[DataSync] Total stocks to sync: {len(stock_codes)}")
        print(f"[DataSync] Using {max_workers or self.max_workers} worker threads")

+        # Handle dry run mode
+        if dry_run:
+            print("\n" + "=" * 60)
+            print("[DataSync] DRY RUN MODE - No data will be written")
+            print("=" * 60)
+            print(f"  Would sync {len(stock_codes)} stocks")
+            print(f"  Date range: {sync_start_date} to {end_date}")
+            print(f"  Mode: {mode}")
+            print("=" * 60)
+            return {}
+
        # Reset stop flag for new sync
        self._stop_flag.set()

@@ -492,11 +694,62 @@ class DataSync:
 # Convenience functions


+def preview_sync(
+    force_full: bool = False,
+    start_date: Optional[str] = None,
+    end_date: Optional[str] = None,
+    sample_size: int = 3,
+    max_workers: Optional[int] = None,
+) -> dict:
+    """Preview sync data volume and samples without actually syncing.
+
+    This is the recommended way to check what would be synced before
+    running the actual synchronization.
+
+    Args:
+        force_full: If True, preview full sync from 20180101
+        start_date: Manual start date (overrides auto-detection)
+        end_date: Manual end date (defaults to today)
+        sample_size: Number of sample stocks to fetch for preview (default: 3)
+        max_workers: Number of worker threads (not used in preview, for API compatibility)
+
+    Returns:
+        Dictionary with preview information:
+        {
+            'sync_needed': bool,
+            'stock_count': int,
+            'start_date': str,
+            'end_date': str,
+            'estimated_records': int,
+            'sample_data': pd.DataFrame,
+            'mode': str,  # 'full', 'incremental', 'partial', or 'none'
+        }
+
+    Example:
+        >>> # Preview what would be synced
+        >>> preview = preview_sync()
+        >>>
+        >>> # Preview full sync
+        >>> preview = preview_sync(force_full=True)
+        >>>
+        >>> # Preview with more samples
+        >>> preview = preview_sync(sample_size=5)
+    """
+    sync_manager = DataSync(max_workers=max_workers)
+    return sync_manager.preview_sync(
+        force_full=force_full,
+        start_date=start_date,
+        end_date=end_date,
+        sample_size=sample_size,
+    )
+
+
 def sync_all(
    force_full: bool = False,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    max_workers: Optional[int] = None,
+    dry_run: bool = False,
 ) -> Dict[str, pd.DataFrame]:
    """Sync daily data for all stocks.

@@ -507,6 +760,7 @@ def sync_all(
        start_date: Manual start date (YYYYMMDD)
        end_date: Manual end date (defaults to today)
        max_workers: Number of worker threads (default: 10)
+        dry_run: If True, only preview what would be synced without writing data

    Returns:
        Dict mapping ts_code to DataFrame
@@ -526,12 +780,16 @@ def sync_all(
        >>>
        >>> # Custom thread count
        >>> result = sync_all(max_workers=20)
+        >>>
+        >>> # Dry run (preview only)
+        >>> result = sync_all(dry_run=True)
    """
    sync_manager = DataSync(max_workers=max_workers)
    return sync_manager.sync_all(
        force_full=force_full,
        start_date=start_date,
        end_date=end_date,
+        dry_run=dry_run,
    )


@@ -540,11 +798,32 @@ if __name__ == "__main__":
    print("Data Sync Module")
    print("=" * 60)
    print("\nUsage:")
-    print("  from src.data.sync import sync_all")
+    print("  from src.data.sync import sync_all, preview_sync")
+    print("")
+    print("  # Preview before sync (recommended)")
+    print("  preview = preview_sync()")
+    print("")
+    print("  # Dry run (preview only)")
+    print("  result = sync_all(dry_run=True)")
+    print("")
+    print("  # Actual sync")
    print("  result = sync_all()  # Incremental sync")
    print("  result = sync_all(force_full=True)  # Full reload")
    print("\n" + "=" * 60)

-    # Run sync
-    result = sync_all()
-    print(f"\nSynced {len(result)} stocks")
+    # Run preview first
+    print("\n[Main] Running preview first...")
+    preview = preview_sync()
+
+    if preview["sync_needed"]:
+        # Ask for confirmation
+        print("\n" + "=" * 60)
+        response = input("Proceed with sync? (y/n): ").strip().lower()
+        if response in ("y", "yes"):
+            print("\n[Main] Starting actual sync...")
+            result = sync_all()
+            print(f"\nSynced {len(result)} stocks")
+        else:
+            print("\n[Main] Sync cancelled by user")
+    else:
+        print("\n[Main] No sync needed - data is up to date")