refactor: 存储层迁移DuckDB + 模块重构

- 存储层重构: HDF5 → DuckDB（UPSERT模式、线程安全存储）
- Sync类迁移: DataSync从sync.py迁移到api_daily.py（职责分离）
- 模型模块重构: src/models → src/pipeline（更清晰的命名）
- 新增因子模块: factors/momentum（MA、收益率排名）、factors/financial
- 新增API接口: api_namechange、api_bak_basic
- 新增训练入口: training模块（main.py、pipeline配置）
- 工具函数统一: get_today_date等移至utils.py
- 文档更新: AGENTS.md添加架构变更历史
2026-02-23 16:23:53 +08:00
parent 9f95be56a0
commit 593ec99466
32 changed files with 4181 additions and 1395 deletions
--- a/src/data/sync.py
+++ b/src/data/sync.py
@@ -1,701 +1,34 @@
-"""Data synchronization module.
+"""数据同步调度中心模块。

-This module provides data fetching functions with intelligent sync logic:
- If local file doesn't exist: fetch all data (full load from 20180101)
- If local file exists: incremental update (fetch from latest date + 1 day)
- Multi-threaded concurrent fetching for improved performance
- Stop immediately on any exception
- Preview mode: check data volume and samples before actual sync
+该模块作为数据同步的调度中心，统一管理各类型数据的同步流程。
+具体的同步逻辑已迁移到对应的 api_xxx.py 文件中：
+- api_daily.py: 日线数据同步 (DailySync 类)
+- api_bak_basic.py: 历史股票列表同步
+- api_stock_basic.py: 股票基本信息同步
+- api_trade_cal.py: 交易日历同步

-Currently supported data types:
- daily: Daily market data (with turnover rate and volume ratio)
+注意：名称变更 (namechange) 已从自动同步中移除，
+因为股票名称变更不频繁，建议手动定期同步。

-Usage:
-    # Preview sync (check data volume and samples without writing)
-    preview_sync()
+使用方式：
+    # 预览同步（检查数据量，不写入）
+    from src.data.sync import preview_sync
+    preview = preview_sync()

-    # Sync all stocks (full load)
-    sync_all()
+    # 同步所有数据（不包括 namechange）
+    from src.data.sync import sync_all_data
+    result = sync_all_data()

-    # Sync all stocks (incremental)
-    sync_all()
-
-    # Force full reload
-    sync_all(force_full=True)
-
-    # Dry run (preview only, no write)
-    sync_all(dry_run=True)
+    # 强制全量重载
+    result = sync_all_data(force_full=True)
 """

+from typing import Optional, Dict
+
 import pandas as pd
-from typing import Optional, Dict, Callable
-from datetime import datetime, timedelta
-from tqdm import tqdm
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
-import sys

-from src.data.client import TushareClient
-from src.data.storage import ThreadSafeStorage
-from src.data.api_wrappers import get_daily
-from src.data.api_wrappers import (
-    get_first_trading_day,
-    get_last_trading_day,
-    sync_trade_cal_cache,
-)
-
-
-# Default full sync start date
-DEFAULT_START_DATE = "20180101"
-
-# Today's date in YYYYMMDD format
-TODAY = datetime.now().strftime("%Y%m%d")
-
-
-def get_today_date() -> str:
-    """Get today's date in YYYYMMDD format."""
-    return TODAY
-
-
-def get_next_date(date_str: str) -> str:
-    """Get the next day after the given date.
-
-    Args:
-        date_str: Date in YYYYMMDD format
-
-    Returns:
-        Next date in YYYYMMDD format
-    """
-    dt = datetime.strptime(date_str, "%Y%m%d")
-    next_dt = dt + timedelta(days=1)
-    return next_dt.strftime("%Y%m%d")
-
-
-class DataSync:
-    """Data synchronization manager with full/incremental sync support."""
-
-    # Default number of worker threads
-    DEFAULT_MAX_WORKERS = 10
-
-    def __init__(self, max_workers: Optional[int] = None):
-        """Initialize sync manager.
-
-        Args:
-            max_workers: Number of worker threads (default: 10)
-        """
-        self.storage = ThreadSafeStorage()
-        self.client = TushareClient()
-        self.max_workers = max_workers or self.DEFAULT_MAX_WORKERS
-        self._stop_flag = threading.Event()
-        self._stop_flag.set()  # Initially not stopped
-        self._cached_daily_data: Optional[pd.DataFrame] = None  # Cache for daily data
-
-    def _load_daily_data(self) -> pd.DataFrame:
-        """Load daily data from storage with caching.
-
-        This method caches the daily data in memory to avoid repeated disk reads.
-        Call clear_cache() to force reload.
-
-        Returns:
-            DataFrame with daily data (cached or loaded from storage)
-        """
-        if self._cached_daily_data is None:
-            self._cached_daily_data = self.storage.load("daily")
-        return self._cached_daily_data
-
-    def clear_cache(self) -> None:
-        """Clear the cached daily data to force reload on next access."""
-        self._cached_daily_data = None
-
-    def get_all_stock_codes(self, only_listed: bool = True) -> list:
-        """Get all stock codes from local storage.
-
-        This function prioritizes stock_basic.csv to ensure all stocks
-        are included for backtesting to avoid look-ahead bias.
-
-        Args:
-            only_listed: If True, only return currently listed stocks (L status).
-                        Set to False to include delisted stocks (for full backtest).
-
-        Returns:
-            List of stock codes
-        """
-        # Import sync_all_stocks here to avoid circular imports
-        from src.data.api_wrappers import sync_all_stocks
-        from src.data.api_wrappers.api_stock_basic import _get_csv_path
-
-        # First, ensure stock_basic.csv is up-to-date with all stocks
-        print("[DataSync] Ensuring stock_basic.csv is up-to-date...")
-        sync_all_stocks()
-
-        # Get from stock_basic.csv file
-        stock_csv_path = _get_csv_path()
-
-        if stock_csv_path.exists():
-            print(f"[DataSync] Reading stock_basic from CSV: {stock_csv_path}")
-            try:
-                stock_df = pd.read_csv(stock_csv_path, encoding="utf-8-sig")
-                if not stock_df.empty and "ts_code" in stock_df.columns:
-                    # Filter by list_status if only_listed is True
-                    if only_listed and "list_status" in stock_df.columns:
-                        listed_stocks = stock_df[stock_df["list_status"] == "L"]
-                        codes = listed_stocks["ts_code"].unique().tolist()
-                        total = len(stock_df["ts_code"].unique())
-                        print(
-                            f"[DataSync] Found {len(codes)} listed stocks (filtered from {total} total)"
-                        )
-                    else:
-                        codes = stock_df["ts_code"].unique().tolist()
-                        print(
-                            f"[DataSync] Found {len(codes)} stock codes from stock_basic.csv"
-                        )
-                    return codes
-                else:
-                    print(
-                        f"[DataSync] stock_basic.csv exists but no ts_code column or empty"
-                    )
-            except Exception as e:
-                print(f"[DataSync] Error reading stock_basic.csv: {e}")
-
-        # Fallback: try daily storage if stock_basic not available (using cached data)
-        print("[DataSync] stock_basic.csv not available, falling back to daily data...")
-        daily_data = self._load_daily_data()
-        if not daily_data.empty and "ts_code" in daily_data.columns:
-            codes = daily_data["ts_code"].unique().tolist()
-            print(f"[DataSync] Found {len(codes)} stock codes from daily data")
-            return codes
-
-        print("[DataSync] No stock codes found in local storage")
-        return []
-
-    def get_global_last_date(self) -> Optional[str]:
-        """Get the global last trade date across all stocks.
-
-        Returns:
-            Last trade date string or None
-        """
-        daily_data = self._load_daily_data()
-        if daily_data.empty or "trade_date" not in daily_data.columns:
-            return None
-        return str(daily_data["trade_date"].max())
-
-    def get_global_first_date(self) -> Optional[str]:
-        """Get the global first trade date across all stocks.
-
-        Returns:
-            First trade date string or None
-        """
-        daily_data = self._load_daily_data()
-        if daily_data.empty or "trade_date" not in daily_data.columns:
-            return None
-        return str(daily_data["trade_date"].min())
-
-    def get_trade_calendar_bounds(
-        self, start_date: str, end_date: str
-    ) -> tuple[Optional[str], Optional[str]]:
-        """Get the first and last trading day from trade calendar.
-
-        Args:
-            start_date: Start date in YYYYMMDD format
-            end_date: End date in YYYYMMDD format
-
-        Returns:
-            Tuple of (first_trading_day, last_trading_day) or (None, None) if error
-        """
-        try:
-            first_day = get_first_trading_day(start_date, end_date)
-            last_day = get_last_trading_day(start_date, end_date)
-            return (first_day, last_day)
-        except Exception as e:
-            print(f"[ERROR] Failed to get trade calendar bounds: {e}")
-            return (None, None)
-
-    def check_sync_needed(
-        self, force_full: bool = False
-    ) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:
-        """Check if sync is needed based on trade calendar.
-
-        This method compares local data date range with trade calendar
-        to determine if new data needs to be fetched.
-
-        Logic:
-        - If force_full: sync needed, return (True, 20180101, today)
-        - If no local data: sync needed, return (True, 20180101, today)
-        - If local data exists:
-            - Get the last trading day from trade calendar
-            - If local last date >= calendar last date: NO sync needed
-            - Otherwise: sync needed from local_last_date + 1 to latest trade day
-
-        Args:
-            force_full: If True, always return sync needed
-
-        Returns:
-            Tuple of (sync_needed, start_date, end_date, local_last_date)
-            - sync_needed: True if sync should proceed, False to skip
-            - start_date: Sync start date (None if sync not needed)
-            - end_date: Sync end date (None if sync not needed)
-            - local_last_date: Local data last date (for incremental sync)
-        """
-        # If force_full, always sync
-        if force_full:
-            print("[DataSync] Force full sync requested")
-            return (True, DEFAULT_START_DATE, get_today_date(), None)
-
-        # Check if local data exists (using cached data)
-        daily_data = self._load_daily_data()
-        if daily_data.empty or "trade_date" not in daily_data.columns:
-            print("[DataSync] No local data found, full sync needed")
-            return (True, DEFAULT_START_DATE, get_today_date(), None)
-
-        # Get local data last date (we only care about the latest date, not the first)
-        local_last_date = str(daily_data["trade_date"].max())
-
-        print(f"[DataSync] Local data last date: {local_last_date}")
-
-        # Get the latest trading day from trade calendar
-        today = get_today_date()
-        _, cal_last = self.get_trade_calendar_bounds(DEFAULT_START_DATE, today)
-
-        if cal_last is None:
-            print("[DataSync] Failed to get trade calendar, proceeding with sync")
-            return (True, DEFAULT_START_DATE, today, local_last_date)
-
-        print(f"[DataSync] Calendar last trading day: {cal_last}")
-
-        # Compare local last date with calendar last date
-        # If local data is already up-to-date or newer, no sync needed
-        print(
-            f"[DataSync] Comparing: local={local_last_date} (type={type(local_last_date).__name__}), cal={cal_last} (type={type(cal_last).__name__})"
-        )
-        try:
-            local_last_int = int(local_last_date)
-            cal_last_int = int(cal_last)
-            print(
-                f"[DataSync] Comparing integers: local={local_last_int} >= cal={cal_last_int} = {local_last_int >= cal_last_int}"
-            )
-            if local_last_int >= cal_last_int:
-                print(
-                    "[DataSync] Local data is up-to-date, SKIPPING sync (no tokens consumed)"
-                )
-                return (False, None, None, None)
-        except (ValueError, TypeError) as e:
-            print(f"[ERROR] Date comparison failed: {e}")
-
-        # Need to sync from local_last_date + 1 to latest trade day
-        sync_start = get_next_date(local_last_date)
-        print(f"[DataSync] Incremental sync needed from {sync_start} to {cal_last}")
-        return (True, sync_start, cal_last, local_last_date)
-
-    def preview_sync(
-        self,
-        force_full: bool = False,
-        start_date: Optional[str] = None,
-        end_date: Optional[str] = None,
-        sample_size: int = 3,
-    ) -> dict:
-        """Preview sync data volume and samples without actually syncing.
-
-        This method provides a preview of what would be synced, including:
-        - Number of stocks to be synced
-        - Date range for sync
-        - Estimated total records
-        - Sample data from first few stocks
-
-        Args:
-            force_full: If True, preview full sync from 20180101
-            start_date: Manual start date (overrides auto-detection)
-            end_date: Manual end date (defaults to today)
-            sample_size: Number of sample stocks to fetch for preview (default: 3)
-
-        Returns:
-            Dictionary with preview information:
-            {
-                'sync_needed': bool,
-                'stock_count': int,
-                'start_date': str,
-                'end_date': str,
-                'estimated_records': int,
-                'sample_data': pd.DataFrame,
-                'mode': str,  # 'full' or 'incremental'
-            }
-        """
-        print("\n" + "=" * 60)
-        print("[DataSync] Preview Mode - Analyzing sync requirements...")
-        print("=" * 60)
-
-        # First, ensure trade calendar cache is up-to-date
-        print("[DataSync] Syncing trade calendar cache...")
-        sync_trade_cal_cache()
-
-        # Determine date range
-        if end_date is None:
-            end_date = get_today_date()
-
-        # Check if sync is needed
-        sync_needed, cal_start, cal_end, local_last = self.check_sync_needed(force_full)
-
-        if not sync_needed:
-            print("\n" + "=" * 60)
-            print("[DataSync] Preview Result")
-            print("=" * 60)
-            print("  Sync Status: NOT NEEDED")
-            print("  Reason: Local data is up-to-date with trade calendar")
-            print("=" * 60)
-            return {
-                "sync_needed": False,
-                "stock_count": 0,
-                "start_date": None,
-                "end_date": None,
-                "estimated_records": 0,
-                "sample_data": pd.DataFrame(),
-                "mode": "none",
-            }
-
-        # Use dates from check_sync_needed
-        if cal_start and cal_end:
-            sync_start_date = cal_start
-            end_date = cal_end
-        else:
-            sync_start_date = start_date or DEFAULT_START_DATE
-            if end_date is None:
-                end_date = get_today_date()
-
-        # Determine sync mode
-        if force_full:
-            mode = "full"
-            print(f"[DataSync] Mode: FULL SYNC from {sync_start_date} to {end_date}")
-        elif local_last and cal_start and sync_start_date == get_next_date(local_last):
-            mode = "incremental"
-            print(f"[DataSync] Mode: INCREMENTAL SYNC (bandwidth optimized)")
-            print(f"[DataSync] Sync from: {sync_start_date} to {end_date}")
-        else:
-            mode = "partial"
-            print(f"[DataSync] Mode: SYNC from {sync_start_date} to {end_date}")
-
-        # Get all stock codes
-        stock_codes = self.get_all_stock_codes()
-        if not stock_codes:
-            print("[DataSync] No stocks found to sync")
-            return {
-                "sync_needed": False,
-                "stock_count": 0,
-                "start_date": None,
-                "end_date": None,
-                "estimated_records": 0,
-                "sample_data": pd.DataFrame(),
-                "mode": "none",
-            }
-
-        stock_count = len(stock_codes)
-        print(f"[DataSync] Total stocks to sync: {stock_count}")
-
-        # Fetch sample data from first few stocks
-        print(f"[DataSync] Fetching sample data from {sample_size} stocks...")
-        sample_data_list = []
-        sample_codes = stock_codes[:sample_size]
-
-        for ts_code in sample_codes:
-            try:
-                data = self.client.query(
-                    "pro_bar",
-                    ts_code=ts_code,
-                    start_date=sync_start_date,
-                    end_date=end_date,
-                    factors="tor,vr",
-                )
-                if not data.empty:
-                    sample_data_list.append(data)
-                    print(f"  - {ts_code}: {len(data)} records")
-            except Exception as e:
-                print(f"  - {ts_code}: Error fetching - {e}")
-
-        # Combine sample data
-        sample_df = (
-            pd.concat(sample_data_list, ignore_index=True)
-            if sample_data_list
-            else pd.DataFrame()
-        )
-
-        # Estimate total records based on sample
-        if not sample_df.empty:
-            avg_records_per_stock = len(sample_df) / len(sample_data_list)
-            estimated_records = int(avg_records_per_stock * stock_count)
-        else:
-            estimated_records = 0
-
-        # Display preview results
-        print("\n" + "=" * 60)
-        print("[DataSync] Preview Result")
-        print("=" * 60)
-        print(f"  Sync Mode: {mode.upper()}")
-        print(f"  Date Range: {sync_start_date} to {end_date}")
-        print(f"  Stocks to Sync: {stock_count}")
-        print(f"  Sample Stocks Checked: {len(sample_data_list)}/{sample_size}")
-        print(f"  Estimated Total Records: ~{estimated_records:,}")
-
-        if not sample_df.empty:
-            print(f"\n  Sample Data Preview (first {len(sample_df)} rows):")
-            print("  " + "-" * 56)
-            # Display sample data in a compact format
-            preview_cols = [
-                "ts_code",
-                "trade_date",
-                "open",
-                "high",
-                "low",
-                "close",
-                "vol",
-            ]
-            available_cols = [c for c in preview_cols if c in sample_df.columns]
-            sample_display = sample_df[available_cols].head(10)
-            for idx, row in sample_display.iterrows():
-                print(f"    {row.to_dict()}")
-            print("  " + "-" * 56)
-
-        print("=" * 60)
-
-        return {
-            "sync_needed": True,
-            "stock_count": stock_count,
-            "start_date": sync_start_date,
-            "end_date": end_date,
-            "estimated_records": estimated_records,
-            "sample_data": sample_df,
-            "mode": mode,
-        }
-
-    def sync_single_stock(
-        self,
-        ts_code: str,
-        start_date: str,
-        end_date: str,
-    ) -> pd.DataFrame:
-        """Sync daily data for a single stock.
-
-        Args:
-            ts_code: Stock code
-            start_date: Start date (YYYYMMDD)
-            end_date: End date (YYYYMMDD)
-
-        Returns:
-            DataFrame with daily market data
-        """
-        # Check if sync should stop (for exception handling)
-        if not self._stop_flag.is_set():
-            return pd.DataFrame()
-
-        try:
-            # Use shared client for rate limiting across threads
-            data = self.client.query(
-                "pro_bar",
-                ts_code=ts_code,
-                start_date=start_date,
-                end_date=end_date,
-                factors="tor,vr",
-            )
-            return data
-        except Exception as e:
-            # Set stop flag to signal other threads to stop
-            self._stop_flag.clear()
-            print(f"[ERROR] Exception syncing {ts_code}: {e}")
-            raise
-
-    def sync_all(
-        self,
-        force_full: bool = False,
-        start_date: Optional[str] = None,
-        end_date: Optional[str] = None,
-        max_workers: Optional[int] = None,
-        dry_run: bool = False,
-    ) -> Dict[str, pd.DataFrame]:
-        """Sync daily data for all stocks in local storage.
-
-        This function:
-        1. Reads stock codes from local storage (daily or stock_basic)
-        2. Checks trade calendar to determine if sync is needed:
-           - If local data matches trade calendar bounds, SKIP sync (save tokens)
-           - Otherwise, sync from local_last_date + 1 to latest trade day (bandwidth optimized)
-        3. Uses multi-threaded concurrent fetching with rate limiting
-        4. Skips updating stocks that return empty data (delisted/unavailable)
-        5. Stops immediately on any exception
-
-        Args:
-            force_full: If True, force full reload from 20180101
-            start_date: Manual start date (overrides auto-detection)
-            end_date: Manual end date (defaults to today)
-            max_workers: Number of worker threads (default: 10)
-            dry_run: If True, only preview what would be synced without writing data
-
-        Returns:
-            Dict mapping ts_code to DataFrame (empty if sync skipped or dry_run)
-        """
-        print("\n" + "=" * 60)
-        print("[DataSync] Starting daily data sync...")
-        print("=" * 60)
-
-        # First, ensure trade calendar cache is up-to-date (uses incremental sync)
-        print("[DataSync] Syncing trade calendar cache...")
-        sync_trade_cal_cache()
-
-        # Determine date range
-        if end_date is None:
-            end_date = get_today_date()
-
-        # Check if sync is needed based on trade calendar
-        sync_needed, cal_start, cal_end, local_last = self.check_sync_needed(force_full)
-
-        if not sync_needed:
-            # Sync skipped - no tokens consumed
-            print("\n" + "=" * 60)
-            print("[DataSync] Sync Summary")
-            print("=" * 60)
-            print("  Sync: SKIPPED (local data up-to-date with trade calendar)")
-            print("  Tokens saved: 0 consumed")
-            print("=" * 60)
-            return {}
-
-        # Use dates from check_sync_needed (which calculates incremental start if needed)
-        if cal_start and cal_end:
-            sync_start_date = cal_start
-            end_date = cal_end
-        else:
-            # Fallback to default logic
-            sync_start_date = start_date or DEFAULT_START_DATE
-            if end_date is None:
-                end_date = get_today_date()
-
-        # Determine sync mode
-        if force_full:
-            mode = "full"
-            print(f"[DataSync] Mode: FULL SYNC from {sync_start_date} to {end_date}")
-        elif local_last and cal_start and sync_start_date == get_next_date(local_last):
-            mode = "incremental"
-            print(f"[DataSync] Mode: INCREMENTAL SYNC (bandwidth optimized)")
-            print(f"[DataSync] Sync from: {sync_start_date} to {end_date}")
-        else:
-            mode = "partial"
-            print(f"[DataSync] Mode: SYNC from {sync_start_date} to {end_date}")
-
-        # Get all stock codes
-        stock_codes = self.get_all_stock_codes()
-        if not stock_codes:
-            print("[DataSync] No stocks found to sync")
-            return {}
-
-        print(f"[DataSync] Total stocks to sync: {len(stock_codes)}")
-        print(f"[DataSync] Using {max_workers or self.max_workers} worker threads")
-
-        # Handle dry run mode
-        if dry_run:
-            print("\n" + "=" * 60)
-            print("[DataSync] DRY RUN MODE - No data will be written")
-            print("=" * 60)
-            print(f"  Would sync {len(stock_codes)} stocks")
-            print(f"  Date range: {sync_start_date} to {end_date}")
-            print(f"  Mode: {mode}")
-            print("=" * 60)
-            return {}
-
-        # Reset stop flag for new sync
-        self._stop_flag.set()
-
-        # Multi-threaded concurrent fetching
-        results: Dict[str, pd.DataFrame] = {}
-        error_occurred = False
-        exception_to_raise = None
-
-        def sync_task(ts_code: str) -> tuple[str, pd.DataFrame]:
-            """Task function for each stock."""
-            try:
-                data = self.sync_single_stock(
-                    ts_code=ts_code,
-                    start_date=sync_start_date,
-                    end_date=end_date,
-                )
-                return (ts_code, data)
-            except Exception as e:
-                # Re-raise to be caught by Future
-                raise
-
-        # Use ThreadPoolExecutor for concurrent fetching
-        workers = max_workers or self.max_workers
-        with ThreadPoolExecutor(max_workers=workers) as executor:
-            # Submit all tasks and track futures with their stock codes
-            future_to_code = {
-                executor.submit(sync_task, ts_code): ts_code for ts_code in stock_codes
-            }
-
-            # Process results using as_completed
-            error_count = 0
-            empty_count = 0
-            success_count = 0
-
-            # Create progress bar
-            pbar = tqdm(total=len(stock_codes), desc="Syncing stocks")
-
-            try:
-                # Process futures as they complete
-                for future in as_completed(future_to_code):
-                    ts_code = future_to_code[future]
-
-                    try:
-                        _, data = future.result()
-                        if data is not None and not data.empty:
-                            results[ts_code] = data
-                            success_count += 1
-                        else:
-                            # Empty data - stock may be delisted or unavailable
-                            empty_count += 1
-                            print(
-                                f"[DataSync] Stock {ts_code}: empty data (skipped, may be delisted)"
-                            )
-                    except Exception as e:
-                        # Exception occurred - stop all and abort
-                        error_occurred = True
-                        exception_to_raise = e
-                        print(f"\n[ERROR] Sync aborted due to exception: {e}")
-                        # Shutdown executor to stop all pending tasks
-                        executor.shutdown(wait=False, cancel_futures=True)
-                        raise exception_to_raise
-
-                    # Update progress bar
-                    pbar.update(1)
-
-            except Exception:
-                error_count = 1
-                print("[DataSync] Sync stopped due to exception")
-            finally:
-                pbar.close()
-
-        # Queue all data for batch write (only if no error)
-        if results and not error_occurred:
-            for ts_code, data in results.items():
-                if not data.empty:
-                    self.storage.queue_save("daily", data)
-            # Flush all queued writes at once
-            self.storage.flush()
-            total_rows = sum(len(df) for df in results.values())
-            print(f"\n[DataSync] Saved {total_rows} rows to storage")
-
-        # Summary
-        print("\n" + "=" * 60)
-        print("[DataSync] Sync Summary")
-        print("=" * 60)
-        print(f"  Total stocks: {len(stock_codes)}")
-        print(f"  Updated: {success_count}")
-        print(f"  Skipped (empty/delisted): {empty_count}")
-        print(
-            f"  Errors: {error_count} (aborted on first error)"
-            if error_count
-            else "  Errors: 0"
-        )
-        print(f"  Date range: {sync_start_date} to {end_date}")
-        print("=" * 60)
-
-        return results
-
-
-# Convenience functions
+from src.data.api_wrappers import sync_all_stocks
+from src.data.api_wrappers.api_daily import sync_daily, preview_daily_sync


 def preview_sync(
@@ -705,20 +38,19 @@ def preview_sync(
    sample_size: int = 3,
    max_workers: Optional[int] = None,
 ) -> dict:
-    """Preview sync data volume and samples without actually syncing.
+    """预览日线同步数据量和样本（不实际同步）。

-    This is the recommended way to check what would be synced before
-    running the actual synchronization.
+    这是推荐的方式，可在实际同步前检查将要同步的内容。

    Args:
-        force_full: If True, preview full sync from 20180101
-        start_date: Manual start date (overrides auto-detection)
-        end_date: Manual end date (defaults to today)
-        sample_size: Number of sample stocks to fetch for preview (default: 3)
-        max_workers: Number of worker threads (not used in preview, for API compatibility)
+        force_full: 若为 True，预览全量同步（从 20180101）
+        start_date: 手动指定起始日期（覆盖自动检测）
+        end_date: 手动指定结束日期（默认为今天）
+        sample_size: 预览用样本股票数量（默认: 3）
+        max_workers: 工作线程数（预览中未使用，仅为保持接口兼容）

    Returns:
-        Dictionary with preview information:
+        包含预览信息的字典：
        {
            'sync_needed': bool,
            'stock_count': int,
@@ -726,21 +58,20 @@ def preview_sync(
            'end_date': str,
            'estimated_records': int,
            'sample_data': pd.DataFrame,
-            'mode': str,  # 'full', 'incremental', 'partial', or 'none'
+            'mode': str,  # 'full', 'incremental', 'partial', 或 'none'
        }

    Example:
-        >>> # Preview what would be synced
+        >>> # 预览将要同步的内容
        >>> preview = preview_sync()
        >>>
-        >>> # Preview full sync
+        >>> # 预览全量同步
        >>> preview = preview_sync(force_full=True)
        >>>
-        >>> # Preview with more samples
+        >>> # 预览更多样本
        >>> preview = preview_sync(sample_size=5)
    """
-    sync_manager = DataSync(max_workers=max_workers)
-    return sync_manager.preview_sync(
+    return preview_daily_sync(
        force_full=force_full,
        start_date=start_date,
        end_date=end_date,
@@ -755,54 +86,168 @@ def sync_all(
    max_workers: Optional[int] = None,
    dry_run: bool = False,
 ) -> Dict[str, pd.DataFrame]:
-    """Sync daily data for all stocks.
+    """同步所有股票的日线数据。

-    This is the main entry point for data synchronization.
+    这是日线数据同步的主要入口点。

    Args:
-        force_full: If True, force full reload from 20180101
-        start_date: Manual start date (YYYYMMDD)
-        end_date: Manual end date (defaults to today)
-        max_workers: Number of worker threads (default: 10)
-        dry_run: If True, only preview what would be synced without writing data
+        force_full: 若为 True，强制从 20180101 完整重载
+        start_date: 手动指定起始日期（YYYYMMDD）
+        end_date: 手动指定结束日期（默认为今天）
+        max_workers: 工作线程数（默认: 10）
+        dry_run: 若为 True，仅预览将要同步的内容，不写入数据

    Returns:
-        Dict mapping ts_code to DataFrame
+        映射 ts_code 到 DataFrame 的字典

    Example:
-        >>> # First time sync (full load from 20180101)
+        >>> # 首次同步（从 20180101 全量加载）
        >>> result = sync_all()
        >>>
-        >>> # Subsequent sync (incremental - only new data)
+        >>> # 后续同步（增量 - 仅新数据）
        >>> result = sync_all()
        >>>
-        >>> # Force full reload
+        >>> # 强制完整重载
        >>> result = sync_all(force_full=True)
        >>>
-        >>> # Manual date range
+        >>> # 手动指定日期范围
        >>> result = sync_all(start_date='20240101', end_date='20240131')
        >>>
-        >>> # Custom thread count
+        >>> # 自定义线程数
        >>> result = sync_all(max_workers=20)
        >>>
-        >>> # Dry run (preview only)
+        >>> # Dry run（仅预览）
        >>> result = sync_all(dry_run=True)
    """
-    sync_manager = DataSync(max_workers=max_workers)
-    return sync_manager.sync_all(
+    return sync_daily(
        force_full=force_full,
        start_date=start_date,
        end_date=end_date,
+        max_workers=max_workers,
        dry_run=dry_run,
    )


+def sync_all_data(
+    force_full: bool = False,
+    max_workers: Optional[int] = None,
+    dry_run: bool = False,
+) -> Dict[str, pd.DataFrame]:
+    """Run every automatic sync step in order and collect the results.
+
+    Steps, in execution order:
+        1. Trade calendar cache (sync_trade_cal_cache)
+        2. Stock basic info (sync_all_stocks)
+        3. Daily market data (sync_daily)
+        4. Historical stock list (sync_bak_basic)
+
+    A failing step is reported on stdout and stored as an empty DataFrame;
+    the remaining steps still run. Name changes (namechange) are not part
+    of this automatic sync; the closing message shows the manual call.
+
+    Args:
+        force_full: Forwarded to the daily and bak_basic steps to request
+            a full reload.
+        max_workers: Worker thread count forwarded to the daily step.
+        dry_run: Forwarded to the daily step only; the other three steps
+            are invoked the same way whether or not it is set.
+
+    Returns:
+        Dict keyed by "trade_cal", "stock_basic", "daily" and "bak_basic".
+        "daily" is the concatenation of the frames sync_daily returned,
+        "bak_basic" is the sync_bak_basic result, and the other two are
+        empty placeholders. Any failed step is an empty DataFrame.
+    """
+    results: Dict[str, pd.DataFrame] = {}
+
+    print("\n" + "=" * 60)
+    print("[sync_all_data] Starting full data synchronization...")
+    print("=" * 60)
+
+    # Step 1: trade calendar (import kept inside the try so it is reported too).
+    print("\n[1/4] Syncing trade calendar cache...")
+    try:
+        from src.data.api_wrappers import sync_trade_cal_cache
+
+        sync_trade_cal_cache()
+        results["trade_cal"] = pd.DataFrame()
+        print("[1/4] Trade calendar: OK")
+    except Exception as e:
+        print(f"[1/4] Trade calendar: FAILED - {e}")
+        results["trade_cal"] = pd.DataFrame()
+
+    # Step 2: stock basic info.
+    # NOTE(review): the import of sync_all_stocks is not among the lines seen
+    # here; confirm it is in scope, otherwise this step always reports FAILED.
+    print("\n[2/4] Syncing stock basic info...")
+    try:
+        sync_all_stocks()
+        results["stock_basic"] = pd.DataFrame()
+        print("[2/4] Stock basic: OK")
+    except Exception as e:
+        print(f"[2/4] Stock basic: FAILED - {e}")
+        results["stock_basic"] = pd.DataFrame()
+
+    # Step 3: daily market data; the mapping returned by sync_daily is
+    # flattened into one frame (an empty frame when the mapping is empty).
+    print("\n[3/4] Syncing daily market data...")
+    try:
+        daily_result = sync_daily(
+            force_full=force_full,
+            max_workers=max_workers,
+            dry_run=dry_run,
+        )
+        results["daily"] = (
+            pd.concat(daily_result.values(), ignore_index=True)
+            if daily_result
+            else pd.DataFrame()
+        )
+        print("[3/4] Daily data: OK")
+    except Exception as e:
+        print(f"[3/4] Daily data: FAILED - {e}")
+        results["daily"] = pd.DataFrame()
+
+    # Step 4: historical stock list (sync_bak_basic is imported further down).
+    print("\n[4/4] Syncing stock historical list (bak_basic)...")
+    try:
+        bak_basic_result = sync_bak_basic(force_full=force_full)
+        results["bak_basic"] = bak_basic_result
+        print(f"[4/4] Bak basic: OK ({len(bak_basic_result)} records)")
+    except Exception as e:
+        print(f"[4/4] Bak basic: FAILED - {e}")
+        results["bak_basic"] = pd.DataFrame()
+
+    # Summary
+    print("\n" + "=" * 60)
+    print("[sync_all_data] Sync Summary")
+    print("=" * 60)
+    for data_type, df in results.items():
+        print(f"  {data_type}: {len(df)} records")
+    print("=" * 60)
+    print("\nNote: namechange is NOT in auto-sync. To sync manually:")
+    print("  from src.data.api_wrappers import sync_namechange")
+    print("  sync_namechange(force=True)")
+
+    return results
+
+
+# Imported here for backward compatibility; sync_all_data() above also uses it.
+from src.data.api_wrappers import sync_bak_basic
+
+
 if __name__ == "__main__":
    print("=" * 60)
    print("Data Sync Module")
    print("=" * 60)
    print("\nUsage:")
+    print("  # Sync all data types at once (RECOMMENDED)")
+    print("  from src.data.sync import sync_all_data")
+    print("  result = sync_all_data()  # Incremental sync all")
+    print("  result = sync_all_data(force_full=True)  # Full reload")
+    print("")
+    print("  # Or sync individual data types:")
    print("  from src.data.sync import sync_all, preview_sync")
+    print("  from src.data.sync import sync_bak_basic")
    print("")
    print("  # Preview before sync (recommended)")
    print("  preview = preview_sync()")
@@ -813,21 +258,14 @@ if __name__ == "__main__":
    print("  # Actual sync")
    print("  result = sync_all()  # Incremental sync")
    print("  result = sync_all(force_full=True)  # Full reload")
+    print("")
+    print("  # bak_basic sync")
+    print("  result = sync_bak_basic()  # Incremental sync")
+    print("  result = sync_bak_basic(force_full=True)  # Full reload")
    print("\n" + "=" * 60)

-    # Run preview first
-    print("\n[Main] Running preview first...")
-    preview = preview_sync()
-
-    if preview["sync_needed"]:
-        # Ask for confirmation
-        print("\n" + "=" * 60)
-        response = input("Proceed with sync? (y/n): ").strip().lower()
-        if response in ("y", "yes"):
-            print("\n[Main] Starting actual sync...")
-            result = sync_all()
-            print(f"\nSynced {len(result)} stocks")
-        else:
-            print("\n[Main] Sync cancelled by user")
-    else:
-        print("\n[Main] No sync needed - data is up to date")
+    # Run sync_all_data by default
+    print("\n[Main] Running sync_all_data()...")
+    result = sync_all_data()
+    print("\n[Main] Sync completed!")
+    print(f"Total data types synced: {len(result)}")