refactor: 存储层迁移DuckDB + 模块重构
- 存储层重构: HDF5 → DuckDB(UPSERT模式、线程安全存储)
- Sync类迁移: DataSync从sync.py迁移到api_daily.py(职责分离)
- 模型模块重构: src/models → src/pipeline(更清晰的命名)
- 新增因子模块: factors/momentum (MA、收益率排名)、factors/financial
- 新增API接口: api_namechange、api_bak_basic
- 新增训练入口: training模块(main.py、pipeline配置)
- 工具函数统一: get_today_date等移至utils.py
- 文档更新: AGENTS.md添加架构变更历史
This commit is contained in:
910
src/data/sync.py
910
src/data/sync.py
@@ -1,701 +1,34 @@
|
||||
"""Data synchronization module.
|
||||
"""数据同步调度中心模块。
|
||||
|
||||
This module provides data fetching functions with intelligent sync logic:
|
||||
- If local file doesn't exist: fetch all data (full load from 20180101)
|
||||
- If local file exists: incremental update (fetch from latest date + 1 day)
|
||||
- Multi-threaded concurrent fetching for improved performance
|
||||
- Stop immediately on any exception
|
||||
- Preview mode: check data volume and samples before actual sync
|
||||
该模块作为数据同步的调度中心,统一管理各类型数据的同步流程。
|
||||
具体的同步逻辑已迁移到对应的 api_xxx.py 文件中:
|
||||
- api_daily.py: 日线数据同步 (DailySync 类)
|
||||
- api_bak_basic.py: 历史股票列表同步
|
||||
- api_stock_basic.py: 股票基本信息同步
|
||||
- api_trade_cal.py: 交易日历同步
|
||||
|
||||
Currently supported data types:
|
||||
- daily: Daily market data (with turnover rate and volume ratio)
|
||||
注意:名称变更 (namechange) 已从自动同步中移除,
|
||||
因为股票名称变更不频繁,建议手动定期同步。
|
||||
|
||||
Usage:
|
||||
# Preview sync (check data volume and samples without writing)
|
||||
preview_sync()
|
||||
使用方式:
|
||||
# 预览同步(检查数据量,不写入)
|
||||
from src.data.sync import preview_sync
|
||||
preview = preview_sync()
|
||||
|
||||
# Sync all stocks (full load)
|
||||
sync_all()
|
||||
# 同步所有数据(不包括 namechange)
|
||||
from src.data.sync import sync_all_data
|
||||
result = sync_all_data()
|
||||
|
||||
# Sync all stocks (incremental)
|
||||
sync_all()
|
||||
|
||||
# Force full reload
|
||||
sync_all(force_full=True)
|
||||
|
||||
# Dry run (preview only, no write)
|
||||
sync_all(dry_run=True)
|
||||
# 强制全量重载
|
||||
result = sync_all_data(force_full=True)
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict
|
||||
|
||||
import pandas as pd
|
||||
from typing import Optional, Dict, Callable
|
||||
from datetime import datetime, timedelta
|
||||
from tqdm import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import threading
|
||||
import sys
|
||||
|
||||
from src.data.client import TushareClient
|
||||
from src.data.storage import ThreadSafeStorage
|
||||
from src.data.api_wrappers import get_daily
|
||||
from src.data.api_wrappers import (
|
||||
get_first_trading_day,
|
||||
get_last_trading_day,
|
||||
sync_trade_cal_cache,
|
||||
)
|
||||
|
||||
|
||||
# Default full sync start date
|
||||
DEFAULT_START_DATE = "20180101"
|
||||
|
||||
# Today's date in YYYYMMDD format
|
||||
TODAY = datetime.now().strftime("%Y%m%d")
|
||||
|
||||
|
||||
def get_today_date() -> str:
    """Return the current local date formatted as YYYYMMDD.

    The date is computed on every call instead of being read from the
    import-time ``TODAY`` snapshot, so a process that stays alive across
    midnight does not keep reporting the previous day.

    Returns:
        Today's date as an 8-character ``YYYYMMDD`` string.
    """
    return datetime.now().strftime("%Y%m%d")
|
||||
|
||||
|
||||
def get_next_date(date_str: str) -> str:
    """Return the calendar day that follows ``date_str``.

    Args:
        date_str: A date formatted as YYYYMMDD.

    Returns:
        The following calendar day, also formatted as YYYYMMDD.

    Raises:
        ValueError: If ``date_str`` does not parse as YYYYMMDD.
    """
    fmt = "%Y%m%d"
    following_day = datetime.strptime(date_str, fmt) + timedelta(days=1)
    return following_day.strftime(fmt)
|
||||
|
||||
|
||||
class DataSync:
    """Daily-data synchronization manager with full and incremental modes.

    The manager reads previously stored daily data through ``self.storage``,
    decides from the trade calendar whether new data has to be fetched,
    downloads per-stock data through ``self.client`` on a thread pool and
    queues the fetched frames for a single batched write.
    """

    # Default number of worker threads
    DEFAULT_MAX_WORKERS = 10

    def __init__(self, max_workers: Optional[int] = None):
        """Initialize the sync manager.

        Args:
            max_workers: Number of worker threads (default: 10).
        """
        self.storage = ThreadSafeStorage()
        self.client = TushareClient()
        self.max_workers = max_workers or self.DEFAULT_MAX_WORKERS
        # NOTE: inverted semantics - the event being *set* means "keep
        # running"; it is cleared by the first worker that hits an exception
        # so that workers starting afterwards return immediately.
        self._stop_flag = threading.Event()
        self._stop_flag.set()
        # In-memory copy of the stored "daily" table (None = not loaded yet).
        self._cached_daily_data: Optional[pd.DataFrame] = None

    def _load_daily_data(self) -> pd.DataFrame:
        """Return the stored daily data, loading it from storage only once.

        The frame is kept in memory to avoid repeated storage reads; call
        ``clear_cache()`` to force a reload on the next access.

        Returns:
            DataFrame with daily data (cached or freshly loaded).
        """
        if self._cached_daily_data is None:
            self._cached_daily_data = self.storage.load("daily")
        return self._cached_daily_data

    def clear_cache(self) -> None:
        """Drop the cached daily data so the next access reloads it."""
        self._cached_daily_data = None

    def get_all_stock_codes(self, only_listed: bool = True) -> list:
        """Collect the stock codes to sync.

        ``stock_basic.csv`` is refreshed first and used as the primary
        source; the locally stored daily data is only a fallback when the
        CSV is missing, unreadable, empty or lacks a ``ts_code`` column.

        Args:
            only_listed: If True, keep only rows whose ``list_status`` is
                ``"L"`` (when that column exists). Pass False to include
                every code found in the CSV.

        Returns:
            List of stock codes; empty when no source yields any code.
        """
        # Imported here to avoid circular imports
        from src.data.api_wrappers import sync_all_stocks
        from src.data.api_wrappers.api_stock_basic import _get_csv_path

        # First, ensure stock_basic.csv is up-to-date with all stocks
        print("[DataSync] Ensuring stock_basic.csv is up-to-date...")
        sync_all_stocks()

        stock_csv_path = _get_csv_path()

        if stock_csv_path.exists():
            print(f"[DataSync] Reading stock_basic from CSV: {stock_csv_path}")
            try:
                stock_df = pd.read_csv(stock_csv_path, encoding="utf-8-sig")
                if not stock_df.empty and "ts_code" in stock_df.columns:
                    if only_listed and "list_status" in stock_df.columns:
                        listed_stocks = stock_df[stock_df["list_status"] == "L"]
                        codes = listed_stocks["ts_code"].unique().tolist()
                        total = len(stock_df["ts_code"].unique())
                        print(
                            f"[DataSync] Found {len(codes)} listed stocks (filtered from {total} total)"
                        )
                    else:
                        codes = stock_df["ts_code"].unique().tolist()
                        print(
                            f"[DataSync] Found {len(codes)} stock codes from stock_basic.csv"
                        )
                    return codes
                print(
                    "[DataSync] stock_basic.csv exists but no ts_code column or empty"
                )
            except Exception as e:
                # Best effort: any failure here falls through to the
                # daily-data fallback below.
                print(f"[DataSync] Error reading stock_basic.csv: {e}")

        # Fallback: derive the codes from the (cached) daily data
        print("[DataSync] stock_basic.csv not available, falling back to daily data...")
        daily_data = self._load_daily_data()
        if not daily_data.empty and "ts_code" in daily_data.columns:
            codes = daily_data["ts_code"].unique().tolist()
            print(f"[DataSync] Found {len(codes)} stock codes from daily data")
            return codes

        print("[DataSync] No stock codes found in local storage")
        return []

    def get_global_last_date(self) -> Optional[str]:
        """Return the latest ``trade_date`` in the stored daily data.

        Returns:
            The maximum ``trade_date`` as a string, or None when the data is
            empty or has no ``trade_date`` column.
        """
        daily_data = self._load_daily_data()
        if daily_data.empty or "trade_date" not in daily_data.columns:
            return None
        return str(daily_data["trade_date"].max())

    def get_global_first_date(self) -> Optional[str]:
        """Return the earliest ``trade_date`` in the stored daily data.

        Returns:
            The minimum ``trade_date`` as a string, or None when the data is
            empty or has no ``trade_date`` column.
        """
        daily_data = self._load_daily_data()
        if daily_data.empty or "trade_date" not in daily_data.columns:
            return None
        return str(daily_data["trade_date"].min())

    def get_trade_calendar_bounds(
        self, start_date: str, end_date: str
    ) -> tuple[Optional[str], Optional[str]]:
        """Look up the first and last trading day within a date range.

        Args:
            start_date: Start date in YYYYMMDD format.
            end_date: End date in YYYYMMDD format.

        Returns:
            Tuple ``(first_trading_day, last_trading_day)``, or
            ``(None, None)`` when the calendar lookup raises.
        """
        try:
            first_day = get_first_trading_day(start_date, end_date)
            last_day = get_last_trading_day(start_date, end_date)
            return (first_day, last_day)
        except Exception as e:
            print(f"[ERROR] Failed to get trade calendar bounds: {e}")
            return (None, None)

    def check_sync_needed(
        self, force_full: bool = False
    ) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:
        """Decide from the trade calendar whether a sync is required.

        Logic:
            - ``force_full``: sync from the default start date to today.
            - No local data: sync from the default start date to today.
            - Trade calendar unavailable: sync from the default start date
              to today.
            - Local last date >= calendar last trading day: no sync.
            - Otherwise: sync from local last date + 1 day to the calendar's
              last trading day.

        Args:
            force_full: If True, always report that a sync is needed.

        Returns:
            Tuple ``(sync_needed, start_date, end_date, local_last_date)``:
                - sync_needed: True if the sync should proceed.
                - start_date: Sync start date (None if no sync is needed).
                - end_date: Sync end date (None if no sync is needed).
                - local_last_date: Last locally stored trade date; None for
                  forced/first-time syncs and when no sync is needed.
        """
        if force_full:
            print("[DataSync] Force full sync requested")
            return (True, DEFAULT_START_DATE, get_today_date(), None)

        # Check if local data exists (using cached data)
        daily_data = self._load_daily_data()
        if daily_data.empty or "trade_date" not in daily_data.columns:
            print("[DataSync] No local data found, full sync needed")
            return (True, DEFAULT_START_DATE, get_today_date(), None)

        # Only the latest local date matters for the decision
        local_last_date = str(daily_data["trade_date"].max())
        print(f"[DataSync] Local data last date: {local_last_date}")

        today = get_today_date()
        _, cal_last = self.get_trade_calendar_bounds(DEFAULT_START_DATE, today)

        if cal_last is None:
            print("[DataSync] Failed to get trade calendar, proceeding with sync")
            return (True, DEFAULT_START_DATE, today, local_last_date)

        print(f"[DataSync] Calendar last trading day: {cal_last}")

        print(
            f"[DataSync] Comparing: local={local_last_date} (type={type(local_last_date).__name__}), cal={cal_last} (type={type(cal_last).__name__})"
        )
        try:
            # Compare numerically so str/int calendar values behave the same
            local_last_int = int(local_last_date)
            cal_last_int = int(cal_last)
            print(
                f"[DataSync] Comparing integers: local={local_last_int} >= cal={cal_last_int} = {local_last_int >= cal_last_int}"
            )
            if local_last_int >= cal_last_int:
                print(
                    "[DataSync] Local data is up-to-date, SKIPPING sync (no tokens consumed)"
                )
                return (False, None, None, None)
        except (ValueError, TypeError) as e:
            # A failed comparison is treated as "sync needed"
            print(f"[ERROR] Date comparison failed: {e}")

        sync_start = get_next_date(local_last_date)
        print(f"[DataSync] Incremental sync needed from {sync_start} to {cal_last}")
        return (True, sync_start, cal_last, local_last_date)

    def _plan_sync(
        self,
        force_full: bool,
        start_date: Optional[str],
        end_date: Optional[str],
    ) -> Optional[tuple[str, str, str]]:
        """Resolve the date range and mode shared by preview and real sync.

        Explicitly passed dates take precedence over the automatically
        detected range. An explicit ``start_date`` also bypasses the
        "already up-to-date" skip, because it is a deliberate request for
        that range.

        Args:
            force_full: Forwarded to ``check_sync_needed``.
            start_date: Caller-supplied start date, or None for auto.
            end_date: Caller-supplied end date, or None for auto.

        Returns:
            ``(sync_start, sync_end, mode)`` with ``mode`` one of
            ``"full"``, ``"incremental"`` or ``"partial"``; None when no
            sync is needed.
        """
        sync_needed, auto_start, auto_end, local_last = self.check_sync_needed(
            force_full
        )

        if not sync_needed and start_date is None:
            return None

        if start_date is not None:
            sync_start = start_date
        else:
            sync_start = auto_start or DEFAULT_START_DATE

        if end_date is not None:
            sync_end = end_date
        else:
            sync_end = auto_end or get_today_date()

        if force_full:
            mode = "full"
            print(f"[DataSync] Mode: FULL SYNC from {sync_start} to {sync_end}")
        elif (
            start_date is None
            and local_last
            and sync_start == get_next_date(local_last)
        ):
            mode = "incremental"
            print("[DataSync] Mode: INCREMENTAL SYNC (bandwidth optimized)")
            print(f"[DataSync] Sync from: {sync_start} to {sync_end}")
        else:
            mode = "partial"
            print(f"[DataSync] Mode: SYNC from {sync_start} to {sync_end}")

        return sync_start, sync_end, mode

    @staticmethod
    def _empty_preview() -> dict:
        """Build the preview result used when there is nothing to sync."""
        return {
            "sync_needed": False,
            "stock_count": 0,
            "start_date": None,
            "end_date": None,
            "estimated_records": 0,
            "sample_data": pd.DataFrame(),
            "mode": "none",
        }

    def preview_sync(
        self,
        force_full: bool = False,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        sample_size: int = 3,
    ) -> dict:
        """Preview the sync volume and sample data without writing anything.

        The preview reports the number of stocks, the date range, an
        estimated record count extrapolated from the sampled stocks, and the
        sampled rows themselves.

        Args:
            force_full: If True, preview a full sync from the default start
                date.
            start_date: Manual start date (overrides auto-detection).
            end_date: Manual end date (overrides auto-detection).
            sample_size: Number of stocks to fetch as a sample (default: 3).

        Returns:
            Dictionary with preview information:
                {
                    'sync_needed': bool,
                    'stock_count': int,
                    'start_date': str,
                    'end_date': str,
                    'estimated_records': int,
                    'sample_data': pd.DataFrame,
                    'mode': str,  # 'full', 'incremental', 'partial' or 'none'
                }
        """
        print("\n" + "=" * 60)
        print("[DataSync] Preview Mode - Analyzing sync requirements...")
        print("=" * 60)

        # The calendar cache must be current before the sync check runs
        print("[DataSync] Syncing trade calendar cache...")
        sync_trade_cal_cache()

        plan = self._plan_sync(force_full, start_date, end_date)
        if plan is None:
            print("\n" + "=" * 60)
            print("[DataSync] Preview Result")
            print("=" * 60)
            print(" Sync Status: NOT NEEDED")
            print(" Reason: Local data is up-to-date with trade calendar")
            print("=" * 60)
            return self._empty_preview()

        sync_start_date, end_date, mode = plan

        stock_codes = self.get_all_stock_codes()
        if not stock_codes:
            print("[DataSync] No stocks found to sync")
            return self._empty_preview()

        stock_count = len(stock_codes)
        print(f"[DataSync] Total stocks to sync: {stock_count}")

        # Fetch sample data from the first few stocks
        print(f"[DataSync] Fetching sample data from {sample_size} stocks...")
        sample_data_list = []
        for ts_code in stock_codes[:sample_size]:
            try:
                data = self.client.query(
                    "pro_bar",
                    ts_code=ts_code,
                    start_date=sync_start_date,
                    end_date=end_date,
                    factors="tor,vr",
                )
                if not data.empty:
                    sample_data_list.append(data)
                    print(f" - {ts_code}: {len(data)} records")
            except Exception as e:
                # A failing sample must not abort the preview
                print(f" - {ts_code}: Error fetching - {e}")

        sample_df = (
            pd.concat(sample_data_list, ignore_index=True)
            if sample_data_list
            else pd.DataFrame()
        )

        # Extrapolate from the average size of the non-empty samples
        if not sample_df.empty:
            avg_records_per_stock = len(sample_df) / len(sample_data_list)
            estimated_records = int(avg_records_per_stock * stock_count)
        else:
            estimated_records = 0

        print("\n" + "=" * 60)
        print("[DataSync] Preview Result")
        print("=" * 60)
        print(f" Sync Mode: {mode.upper()}")
        print(f" Date Range: {sync_start_date} to {end_date}")
        print(f" Stocks to Sync: {stock_count}")
        print(f" Sample Stocks Checked: {len(sample_data_list)}/{sample_size}")
        print(f" Estimated Total Records: ~{estimated_records:,}")

        if not sample_df.empty:
            preview_cols = [
                "ts_code",
                "trade_date",
                "open",
                "high",
                "low",
                "close",
                "vol",
            ]
            available_cols = [c for c in preview_cols if c in sample_df.columns]
            sample_display = sample_df[available_cols].head(10)
            # Report the number of rows actually printed, not the sample size
            print(f"\n Sample Data Preview (first {len(sample_display)} rows):")
            print(" " + "-" * 56)
            for _, row in sample_display.iterrows():
                print(f" {row.to_dict()}")
            print(" " + "-" * 56)

        print("=" * 60)

        return {
            "sync_needed": True,
            "stock_count": stock_count,
            "start_date": sync_start_date,
            "end_date": end_date,
            "estimated_records": estimated_records,
            "sample_data": sample_df,
            "mode": mode,
        }

    def sync_single_stock(
        self,
        ts_code: str,
        start_date: str,
        end_date: str,
    ) -> pd.DataFrame:
        """Fetch daily data for a single stock.

        Args:
            ts_code: Stock code.
            start_date: Start date (YYYYMMDD).
            end_date: End date (YYYYMMDD).

        Returns:
            The frame returned by the client, or an empty DataFrame when the
            sync has already been aborted by another worker.

        Raises:
            Exception: Whatever the client raises; the abort flag is cleared
                before re-raising so other workers stop fetching.
        """
        # A cleared flag means another worker failed: do no further work
        if not self._stop_flag.is_set():
            return pd.DataFrame()

        try:
            # One shared client instance is used by every worker thread
            return self.client.query(
                "pro_bar",
                ts_code=ts_code,
                start_date=start_date,
                end_date=end_date,
                factors="tor,vr",
            )
        except Exception as e:
            # Signal the other workers to stop
            self._stop_flag.clear()
            print(f"[ERROR] Exception syncing {ts_code}: {e}")
            raise

    def sync_all(
        self,
        force_full: bool = False,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        max_workers: Optional[int] = None,
        dry_run: bool = False,
    ) -> Dict[str, pd.DataFrame]:
        """Sync daily data for all stocks.

        Steps:
            1. Refresh the trade calendar cache.
            2. Resolve the date range; without manual dates the sync is
               skipped when local data already reaches the calendar's last
               trading day.
            3. Fetch every stock concurrently on a thread pool.
            4. Ignore stocks that return empty data.
            5. Abort on the first exception; nothing is written in that
               case and the exception is reported, not re-raised.

        Args:
            force_full: If True, force a full reload from the default start
                date.
            start_date: Manual start date (overrides auto-detection).
            end_date: Manual end date (overrides auto-detection).
            max_workers: Number of worker threads (default: the value given
                to the constructor).
            dry_run: If True, only report what would be synced.

        Returns:
            Dict mapping ts_code to the fetched DataFrame. Empty when the
            sync is skipped or ``dry_run`` is set. After an aborted sync it
            holds the frames fetched before the failure, none of which have
            been saved.
        """
        print("\n" + "=" * 60)
        print("[DataSync] Starting daily data sync...")
        print("=" * 60)

        # The calendar cache must be current before the sync check runs
        print("[DataSync] Syncing trade calendar cache...")
        sync_trade_cal_cache()

        plan = self._plan_sync(force_full, start_date, end_date)
        if plan is None:
            print("\n" + "=" * 60)
            print("[DataSync] Sync Summary")
            print("=" * 60)
            print(" Sync: SKIPPED (local data up-to-date with trade calendar)")
            print(" Tokens saved: 0 consumed")
            print("=" * 60)
            return {}

        sync_start_date, end_date, mode = plan

        stock_codes = self.get_all_stock_codes()
        if not stock_codes:
            print("[DataSync] No stocks found to sync")
            return {}

        workers = max_workers or self.max_workers
        print(f"[DataSync] Total stocks to sync: {len(stock_codes)}")
        print(f"[DataSync] Using {workers} worker threads")

        if dry_run:
            print("\n" + "=" * 60)
            print("[DataSync] DRY RUN MODE - No data will be written")
            print("=" * 60)
            print(f" Would sync {len(stock_codes)} stocks")
            print(f" Date range: {sync_start_date} to {end_date}")
            print(f" Mode: {mode}")
            print("=" * 60)
            return {}

        # Re-arm the flag: a previous aborted run leaves it cleared
        self._stop_flag.set()

        results: Dict[str, pd.DataFrame] = {}
        error_count = 0
        empty_count = 0
        success_count = 0

        def sync_task(ts_code: str) -> tuple[str, pd.DataFrame]:
            """Fetch one stock; exceptions surface through the Future."""
            data = self.sync_single_stock(
                ts_code=ts_code,
                start_date=sync_start_date,
                end_date=end_date,
            )
            return (ts_code, data)

        with ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_code = {
                executor.submit(sync_task, ts_code): ts_code for ts_code in stock_codes
            }

            pbar = tqdm(total=len(stock_codes), desc="Syncing stocks")
            try:
                for future in as_completed(future_to_code):
                    ts_code = future_to_code[future]
                    try:
                        _, data = future.result()
                    except Exception as e:
                        # First failure aborts the whole run: cancel what has
                        # not started yet and stop consuming results.
                        error_count = 1
                        print(f"\n[ERROR] Sync aborted due to exception: {e}")
                        executor.shutdown(wait=False, cancel_futures=True)
                        print("[DataSync] Sync stopped due to exception")
                        break

                    if data is not None and not data.empty:
                        results[ts_code] = data
                        success_count += 1
                    else:
                        # Empty data - stock may be delisted or unavailable
                        empty_count += 1
                        print(
                            f"[DataSync] Stock {ts_code}: empty data (skipped, may be delisted)"
                        )
                    pbar.update(1)
            finally:
                pbar.close()

        # Persist only complete runs; `results` holds non-empty frames only
        if results and not error_count:
            for data in results.values():
                self.storage.queue_save("daily", data)
            # Flush all queued writes at once
            self.storage.flush()
            total_rows = sum(len(df) for df in results.values())
            print(f"\n[DataSync] Saved {total_rows} rows to storage")

        print("\n" + "=" * 60)
        print("[DataSync] Sync Summary")
        print("=" * 60)
        print(f" Total stocks: {len(stock_codes)}")
        print(f" Updated: {success_count}")
        print(f" Skipped (empty/delisted): {empty_count}")
        print(
            f" Errors: {error_count} (aborted on first error)"
            if error_count
            else " Errors: 0"
        )
        print(f" Date range: {sync_start_date} to {end_date}")
        print("=" * 60)

        return results
|
||||
|
||||
|
||||
# Convenience functions
|
||||
from src.data.api_wrappers import sync_all_stocks
|
||||
from src.data.api_wrappers.api_daily import sync_daily, preview_daily_sync
|
||||
|
||||
|
||||
def preview_sync(
|
||||
@@ -705,20 +38,19 @@ def preview_sync(
|
||||
sample_size: int = 3,
|
||||
max_workers: Optional[int] = None,
|
||||
) -> dict:
|
||||
"""Preview sync data volume and samples without actually syncing.
|
||||
"""预览日线同步数据量和样本(不实际同步)。
|
||||
|
||||
This is the recommended way to check what would be synced before
|
||||
running the actual synchronization.
|
||||
这是推荐的方式,可在实际同步前检查将要同步的内容。
|
||||
|
||||
Args:
|
||||
force_full: If True, preview full sync from 20180101
|
||||
start_date: Manual start date (overrides auto-detection)
|
||||
end_date: Manual end date (defaults to today)
|
||||
sample_size: Number of sample stocks to fetch for preview (default: 3)
|
||||
max_workers: Number of worker threads (not used in preview, for API compatibility)
|
||||
force_full: 若为 True,预览全量同步(从 20180101)
|
||||
start_date: 手动指定起始日期(覆盖自动检测)
|
||||
end_date: 手动指定结束日期(默认为今天)
|
||||
sample_size: 预览用样本股票数量(默认: 3)
|
||||
max_workers: 工作线程数(默认: 10)
|
||||
|
||||
Returns:
|
||||
Dictionary with preview information:
|
||||
包含预览信息的字典:
|
||||
{
|
||||
'sync_needed': bool,
|
||||
'stock_count': int,
|
||||
@@ -726,21 +58,20 @@ def preview_sync(
|
||||
'end_date': str,
|
||||
'estimated_records': int,
|
||||
'sample_data': pd.DataFrame,
|
||||
'mode': str, # 'full', 'incremental', 'partial', or 'none'
|
||||
'mode': str, # 'full', 'incremental', 'partial', 或 'none'
|
||||
}
|
||||
|
||||
Example:
|
||||
>>> # Preview what would be synced
|
||||
>>> # 预览将要同步的内容
|
||||
>>> preview = preview_sync()
|
||||
>>>
|
||||
>>> # Preview full sync
|
||||
>>> # 预览全量同步
|
||||
>>> preview = preview_sync(force_full=True)
|
||||
>>>
|
||||
>>> # Preview with more samples
|
||||
>>> # 预览更多样本
|
||||
>>> preview = preview_sync(sample_size=5)
|
||||
"""
|
||||
sync_manager = DataSync(max_workers=max_workers)
|
||||
return sync_manager.preview_sync(
|
||||
return preview_daily_sync(
|
||||
force_full=force_full,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
@@ -755,54 +86,168 @@ def sync_all(
|
||||
max_workers: Optional[int] = None,
|
||||
dry_run: bool = False,
|
||||
) -> Dict[str, pd.DataFrame]:
|
||||
"""Sync daily data for all stocks.
|
||||
"""同步所有股票的日线数据。
|
||||
|
||||
This is the main entry point for data synchronization.
|
||||
这是日线数据同步的主要入口点。
|
||||
|
||||
Args:
|
||||
force_full: If True, force full reload from 20180101
|
||||
start_date: Manual start date (YYYYMMDD)
|
||||
end_date: Manual end date (defaults to today)
|
||||
max_workers: Number of worker threads (default: 10)
|
||||
dry_run: If True, only preview what would be synced without writing data
|
||||
force_full: 若为 True,强制从 20180101 完整重载
|
||||
start_date: 手动指定起始日期(YYYYMMDD)
|
||||
end_date: 手动指定结束日期(默认为今天)
|
||||
max_workers: 工作线程数(默认: 10)
|
||||
dry_run: 若为 True,仅预览将要同步的内容,不写入数据
|
||||
|
||||
Returns:
|
||||
Dict mapping ts_code to DataFrame
|
||||
映射 ts_code 到 DataFrame 的字典
|
||||
|
||||
Example:
|
||||
>>> # First time sync (full load from 20180101)
|
||||
>>> # 首次同步(从 20180101 全量加载)
|
||||
>>> result = sync_all()
|
||||
>>>
|
||||
>>> # Subsequent sync (incremental - only new data)
|
||||
>>> # 后续同步(增量 - 仅新数据)
|
||||
>>> result = sync_all()
|
||||
>>>
|
||||
>>> # Force full reload
|
||||
>>> # 强制完整重载
|
||||
>>> result = sync_all(force_full=True)
|
||||
>>>
|
||||
>>> # Manual date range
|
||||
>>> # 手动指定日期范围
|
||||
>>> result = sync_all(start_date='20240101', end_date='20240131')
|
||||
>>>
|
||||
>>> # Custom thread count
|
||||
>>> # 自定义线程数
|
||||
>>> result = sync_all(max_workers=20)
|
||||
>>>
|
||||
>>> # Dry run (preview only)
|
||||
>>> # Dry run(仅预览)
|
||||
>>> result = sync_all(dry_run=True)
|
||||
"""
|
||||
sync_manager = DataSync(max_workers=max_workers)
|
||||
return sync_manager.sync_all(
|
||||
return sync_daily(
|
||||
force_full=force_full,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
max_workers=max_workers,
|
||||
dry_run=dry_run,
|
||||
)
|
||||
|
||||
|
||||
def sync_all_data(
    force_full: bool = False,
    max_workers: Optional[int] = None,
    dry_run: bool = False,
) -> Dict[str, pd.DataFrame]:
    """Sync every auto-synced data type in order (the daily routine).

    Steps, run sequentially:
        1. Trade calendar (sync_trade_cal_cache)
        2. Stock basic info (sync_all_stocks)
        3. Daily market data (sync_daily)
        4. Historical stock list (sync_bak_basic)

    A failing step is reported and recorded as an empty DataFrame; the
    remaining steps still run.

    Note: name changes (namechange) are not part of the automatic sync;
    call ``sync_namechange`` manually when needed.

    Args:
        force_full: If True, force a full reload. Forwarded to the daily
            and bak_basic steps.
        max_workers: Worker thread count for the daily data sync.
        dry_run: If True, the daily step only reports what would be synced
            and writes nothing. The flag is forwarded to the daily step
            only; the other steps run normally.

    Returns:
        Dict mapping each data type name ("trade_cal", "stock_basic",
        "daily", "bak_basic") to its sync result DataFrame.

    Example:
        >>> # Sync everything (incremental)
        >>> result = sync_all_data()
        >>>
        >>> # Force a full reload
        >>> result = sync_all_data(force_full=True)
        >>>
        >>> # Dry run
        >>> result = sync_all_data(dry_run=True)
    """
    results: Dict[str, pd.DataFrame] = {}
    # Single source of truth for the "[n/total]" progress labels
    total_steps = 4

    print("\n" + "=" * 60)
    print("[sync_all_data] Starting full data synchronization...")
    print("=" * 60)

    # 1. Sync trade calendar (always needed first)
    print(f"\n[1/{total_steps}] Syncing trade calendar cache...")
    try:
        from src.data.api_wrappers import sync_trade_cal_cache

        sync_trade_cal_cache()
        results["trade_cal"] = pd.DataFrame()
        print(f"[1/{total_steps}] Trade calendar: OK")
    except Exception as e:
        print(f"[1/{total_steps}] Trade calendar: FAILED - {e}")
        results["trade_cal"] = pd.DataFrame()

    # 2. Sync stock basic info
    print(f"\n[2/{total_steps}] Syncing stock basic info...")
    try:
        sync_all_stocks()
        results["stock_basic"] = pd.DataFrame()
        print(f"[2/{total_steps}] Stock basic: OK")
    except Exception as e:
        print(f"[2/{total_steps}] Stock basic: FAILED - {e}")
        results["stock_basic"] = pd.DataFrame()

    # 3. Sync daily market data
    print(f"\n[3/{total_steps}] Syncing daily market data...")
    try:
        daily_result = sync_daily(
            force_full=force_full,
            max_workers=max_workers,
            dry_run=dry_run,
        )
        # Flatten the per-stock frames into one frame for the summary
        results["daily"] = (
            pd.concat(daily_result.values(), ignore_index=True)
            if daily_result
            else pd.DataFrame()
        )
        print(f"[3/{total_steps}] Daily data: OK")
    except Exception as e:
        print(f"[3/{total_steps}] Daily data: FAILED - {e}")
        results["daily"] = pd.DataFrame()

    # 4. Sync stock historical list (bak_basic)
    print(f"\n[4/{total_steps}] Syncing stock historical list (bak_basic)...")
    try:
        # Imported locally so this step does not rely on an import that
        # sits below this function in the module.
        from src.data.api_wrappers import sync_bak_basic

        bak_basic_result = sync_bak_basic(force_full=force_full)
        results["bak_basic"] = bak_basic_result
        print(f"[4/{total_steps}] Bak basic: OK ({len(bak_basic_result)} records)")
    except Exception as e:
        print(f"[4/{total_steps}] Bak basic: FAILED - {e}")
        results["bak_basic"] = pd.DataFrame()

    # Summary
    print("\n" + "=" * 60)
    print("[sync_all_data] Sync Summary")
    print("=" * 60)
    for data_type, df in results.items():
        print(f" {data_type}: {len(df)} records")
    print("=" * 60)
    print("\nNote: namechange is NOT in auto-sync. To sync manually:")
    print(" from src.data.api_wrappers import sync_namechange")
    print(" sync_namechange(force=True)")

    return results
|
||||
|
||||
|
||||
# 保留向后兼容的导入
|
||||
from src.data.api_wrappers import sync_bak_basic
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("=" * 60)
|
||||
print("Data Sync Module")
|
||||
print("=" * 60)
|
||||
print("\nUsage:")
|
||||
print(" # Sync all data types at once (RECOMMENDED)")
|
||||
print(" from src.data.sync import sync_all_data")
|
||||
print(" result = sync_all_data() # Incremental sync all")
|
||||
print(" result = sync_all_data(force_full=True) # Full reload")
|
||||
print("")
|
||||
print(" # Or sync individual data types:")
|
||||
print(" from src.data.sync import sync_all, preview_sync")
|
||||
print(" from src.data.sync import sync_bak_basic")
|
||||
print("")
|
||||
print(" # Preview before sync (recommended)")
|
||||
print(" preview = preview_sync()")
|
||||
@@ -813,21 +258,14 @@ if __name__ == "__main__":
|
||||
print(" # Actual sync")
|
||||
print(" result = sync_all() # Incremental sync")
|
||||
print(" result = sync_all(force_full=True) # Full reload")
|
||||
print("")
|
||||
print(" # bak_basic sync")
|
||||
print(" result = sync_bak_basic() # Incremental sync")
|
||||
print(" result = sync_bak_basic(force_full=True) # Full reload")
|
||||
print("\n" + "=" * 60)
|
||||
|
||||
# Run preview first
|
||||
print("\n[Main] Running preview first...")
|
||||
preview = preview_sync()
|
||||
|
||||
if preview["sync_needed"]:
|
||||
# Ask for confirmation
|
||||
print("\n" + "=" * 60)
|
||||
response = input("Proceed with sync? (y/n): ").strip().lower()
|
||||
if response in ("y", "yes"):
|
||||
print("\n[Main] Starting actual sync...")
|
||||
result = sync_all()
|
||||
print(f"\nSynced {len(result)} stocks")
|
||||
else:
|
||||
print("\n[Main] Sync cancelled by user")
|
||||
else:
|
||||
print("\n[Main] No sync needed - data is up to date")
|
||||
# Run sync_all_data by default
|
||||
print("\n[Main] Running sync_all_data()...")
|
||||
result = sync_all_data()
|
||||
print("\n[Main] Sync completed!")
|
||||
print(f"Total data types synced: {len(result)}")
|
||||
|
||||
Reference in New Issue
Block a user