feat: HDF5迁移至DuckDB存储

- 新增DuckDB Storage与ThreadSafeStorage实现
- 新增db_manager模块支持增量同步策略
- DataLoader与Sync模块适配DuckDB
- 补充迁移相关文档与测试
- 修复README文档链接
This commit is contained in:
2026-02-23 00:07:21 +08:00
parent 0a16129548
commit e58b39970c
14 changed files with 2265 additions and 329 deletions

View File

@@ -10,6 +10,10 @@ from pathlib import Path
from src.data.client import TushareClient
from src.data.config import get_config
# Module-level flag to track if cache has been synced in this session
_cache_synced = False
# Trading calendar cache file path
def _get_cache_path() -> Path:
@@ -51,8 +55,9 @@ def _load_from_cache() -> pd.DataFrame:
try:
with pd.HDFStore(cache_path, mode="r") as store:
if "trade_cal" in store.keys():
data = store["trade_cal"]
# HDFStore.keys() returns keys with a leading slash (e.g., '/trade_cal')
if "/trade_cal" in store.keys():
data = store["/trade_cal"]
print(f"[trade_cal] Loaded {len(data)} records from cache")
return data
except Exception as e:
@@ -77,6 +82,7 @@ def _get_cached_date_range() -> tuple[Optional[str], Optional[str]]:
def sync_trade_cal_cache(
start_date: str = "20180101",
end_date: Optional[str] = None,
force: bool = False,
) -> pd.DataFrame:
"""Sync trade calendar data to local cache with incremental updates.
@@ -86,10 +92,17 @@ def sync_trade_cal_cache(
Args:
start_date: Initial start date for full sync (default: 20180101)
end_date: End date (defaults to today)
force: If True, force sync even if already synced in this session
Returns:
Full trade calendar DataFrame (cached + new)
"""
global _cache_synced
# Skip if already synced in this session (unless forced)
if _cache_synced and not force:
return _load_from_cache()
if end_date is None:
from datetime import datetime
@@ -137,6 +150,8 @@ def sync_trade_cal_cache(
combined = new_data
# Save combined data to cache
# Mark as synced to avoid redundant syncs in this session
_cache_synced = True
_save_to_cache(combined)
return combined
else:
@@ -153,6 +168,8 @@ def sync_trade_cal_cache(
print("[trade_cal] No data returned")
return data
# Mark as synced to avoid redundant syncs in this session
_cache_synced = True
_save_to_cache(data)
return data