feat: HDF5迁移至DuckDB存储

- 新增DuckDB Storage与ThreadSafeStorage实现
- 新增db_manager模块支持增量同步策略
- DataLoader与Sync模块适配DuckDB
- 补充迁移相关文档与测试
- 修复README文档链接
This commit is contained in:
2026-02-23 00:07:21 +08:00
parent 0a16129548
commit e58b39970c
14 changed files with 2265 additions and 329 deletions

View File

@@ -36,7 +36,7 @@ import threading
import sys
from src.data.client import TushareClient
-from src.data.storage import Storage
+from src.data.storage import ThreadSafeStorage
from src.data.api_wrappers import get_daily
from src.data.api_wrappers import (
get_first_trading_day,
@@ -83,7 +83,7 @@ class DataSync:
Args:
max_workers: Number of worker threads (default: 10)
"""
-self.storage = Storage()
+self.storage = ThreadSafeStorage()
self.client = TushareClient()
self.max_workers = max_workers or self.DEFAULT_MAX_WORKERS
self._stop_flag = threading.Event()
@@ -667,11 +667,15 @@ class DataSync:
finally:
pbar.close()
-# Write all data at once (only if no error)
+# Queue all data for batch write (only if no error)
if results and not error_occurred:
-combined_data = pd.concat(results.values(), ignore_index=True)
-self.storage.save("daily", combined_data, mode="append")
-print(f"\n[DataSync] Saved {len(combined_data)} rows to storage")
+for ts_code, data in results.items():
+if not data.empty:
+self.storage.queue_save("daily", data)
+# Flush all queued writes at once
+self.storage.flush()
+total_rows = sum(len(df) for df in results.values())
+print(f"\n[DataSync] Saved {total_rows} rows to storage")
# Summary
print("\n" + "=" * 60)