feat(data): 为数据同步添加事务支持和同步日志

- Storage/ThreadSafeStorage 添加事务支持(begin/commit/rollback)
- 新增 SyncLogManager 记录所有同步任务的执行状态
- 集成事务到 StockBasedSync、DateBasedSync、QuarterBasedSync
- 在 sync_all 和 sync_financial 调度中心添加日志记录
- 新增测试验证事务和日志功能
This commit is contained in:
2026-03-23 21:10:15 +08:00
parent 31b25074c3
commit bace4cc5f4
10 changed files with 1468 additions and 177 deletions

View File

@@ -25,6 +25,7 @@ from tqdm import tqdm
from src.data.client import TushareClient
from src.data.storage import ThreadSafeStorage, Storage
from src.data.sync_logger import SyncLogManager
from src.data.utils import get_today_date, get_quarters_in_range, DEFAULT_START_DATE
@@ -466,18 +467,28 @@ class QuarterBasedSync(ABC):
inserted_count = 0
if is_first_sync_for_period:
# 首次同步该季度:直接插入所有数据
# 首次同步该季度:直接插入所有数据(使用事务)
print(
f"[{self.__class__.__name__}] First sync for quarter {period}, inserting all data directly"
)
if not dry_run:
self.storage.queue_save(self.table_name, remote_df, use_upsert=False)
self.storage.flush()
inserted_count = len(remote_df)
print(
f"[{self.__class__.__name__}] Inserted {inserted_count} new records"
)
try:
# 开始事务
self.storage.begin_transaction()
self.storage.queue_save(
self.table_name, remote_df, use_upsert=False
)
self.storage.flush(use_transaction=False)
self.storage.commit_transaction()
inserted_count = len(remote_df)
print(
f"[{self.__class__.__name__}] Inserted {inserted_count} new records (transaction committed)"
)
except Exception as e:
self.storage.rollback_transaction()
print(f"[{self.__class__.__name__}] Transaction rolled back: {e}")
raise
return {
"period": period,
@@ -501,19 +512,33 @@ class QuarterBasedSync(ABC):
print(f" - Unchanged stocks: {unchanged_count}")
if not dry_run and not diff_df.empty:
# 5.1 删除差异股票的旧数据
deleted_stocks_count = len(diff_stocks)
self.delete_stock_quarter_data(period, diff_stocks)
deleted_count = len(diff_df)
print(
f"[{self.__class__.__name__}] Deleted {deleted_stocks_count} stocks' old records (approx {deleted_count} rows)"
)
try:
# 开始事务
self.storage.begin_transaction()
# 5.2 插入新数据(使用普通 INSERT,因为已删除旧数据)
self.storage.queue_save(self.table_name, diff_df, use_upsert=False)
self.storage.flush()
inserted_count = len(diff_df)
print(f"[{self.__class__.__name__}] Inserted {inserted_count} new records")
# 5.1 删除差异股票的旧数据
deleted_stocks_count = len(diff_stocks)
self.delete_stock_quarter_data(period, diff_stocks)
deleted_count = len(diff_df)
print(
f"[{self.__class__.__name__}] Deleted {deleted_stocks_count} stocks' old records (approx {deleted_count} rows)"
)
# 5.2 插入新数据(使用普通 INSERT,因为已删除旧数据)
self.storage.queue_save(self.table_name, diff_df, use_upsert=False)
self.storage.flush(use_transaction=False)
inserted_count = len(diff_df)
# 提交事务
self.storage.commit_transaction()
print(
f"[{self.__class__.__name__}] Inserted {inserted_count} new records (transaction committed)"
)
except Exception as e:
self.storage.rollback_transaction()
print(f"[{self.__class__.__name__}] Transaction rolled back: {e}")
raise
return {
"period": period,
@@ -583,55 +608,86 @@ class QuarterBasedSync(ABC):
print(f"[{self.__class__.__name__}] Incremental Sync")
print(f"{'=' * 60}")
# 0. 确保表存在(首次同步时自动建表)
self.ensure_table_exists()
# 初始化日志管理器
log_manager = SyncLogManager()
log_entry = log_manager.start_sync(
table_name=self.table_name,
sync_type="incremental",
metadata={"dry_run": dry_run},
)
# 1. 获取最新季度
storage = Storage()
try:
result = storage._connection.execute(
f'SELECT MAX(end_date) FROM "{self.table_name}"'
).fetchone()
latest_quarter = result[0] if result and result[0] else None
if hasattr(latest_quarter, "strftime"):
latest_quarter = latest_quarter.strftime("%Y%m%d")
# 0. 确保表存在(首次同步时自动建表)
self.ensure_table_exists()
# 1. 获取最新季度
storage = Storage()
try:
result = storage._connection.execute(
f'SELECT MAX(end_date) FROM "{self.table_name}"'
).fetchone()
latest_quarter = result[0] if result and result[0] else None
if hasattr(latest_quarter, "strftime"):
latest_quarter = latest_quarter.strftime("%Y%m%d")
except Exception as e:
print(f"[{self.__class__.__name__}] Error getting latest quarter: {e}")
latest_quarter = None
# 2. 获取当前季度
current_quarter = self.get_current_quarter()
if latest_quarter is None:
# 无本地数据,执行全量同步
print(
f"[{self.__class__.__name__}] No local data, performing full sync"
)
results = self.sync_range(
self.DEFAULT_START_DATE, current_quarter, dry_run
)
else:
print(
f"[{self.__class__.__name__}] Latest local quarter: {latest_quarter}"
)
print(f"[{self.__class__.__name__}] Current quarter: {current_quarter}")
# 3. 确定同步范围
start_quarter = latest_quarter
if start_quarter > current_quarter:
start_quarter = current_quarter
else:
start_quarter = self.get_prev_quarter(latest_quarter)
if start_quarter < self.DEFAULT_START_DATE:
start_quarter = self.DEFAULT_START_DATE
# 打印同步的两个季度信息
print(f"\n[{self.__class__.__name__}] 将同步以下两个季度的财报:")
print(f" - 前一季度: {start_quarter}")
print(f" - 当前季度: {current_quarter}")
print(f" (包含前一季度以确保数据完整性)")
print()
results = self.sync_range(start_quarter, current_quarter, dry_run)
# 计算总插入记录数
total_inserted = sum(
r.get("inserted_count", 0) for r in results if isinstance(r, dict)
)
# 完成日志记录
log_manager.complete_sync(
log_entry,
status="success",
records_inserted=total_inserted,
records_updated=0,
records_deleted=0,
)
return results
except Exception as e:
print(f"[{self.__class__.__name__}] Error getting latest quarter: {e}")
latest_quarter = None
# 2. 获取当前季度
current_quarter = self.get_current_quarter()
if latest_quarter is None:
# 无本地数据,执行全量同步
print(f"[{self.__class__.__name__}] No local data, performing full sync")
return self.sync_range(self.DEFAULT_START_DATE, current_quarter, dry_run)
print(f"[{self.__class__.__name__}] Latest local quarter: {latest_quarter}")
print(f"[{self.__class__.__name__}] Current quarter: {current_quarter}")
# 3. 确定同步范围
# 财务数据必须每次都进行对比更新,不存在"跳过"的情况
# 同步范围:从最新季度到当前季度(包含前一季度以确保数据完整性)
start_quarter = latest_quarter
if start_quarter > current_quarter:
# 如果本地数据比当前季度还新,仍然需要同步(可能包含修正数据)
start_quarter = current_quarter
else:
# 正常情况:包含前一季度
start_quarter = self.get_prev_quarter(latest_quarter)
if start_quarter < self.DEFAULT_START_DATE:
start_quarter = self.DEFAULT_START_DATE
# 打印同步的两个季度信息
print(f"\n[{self.__class__.__name__}] 将同步以下两个季度的财报:")
print(f" - 前一季度: {start_quarter}")
print(f" - 当前季度: {current_quarter}")
print(f" (包含前一季度以确保数据完整性)")
print()
return self.sync_range(start_quarter, current_quarter, dry_run)
log_manager.complete_sync(log_entry, status="failed", error_message=str(e))
raise
def sync_full(self, dry_run: bool = False) -> List[Dict]:
"""执行全量同步。
@@ -646,12 +702,38 @@ class QuarterBasedSync(ABC):
print(f"[{self.__class__.__name__}] Full Sync")
print(f"{'=' * 60}")
# 确保表存在
self.ensure_table_exists()
# 初始化日志管理器
log_manager = SyncLogManager()
log_entry = log_manager.start_sync(
table_name=self.table_name, sync_type="full", metadata={"dry_run": dry_run}
)
current_quarter = self.get_current_quarter()
try:
# 确保表存在
self.ensure_table_exists()
return self.sync_range(self.DEFAULT_START_DATE, current_quarter, dry_run)
current_quarter = self.get_current_quarter()
results = self.sync_range(self.DEFAULT_START_DATE, current_quarter, dry_run)
# 计算总插入记录数
total_inserted = sum(
r.get("inserted_count", 0) for r in results if isinstance(r, dict)
)
# 完成日志记录
log_manager.complete_sync(
log_entry,
status="success",
records_inserted=total_inserted,
records_updated=0,
records_deleted=0,
)
return results
except Exception as e:
log_manager.complete_sync(log_entry, status="failed", error_message=str(e))
raise
# ======================================================================
# 预览模式