feat(data): 为数据同步添加事务支持和同步日志
- Storage/ThreadSafeStorage 添加事务支持(begin/commit/rollback)
- 新增 SyncLogManager 记录所有同步任务的执行状态
- 集成事务到 StockBasedSync、DateBasedSync、QuarterBasedSync
- 在 sync_all 和 sync_financial 调度中心添加日志记录
- 新增测试验证事务和日志功能
This commit is contained in:
@@ -25,6 +25,7 @@ from tqdm import tqdm
|
||||
|
||||
from src.data.client import TushareClient
|
||||
from src.data.storage import ThreadSafeStorage, Storage
|
||||
from src.data.sync_logger import SyncLogManager
|
||||
from src.data.utils import get_today_date, get_quarters_in_range, DEFAULT_START_DATE
|
||||
|
||||
|
||||
@@ -466,18 +467,28 @@ class QuarterBasedSync(ABC):
|
||||
inserted_count = 0
|
||||
|
||||
if is_first_sync_for_period:
|
||||
# 首次同步该季度:直接插入所有数据
|
||||
# 首次同步该季度:直接插入所有数据(使用事务)
|
||||
print(
|
||||
f"[{self.__class__.__name__}] First sync for quarter {period}, inserting all data directly"
|
||||
)
|
||||
|
||||
if not dry_run:
|
||||
self.storage.queue_save(self.table_name, remote_df, use_upsert=False)
|
||||
self.storage.flush()
|
||||
inserted_count = len(remote_df)
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Inserted {inserted_count} new records"
|
||||
)
|
||||
try:
|
||||
# 开始事务
|
||||
self.storage.begin_transaction()
|
||||
self.storage.queue_save(
|
||||
self.table_name, remote_df, use_upsert=False
|
||||
)
|
||||
self.storage.flush(use_transaction=False)
|
||||
self.storage.commit_transaction()
|
||||
inserted_count = len(remote_df)
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Inserted {inserted_count} new records (transaction committed)"
|
||||
)
|
||||
except Exception as e:
|
||||
self.storage.rollback_transaction()
|
||||
print(f"[{self.__class__.__name__}] Transaction rolled back: {e}")
|
||||
raise
|
||||
|
||||
return {
|
||||
"period": period,
|
||||
@@ -501,19 +512,33 @@ class QuarterBasedSync(ABC):
|
||||
print(f" - Unchanged stocks: {unchanged_count}")
|
||||
|
||||
if not dry_run and not diff_df.empty:
|
||||
# 5.1 删除差异股票的旧数据
|
||||
deleted_stocks_count = len(diff_stocks)
|
||||
self.delete_stock_quarter_data(period, diff_stocks)
|
||||
deleted_count = len(diff_df)
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Deleted {deleted_stocks_count} stocks' old records (approx {deleted_count} rows)"
|
||||
)
|
||||
try:
|
||||
# 开始事务
|
||||
self.storage.begin_transaction()
|
||||
|
||||
# 5.2 插入新数据(使用普通 INSERT,因为已删除旧数据)
|
||||
self.storage.queue_save(self.table_name, diff_df, use_upsert=False)
|
||||
self.storage.flush()
|
||||
inserted_count = len(diff_df)
|
||||
print(f"[{self.__class__.__name__}] Inserted {inserted_count} new records")
|
||||
# 5.1 删除差异股票的旧数据
|
||||
deleted_stocks_count = len(diff_stocks)
|
||||
self.delete_stock_quarter_data(period, diff_stocks)
|
||||
deleted_count = len(diff_df)
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Deleted {deleted_stocks_count} stocks' old records (approx {deleted_count} rows)"
|
||||
)
|
||||
|
||||
# 5.2 插入新数据(使用普通 INSERT,因为已删除旧数据)
|
||||
self.storage.queue_save(self.table_name, diff_df, use_upsert=False)
|
||||
self.storage.flush(use_transaction=False)
|
||||
inserted_count = len(diff_df)
|
||||
|
||||
# 提交事务
|
||||
self.storage.commit_transaction()
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Inserted {inserted_count} new records (transaction committed)"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.storage.rollback_transaction()
|
||||
print(f"[{self.__class__.__name__}] Transaction rolled back: {e}")
|
||||
raise
|
||||
|
||||
return {
|
||||
"period": period,
|
||||
@@ -583,55 +608,86 @@ class QuarterBasedSync(ABC):
|
||||
print(f"[{self.__class__.__name__}] Incremental Sync")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
# 0. 确保表存在(首次同步时自动建表)
|
||||
self.ensure_table_exists()
|
||||
# 初始化日志管理器
|
||||
log_manager = SyncLogManager()
|
||||
log_entry = log_manager.start_sync(
|
||||
table_name=self.table_name,
|
||||
sync_type="incremental",
|
||||
metadata={"dry_run": dry_run},
|
||||
)
|
||||
|
||||
# 1. 获取最新季度
|
||||
storage = Storage()
|
||||
try:
|
||||
result = storage._connection.execute(
|
||||
f'SELECT MAX(end_date) FROM "{self.table_name}"'
|
||||
).fetchone()
|
||||
latest_quarter = result[0] if result and result[0] else None
|
||||
if hasattr(latest_quarter, "strftime"):
|
||||
latest_quarter = latest_quarter.strftime("%Y%m%d")
|
||||
# 0. 确保表存在(首次同步时自动建表)
|
||||
self.ensure_table_exists()
|
||||
|
||||
# 1. 获取最新季度
|
||||
storage = Storage()
|
||||
try:
|
||||
result = storage._connection.execute(
|
||||
f'SELECT MAX(end_date) FROM "{self.table_name}"'
|
||||
).fetchone()
|
||||
latest_quarter = result[0] if result and result[0] else None
|
||||
if hasattr(latest_quarter, "strftime"):
|
||||
latest_quarter = latest_quarter.strftime("%Y%m%d")
|
||||
except Exception as e:
|
||||
print(f"[{self.__class__.__name__}] Error getting latest quarter: {e}")
|
||||
latest_quarter = None
|
||||
|
||||
# 2. 获取当前季度
|
||||
current_quarter = self.get_current_quarter()
|
||||
|
||||
if latest_quarter is None:
|
||||
# 无本地数据,执行全量同步
|
||||
print(
|
||||
f"[{self.__class__.__name__}] No local data, performing full sync"
|
||||
)
|
||||
results = self.sync_range(
|
||||
self.DEFAULT_START_DATE, current_quarter, dry_run
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Latest local quarter: {latest_quarter}"
|
||||
)
|
||||
print(f"[{self.__class__.__name__}] Current quarter: {current_quarter}")
|
||||
|
||||
# 3. 确定同步范围
|
||||
start_quarter = latest_quarter
|
||||
if start_quarter > current_quarter:
|
||||
start_quarter = current_quarter
|
||||
else:
|
||||
start_quarter = self.get_prev_quarter(latest_quarter)
|
||||
|
||||
if start_quarter < self.DEFAULT_START_DATE:
|
||||
start_quarter = self.DEFAULT_START_DATE
|
||||
|
||||
# 打印同步的两个季度信息
|
||||
print(f"\n[{self.__class__.__name__}] 将同步以下两个季度的财报:")
|
||||
print(f" - 前一季度: {start_quarter}")
|
||||
print(f" - 当前季度: {current_quarter}")
|
||||
print(f" (包含前一季度以确保数据完整性)")
|
||||
print()
|
||||
|
||||
results = self.sync_range(start_quarter, current_quarter, dry_run)
|
||||
|
||||
# 计算总插入记录数
|
||||
total_inserted = sum(
|
||||
r.get("inserted_count", 0) for r in results if isinstance(r, dict)
|
||||
)
|
||||
|
||||
# 完成日志记录
|
||||
log_manager.complete_sync(
|
||||
log_entry,
|
||||
status="success",
|
||||
records_inserted=total_inserted,
|
||||
records_updated=0,
|
||||
records_deleted=0,
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
print(f"[{self.__class__.__name__}] Error getting latest quarter: {e}")
|
||||
latest_quarter = None
|
||||
|
||||
# 2. 获取当前季度
|
||||
current_quarter = self.get_current_quarter()
|
||||
|
||||
if latest_quarter is None:
|
||||
# 无本地数据,执行全量同步
|
||||
print(f"[{self.__class__.__name__}] No local data, performing full sync")
|
||||
return self.sync_range(self.DEFAULT_START_DATE, current_quarter, dry_run)
|
||||
|
||||
print(f"[{self.__class__.__name__}] Latest local quarter: {latest_quarter}")
|
||||
print(f"[{self.__class__.__name__}] Current quarter: {current_quarter}")
|
||||
|
||||
# 3. 确定同步范围
|
||||
# 财务数据必须每次都进行对比更新,不存在"跳过"的情况
|
||||
# 同步范围:从最新季度到当前季度(包含前一季度以确保数据完整性)
|
||||
start_quarter = latest_quarter
|
||||
if start_quarter > current_quarter:
|
||||
# 如果本地数据比当前季度还新,仍然需要同步(可能包含修正数据)
|
||||
start_quarter = current_quarter
|
||||
else:
|
||||
# 正常情况:包含前一季度
|
||||
start_quarter = self.get_prev_quarter(latest_quarter)
|
||||
|
||||
if start_quarter < self.DEFAULT_START_DATE:
|
||||
start_quarter = self.DEFAULT_START_DATE
|
||||
|
||||
# 打印同步的两个季度信息
|
||||
print(f"\n[{self.__class__.__name__}] 将同步以下两个季度的财报:")
|
||||
print(f" - 前一季度: {start_quarter}")
|
||||
print(f" - 当前季度: {current_quarter}")
|
||||
print(f" (包含前一季度以确保数据完整性)")
|
||||
print()
|
||||
|
||||
return self.sync_range(start_quarter, current_quarter, dry_run)
|
||||
log_manager.complete_sync(log_entry, status="failed", error_message=str(e))
|
||||
raise
|
||||
|
||||
def sync_full(self, dry_run: bool = False) -> List[Dict]:
|
||||
"""执行全量同步。
|
||||
@@ -646,12 +702,38 @@ class QuarterBasedSync(ABC):
|
||||
print(f"[{self.__class__.__name__}] Full Sync")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
# 确保表存在
|
||||
self.ensure_table_exists()
|
||||
# 初始化日志管理器
|
||||
log_manager = SyncLogManager()
|
||||
log_entry = log_manager.start_sync(
|
||||
table_name=self.table_name, sync_type="full", metadata={"dry_run": dry_run}
|
||||
)
|
||||
|
||||
current_quarter = self.get_current_quarter()
|
||||
try:
|
||||
# 确保表存在
|
||||
self.ensure_table_exists()
|
||||
|
||||
return self.sync_range(self.DEFAULT_START_DATE, current_quarter, dry_run)
|
||||
current_quarter = self.get_current_quarter()
|
||||
results = self.sync_range(self.DEFAULT_START_DATE, current_quarter, dry_run)
|
||||
|
||||
# 计算总插入记录数
|
||||
total_inserted = sum(
|
||||
r.get("inserted_count", 0) for r in results if isinstance(r, dict)
|
||||
)
|
||||
|
||||
# 完成日志记录
|
||||
log_manager.complete_sync(
|
||||
log_entry,
|
||||
status="success",
|
||||
records_inserted=total_inserted,
|
||||
records_updated=0,
|
||||
records_deleted=0,
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
log_manager.complete_sync(log_entry, status="failed", error_message=str(e))
|
||||
raise
|
||||
|
||||
# ======================================================================
|
||||
# 预览模式
|
||||
|
||||
Reference in New Issue
Block a user