refactor(financial-sync): 重构财务数据同步架构
- 新增 base_financial_sync.py 基础同步抽象类 - 重构 api_financial_sync.py 简化调度逻辑 - 重命名 IncomeSync 为 IncomeQuarterSync 继承新基础类 - 增强 storage.py 支持 use_upsert 参数 - 更新 __init__.py 导出符号
This commit is contained in:
@@ -47,7 +47,8 @@ from src.data.api_wrappers.api_pro_bar import (
|
||||
from src.data.api_wrappers.financial_data.api_income import (
|
||||
get_income,
|
||||
sync_income,
|
||||
IncomeSync,
|
||||
preview_income_sync,
|
||||
IncomeQuarterSync,
|
||||
)
|
||||
from src.data.api_wrappers.api_bak_basic import get_bak_basic, sync_bak_basic
|
||||
from src.data.api_wrappers.api_namechange import get_namechange, sync_namechange
|
||||
@@ -84,7 +85,8 @@ __all__ = [
|
||||
# Income statement
|
||||
"get_income",
|
||||
"sync_income",
|
||||
"IncomeSync",
|
||||
"preview_income_sync",
|
||||
"IncomeQuarterSync",
|
||||
# Historical stock list
|
||||
"get_bak_basic",
|
||||
"sync_bak_basic",
|
||||
|
||||
756
src/data/api_wrappers/base_financial_sync.py
Normal file
756
src/data/api_wrappers/base_financial_sync.py
Normal file
@@ -0,0 +1,756 @@
|
||||
"""财务数据同步基础抽象模块。
|
||||
|
||||
提供专门用于按季度同步财务数据的基类 QuarterBasedSync。
|
||||
财务数据特点:
|
||||
- 按季度发布(period: 20231231, 20230930, 20230630, 20230331)
|
||||
- 使用 VIP 接口一次性获取某季度的全部上市公司数据
|
||||
- 数据可能会修正,增量同步需获取当前季度+前一季度
|
||||
- 主键为 (ts_code, end_date)
|
||||
|
||||
使用方式:
|
||||
class IncomeQuarterSync(QuarterBasedSync):
|
||||
table_name = "financial_income"
|
||||
api_name = "income_vip"
|
||||
|
||||
def fetch_single_quarter(self, period: str) -> pd.DataFrame:
|
||||
# 实现单季度数据获取
|
||||
...
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional, Dict, List, Tuple, Set
|
||||
from datetime import datetime
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
from src.data.client import TushareClient
|
||||
from src.data.storage import ThreadSafeStorage, Storage
|
||||
from src.data.utils import get_today_date, get_quarters_in_range, DEFAULT_START_DATE
|
||||
|
||||
|
||||
class QuarterBasedSync(ABC):
|
||||
"""财务数据季度同步抽象基类。
|
||||
|
||||
专门处理按季度同步的财务数据(利润表、资产负债表、现金流量表)。
|
||||
财务数据同步特点:
|
||||
1. 按季度获取:使用 VIP 接口一次性获取某季度全部上市公司数据
|
||||
2. 数据可修正:同一季度数据可能被更新,增量同步需获取当前季度+前一季度
|
||||
3. 差异检测:需对比本地与远程数据量,识别缺失或变更的记录
|
||||
4. 主键:(ts_code, end_date)
|
||||
|
||||
子类必须实现:
|
||||
- table_name: 类属性,目标表名
|
||||
- api_name: 类属性,Tushare API 接口名
|
||||
- fetch_single_quarter(period) -> pd.DataFrame: 获取单季度数据
|
||||
- TABLE_SCHEMA: 类属性,表结构定义
|
||||
|
||||
Attributes:
|
||||
table_name: 目标表名(子类必须覆盖)
|
||||
api_name: Tushare API 接口名(子类必须覆盖)
|
||||
DEFAULT_START_DATE: 默认起始日期(2018Q1)
|
||||
TABLE_SCHEMA: 表结构定义 {列名: SQL类型}
|
||||
TABLE_INDEXES: 索引定义 [(索引名, [列名列表]), ...]
|
||||
注意:不要创建唯一索引,因为财务数据可能发生多次修正
|
||||
"""
|
||||
|
||||
table_name: str = "" # 子类必须覆盖
|
||||
api_name: str = "" # 子类必须覆盖
|
||||
DEFAULT_START_DATE = "20180331" # 2018年Q1
|
||||
|
||||
# 目标报表类型(子类可覆盖)
|
||||
# 默认只同步合并报表(report_type='1')
|
||||
# 设为 None 则同步所有报表类型
|
||||
TARGET_REPORT_TYPE: Optional[str] = "1"
|
||||
|
||||
# 表结构定义(子类必须覆盖)
|
||||
TABLE_SCHEMA: Dict[str, str] = {}
|
||||
|
||||
# 索引定义(子类可覆盖)
|
||||
# 格式: [("index_name", ["col1", "col2"]), ...]
|
||||
# 注意:不要创建唯一索引,因为财务数据可能发生多次修正
|
||||
TABLE_INDEXES: List[Tuple[str, List[str]]] = []
|
||||
|
||||
def __init__(self):
|
||||
"""初始化季度同步管理器。"""
|
||||
self.storage = ThreadSafeStorage()
|
||||
self.client = TushareClient()
|
||||
self._cached_data: Optional[pd.DataFrame] = None
|
||||
|
||||
# ======================================================================
|
||||
# 抽象方法 - 子类必须实现
|
||||
# ======================================================================
|
||||
|
||||
@abstractmethod
|
||||
def fetch_single_quarter(self, period: str) -> pd.DataFrame:
|
||||
"""获取单季度的全部上市公司数据。
|
||||
|
||||
Args:
|
||||
period: 报告期,季度最后一天日期(如 '20231231')
|
||||
|
||||
Returns:
|
||||
包含该季度全部上市公司财务数据的 DataFrame
|
||||
"""
|
||||
pass
|
||||
|
||||
# ======================================================================
|
||||
# 季度计算工具方法
|
||||
# ======================================================================
|
||||
|
||||
def get_current_quarter(self) -> str:
|
||||
"""获取当前季度(考虑是否到季末)。
|
||||
|
||||
如果当前日期未到季度最后一天,则返回前一季度。
|
||||
这样可以避免获取尚无数据的未来季度。
|
||||
|
||||
Returns:
|
||||
当前季度字符串 (YYYYMMDD),如 '20231231'
|
||||
"""
|
||||
today = get_today_date()
|
||||
year = int(today[:4])
|
||||
month = int(today[4:6])
|
||||
|
||||
# 确定当前季度
|
||||
if month <= 3:
|
||||
current_q = f"{year}0331"
|
||||
elif month <= 6:
|
||||
current_q = f"{year}0630"
|
||||
elif month <= 9:
|
||||
current_q = f"{year}0930"
|
||||
else:
|
||||
current_q = f"{year}1231"
|
||||
|
||||
# 如果今天还没到季末,返回前一季度
|
||||
if today < current_q:
|
||||
return self.get_prev_quarter(current_q)
|
||||
|
||||
return current_q
|
||||
|
||||
def get_prev_quarter(self, quarter: str) -> str:
|
||||
"""获取前一季度。
|
||||
|
||||
Args:
|
||||
quarter: 季度字符串 (YYYYMMDD),如 '20231231'
|
||||
|
||||
Returns:
|
||||
前一季度字符串 (YYYYMMDD)
|
||||
"""
|
||||
year = int(quarter[:4])
|
||||
month_day = quarter[4:]
|
||||
|
||||
if month_day == "0331":
|
||||
return f"{year - 1}1231"
|
||||
elif month_day == "0630":
|
||||
return f"{year}0331"
|
||||
elif month_day == "0930":
|
||||
return f"{year}0630"
|
||||
else: # "1231"
|
||||
return f"{year}0930"
|
||||
|
||||
def get_next_quarter(self, quarter: str) -> str:
|
||||
"""获取下一季度。
|
||||
|
||||
Args:
|
||||
quarter: 季度字符串 (YYYYMMDD)
|
||||
|
||||
Returns:
|
||||
下一季度字符串 (YYYYMMDD)
|
||||
"""
|
||||
year = int(quarter[:4])
|
||||
month_day = quarter[4:]
|
||||
|
||||
if month_day == "0331":
|
||||
return f"{year}0630"
|
||||
elif month_day == "0630":
|
||||
return f"{year}0930"
|
||||
elif month_day == "0930":
|
||||
return f"{year}1231"
|
||||
else: # "1231"
|
||||
return f"{year + 1}0331"
|
||||
|
||||
# ======================================================================
|
||||
# 表结构管理
|
||||
# ======================================================================
|
||||
|
||||
def ensure_table_exists(self) -> None:
|
||||
"""确保表结构存在,如不存在则创建表和索引。
|
||||
|
||||
注意:不设置主键和唯一索引,因为财务数据可能发生多次修正,
|
||||
同一支股票在同一季度可能有多个版本(不同的ann_date)。
|
||||
DuckDB 会自动创建 rowid 作为主键。
|
||||
"""
|
||||
storage = Storage()
|
||||
|
||||
if storage.exists(self.table_name):
|
||||
return
|
||||
|
||||
if not self.TABLE_SCHEMA:
|
||||
print(
|
||||
f"[{self.__class__.__name__}] TABLE_SCHEMA not defined, skipping table creation"
|
||||
)
|
||||
return
|
||||
|
||||
# 构建列定义(不设置主键)
|
||||
columns_def = []
|
||||
for col_name, col_type in self.TABLE_SCHEMA.items():
|
||||
columns_def.append(f'"{col_name}" {col_type}')
|
||||
|
||||
columns_sql = ", ".join(columns_def)
|
||||
create_sql = f'CREATE TABLE IF NOT EXISTS "{self.table_name}" ({columns_sql})'
|
||||
|
||||
try:
|
||||
storage._connection.execute(create_sql)
|
||||
print(f"[{self.__class__.__name__}] Created table '{self.table_name}'")
|
||||
except Exception as e:
|
||||
print(f"[{self.__class__.__name__}] Error creating table: {e}")
|
||||
raise
|
||||
|
||||
# 创建普通索引(不要创建唯一索引)
|
||||
for idx_name, idx_cols in self.TABLE_INDEXES:
|
||||
try:
|
||||
idx_cols_sql = ", ".join(f'"{col}"' for col in idx_cols)
|
||||
storage._connection.execute(
|
||||
f'CREATE INDEX IF NOT EXISTS "{idx_name}" ON "{self.table_name}"({idx_cols_sql})'
|
||||
)
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Created index '{idx_name}' on {idx_cols}"
|
||||
)
|
||||
except Exception as e:
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Error creating index {idx_name}: {e}"
|
||||
)
|
||||
|
||||
# ======================================================================
|
||||
# 数据差异检测(核心逻辑)
|
||||
# ======================================================================
|
||||
|
||||
def get_local_data_count_by_stock(self, period: str) -> Dict[str, int]:
|
||||
"""获取本地数据库中某季度的各股票数据量。
|
||||
|
||||
Args:
|
||||
period: 季度字符串 (YYYYMMDD)
|
||||
|
||||
Returns:
|
||||
字典 {ts_code: 记录数}
|
||||
"""
|
||||
storage = Storage()
|
||||
|
||||
try:
|
||||
# 将 YYYYMMDD 转换为 YYYY-MM-DD 格式
|
||||
period_formatted = f"{period[:4]}-{period[4:6]}-{period[6:]}"
|
||||
query = f'''
|
||||
SELECT ts_code, COUNT(*) as cnt
|
||||
FROM "{self.table_name}"
|
||||
WHERE end_date = ?
|
||||
GROUP BY ts_code
|
||||
'''
|
||||
result = storage._connection.execute(query, [period_formatted]).fetchdf()
|
||||
|
||||
if result.empty:
|
||||
return {}
|
||||
|
||||
return dict(zip(result["ts_code"], result["cnt"]))
|
||||
except Exception as e:
|
||||
print(f"[{self.__class__.__name__}] Error querying local data count: {e}")
|
||||
return {}
|
||||
|
||||
def get_local_records_by_key(self, period: str) -> Dict[tuple, int]:
|
||||
"""获取本地数据库中某季度的记录(按主键分组计数)。
|
||||
|
||||
用于更精确的差异检测,按 (ts_code, end_date, report_type) 分组。
|
||||
|
||||
Args:
|
||||
period: 季度字符串 (YYYYMMDD)
|
||||
|
||||
Returns:
|
||||
字典 {(ts_code, end_date, report_type): 记录数}
|
||||
"""
|
||||
storage = Storage()
|
||||
|
||||
try:
|
||||
# 将 YYYYMMDD 转换为 YYYY-MM-DD 格式
|
||||
period_formatted = f"{period[:4]}-{period[4:6]}-{period[6:]}"
|
||||
query = f'''
|
||||
SELECT ts_code, end_date, report_type, COUNT(*) as cnt
|
||||
FROM "{self.table_name}"
|
||||
WHERE end_date = ?
|
||||
GROUP BY ts_code, end_date, report_type
|
||||
'''
|
||||
result = storage._connection.execute(query, [period_formatted]).fetchdf()
|
||||
|
||||
if result.empty:
|
||||
return {}
|
||||
|
||||
return {
|
||||
(row["ts_code"], row["end_date"], row["report_type"]): row["cnt"]
|
||||
for _, row in result.iterrows()
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"[{self.__class__.__name__}] Error querying local records: {e}")
|
||||
return {}
|
||||
|
||||
def compare_and_find_differences(
|
||||
self, remote_df: pd.DataFrame, period: str
|
||||
) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
||||
"""对比远程数据与本地数据,找出差异。
|
||||
|
||||
逻辑:
|
||||
1. 统计远程数据中每只股票的数据量
|
||||
2. 查询本地数据库中该季度每只股票的数据量
|
||||
3. 对比找出:
|
||||
- 本地缺失的股票(新增)
|
||||
- 数据量不一致的股票(有更新,可能包含财务修正)
|
||||
4. 返回需要插入的差异数据
|
||||
|
||||
注意:主键为 (ts_code, end_date, report_type),因此同一支股票在同一季度
|
||||
可能有多个 report_type 的记录。差异检测按股票级别进行,如果该股票的
|
||||
记录总数不一致,则更新该股票的所有记录。
|
||||
|
||||
Args:
|
||||
remote_df: 从 API 获取的远程数据
|
||||
period: 季度字符串
|
||||
|
||||
Returns:
|
||||
(差异数据DataFrame, 统计信息DataFrame)
|
||||
统计信息包含:ts_code, remote_count, local_count, status
|
||||
"""
|
||||
if remote_df.empty:
|
||||
return pd.DataFrame(), pd.DataFrame()
|
||||
|
||||
# 1. 统计远程数据中每只股票的数据量
|
||||
remote_counts = remote_df.groupby("ts_code").size().to_dict()
|
||||
|
||||
# 2. 获取本地数据量(按股票汇总)
|
||||
local_counts = self.get_local_data_count_by_stock(period)
|
||||
|
||||
# 3. 对比找出差异
|
||||
diff_stocks = [] # 需要更新的股票列表
|
||||
stats = []
|
||||
|
||||
for ts_code, remote_count in remote_counts.items():
|
||||
local_count = local_counts.get(ts_code, 0)
|
||||
|
||||
if local_count == 0:
|
||||
status = "new" # 本地不存在,全部插入
|
||||
diff_stocks.append(ts_code)
|
||||
elif local_count != remote_count:
|
||||
status = "modified" # 数据量不一致,可能包含财务修正
|
||||
diff_stocks.append(ts_code)
|
||||
else:
|
||||
status = "same" # 数据量一致,跳过
|
||||
|
||||
stats.append(
|
||||
{
|
||||
"ts_code": ts_code,
|
||||
"remote_count": remote_count,
|
||||
"local_count": local_count,
|
||||
"status": status,
|
||||
}
|
||||
)
|
||||
|
||||
# 4. 提取差异数据
|
||||
if diff_stocks:
|
||||
diff_df = remote_df[remote_df["ts_code"].isin(diff_stocks)].copy()
|
||||
else:
|
||||
diff_df = pd.DataFrame()
|
||||
|
||||
stats_df = pd.DataFrame(stats)
|
||||
|
||||
return diff_df, stats_df
|
||||
|
||||
# ======================================================================
|
||||
# 同步核心逻辑
|
||||
# ======================================================================
|
||||
|
||||
def delete_stock_quarter_data(
|
||||
self, period: str, ts_codes: Optional[List[str]] = None
|
||||
) -> int:
|
||||
"""删除指定季度和股票的数据。
|
||||
|
||||
在同步前删除旧数据,然后插入新数据(先删除后插入策略)。
|
||||
|
||||
Args:
|
||||
period: 季度字符串 (YYYYMMDD)
|
||||
ts_codes: 股票代码列表,None 表示删除该季度所有数据
|
||||
|
||||
Returns:
|
||||
删除的记录数
|
||||
"""
|
||||
storage = Storage()
|
||||
|
||||
try:
|
||||
# 将 YYYYMMDD 转换为 YYYY-MM-DD 格式
|
||||
period_formatted = f"{period[:4]}-{period[4:6]}-{period[6:]}"
|
||||
if ts_codes:
|
||||
# 删除指定股票的数据
|
||||
placeholders = ", ".join(["?" for _ in ts_codes])
|
||||
query = f'''
|
||||
DELETE FROM "{self.table_name}"
|
||||
WHERE end_date = ? AND ts_code IN ({placeholders})
|
||||
'''
|
||||
result = storage._connection.execute(
|
||||
query, [period_formatted] + ts_codes
|
||||
)
|
||||
else:
|
||||
# 删除整个季度的数据
|
||||
query = f'DELETE FROM "{self.table_name}" WHERE end_date = ?'
|
||||
result = storage._connection.execute(query, [period_formatted])
|
||||
|
||||
# DuckDB 的 rowcount 可能返回 -1,我们手动查询删除后的数量变化
|
||||
# 由于我们已经删除了特定条件的数据,直接返回传入的股票数量作为估算
|
||||
if ts_codes:
|
||||
deleted_count = len(ts_codes)
|
||||
else:
|
||||
# 删除整个季度,查询删除前的数量
|
||||
deleted_count = -1 # 标记为未知,稍后处理
|
||||
return deleted_count
|
||||
except Exception as e:
|
||||
print(f"[{self.__class__.__name__}] Error deleting data: {e}")
|
||||
return 0
|
||||
|
||||
def sync_quarter(self, period: str, dry_run: bool = False) -> Dict:
|
||||
"""同步单个季度的数据。
|
||||
|
||||
流程:
|
||||
1. 获取远程数据
|
||||
2. 根据 TARGET_REPORT_TYPE 过滤报表类型
|
||||
3. 对比本地数据,找出差异股票
|
||||
4. 删除差异股票的旧数据
|
||||
5. 插入新数据(先删除后插入)
|
||||
|
||||
注意:财务数据同步必须取当前季度和前一季度进行对比更新,
|
||||
不存在"不需要同步"的情况。
|
||||
|
||||
Args:
|
||||
period: 季度字符串 (YYYYMMDD)
|
||||
dry_run: 是否为预览模式
|
||||
|
||||
Returns:
|
||||
同步结果字典 {
|
||||
'period': 季度,
|
||||
'remote_total': 远程总记录数,
|
||||
'diff_count': 差异记录数,
|
||||
'deleted_count': 删除的记录数,
|
||||
'inserted_count': 插入的记录数,
|
||||
'dry_run': 是否预览模式
|
||||
}
|
||||
"""
|
||||
print(f"[{self.__class__.__name__}] Syncing quarter {period}...")
|
||||
|
||||
# 1. 获取远程数据
|
||||
remote_df = self.fetch_single_quarter(period)
|
||||
|
||||
if remote_df.empty:
|
||||
print(f"[{self.__class__.__name__}] No data for quarter {period}")
|
||||
return {
|
||||
"period": period,
|
||||
"remote_total": 0,
|
||||
"diff_count": 0,
|
||||
"deleted_count": 0,
|
||||
"inserted_count": 0,
|
||||
"dry_run": dry_run,
|
||||
}
|
||||
|
||||
# 2. 根据 TARGET_REPORT_TYPE 过滤报表类型
|
||||
if self.TARGET_REPORT_TYPE and "report_type" in remote_df.columns:
|
||||
remote_df = remote_df[remote_df["report_type"] == self.TARGET_REPORT_TYPE]
|
||||
|
||||
remote_total = len(remote_df)
|
||||
print(f"[{self.__class__.__name__}] Fetched {remote_total} records from API")
|
||||
|
||||
# 3. 检查本地是否有该季度数据
|
||||
local_counts = self.get_local_data_count_by_stock(period)
|
||||
is_first_sync_for_period = len(local_counts) == 0
|
||||
|
||||
# 4. 执行同步
|
||||
deleted_count = 0
|
||||
inserted_count = 0
|
||||
|
||||
if is_first_sync_for_period:
|
||||
# 首次同步该季度:直接插入所有数据
|
||||
print(
|
||||
f"[{self.__class__.__name__}] First sync for quarter {period}, inserting all data directly"
|
||||
)
|
||||
|
||||
if not dry_run:
|
||||
self.storage.queue_save(self.table_name, remote_df, use_upsert=False)
|
||||
self.storage.flush()
|
||||
inserted_count = len(remote_df)
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Inserted {inserted_count} new records"
|
||||
)
|
||||
|
||||
return {
|
||||
"period": period,
|
||||
"remote_total": remote_total,
|
||||
"diff_count": len(remote_df),
|
||||
"deleted_count": 0,
|
||||
"inserted_count": inserted_count,
|
||||
"dry_run": dry_run,
|
||||
}
|
||||
|
||||
# 5. 非首次同步:对比找出差异股票
|
||||
diff_df, stats_df = self.compare_and_find_differences(remote_df, period)
|
||||
|
||||
diff_stocks = list(diff_df["ts_code"].unique()) if not diff_df.empty else []
|
||||
unchanged_count = (
|
||||
len(stats_df[stats_df["status"] == "same"]) if not stats_df.empty else 0
|
||||
)
|
||||
|
||||
print(f"[{self.__class__.__name__}] Comparison result:")
|
||||
print(f" - Stocks with differences: {len(diff_stocks)}")
|
||||
print(f" - Unchanged stocks: {unchanged_count}")
|
||||
|
||||
if not dry_run and not diff_df.empty:
|
||||
# 5.1 删除差异股票的旧数据
|
||||
deleted_stocks_count = len(diff_stocks)
|
||||
self.delete_stock_quarter_data(period, diff_stocks)
|
||||
deleted_count = len(diff_df)
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Deleted {deleted_stocks_count} stocks' old records (approx {deleted_count} rows)"
|
||||
)
|
||||
|
||||
# 5.2 插入新数据(使用普通 INSERT,因为已删除旧数据)
|
||||
self.storage.queue_save(self.table_name, diff_df, use_upsert=False)
|
||||
self.storage.flush()
|
||||
inserted_count = len(diff_df)
|
||||
print(f"[{self.__class__.__name__}] Inserted {inserted_count} new records")
|
||||
|
||||
return {
|
||||
"period": period,
|
||||
"remote_total": remote_total,
|
||||
"diff_count": len(diff_df),
|
||||
"deleted_count": deleted_count,
|
||||
"inserted_count": inserted_count,
|
||||
"dry_run": dry_run,
|
||||
}
|
||||
|
||||
def sync_range(
|
||||
self, start_quarter: str, end_quarter: str, dry_run: bool = False
|
||||
) -> List[Dict]:
|
||||
"""同步指定季度范围的数据。
|
||||
|
||||
注意:增量同步会自动包含前一季度以确保数据完整性。
|
||||
|
||||
Args:
|
||||
start_quarter: 起始季度 (YYYYMMDD)
|
||||
end_quarter: 结束季度 (YYYYMMDD)
|
||||
dry_run: 是否为预览模式
|
||||
|
||||
Returns:
|
||||
各季度同步结果列表
|
||||
"""
|
||||
quarters = get_quarters_in_range(start_quarter, end_quarter)
|
||||
|
||||
if not quarters:
|
||||
print(f"[{self.__class__.__name__}] No quarters to sync")
|
||||
return []
|
||||
|
||||
print(
|
||||
f"[{self.__class__.__name__}] Syncing {len(quarters)} quarters: {quarters}"
|
||||
)
|
||||
|
||||
results = []
|
||||
for period in tqdm(quarters, desc=f"Syncing {self.table_name}"):
|
||||
try:
|
||||
result = self.sync_quarter(period, dry_run=dry_run)
|
||||
results.append(result)
|
||||
except Exception as e:
|
||||
print(f"[{self.__class__.__name__}] Error syncing {period}: {e}")
|
||||
results.append({"period": period, "error": str(e)})
|
||||
|
||||
return results
|
||||
|
||||
def sync_incremental(self, dry_run: bool = False) -> List[Dict]:
|
||||
"""执行增量同步。
|
||||
|
||||
策略:
|
||||
1. 确保表存在(首次同步时自动建表)
|
||||
2. 获取表中最新季度
|
||||
3. 计算当前季度(考虑是否到季末)
|
||||
4. 确定同步范围:从最新季度到当前季度
|
||||
5. **重要**:额外包含前一季度以确保数据完整性
|
||||
|
||||
注意:财务数据同步与日线数据不同,必须每次都获取数据进行对比
|
||||
更新,不存在"不需要同步"的情况。因为财务数据可能会被修正。
|
||||
|
||||
Args:
|
||||
dry_run: 是否为预览模式
|
||||
|
||||
Returns:
|
||||
各季度同步结果列表
|
||||
"""
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"[{self.__class__.__name__}] Incremental Sync")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
# 0. 确保表存在(首次同步时自动建表)
|
||||
self.ensure_table_exists()
|
||||
|
||||
# 1. 获取最新季度
|
||||
storage = Storage()
|
||||
try:
|
||||
result = storage._connection.execute(
|
||||
f'SELECT MAX(end_date) FROM "{self.table_name}"'
|
||||
).fetchone()
|
||||
latest_quarter = result[0] if result and result[0] else None
|
||||
if hasattr(latest_quarter, "strftime"):
|
||||
latest_quarter = latest_quarter.strftime("%Y%m%d")
|
||||
except Exception as e:
|
||||
print(f"[{self.__class__.__name__}] Error getting latest quarter: {e}")
|
||||
latest_quarter = None
|
||||
|
||||
# 2. 获取当前季度
|
||||
current_quarter = self.get_current_quarter()
|
||||
|
||||
if latest_quarter is None:
|
||||
# 无本地数据,执行全量同步
|
||||
print(f"[{self.__class__.__name__}] No local data, performing full sync")
|
||||
return self.sync_range(self.DEFAULT_START_DATE, current_quarter, dry_run)
|
||||
|
||||
print(f"[{self.__class__.__name__}] Latest local quarter: {latest_quarter}")
|
||||
print(f"[{self.__class__.__name__}] Current quarter: {current_quarter}")
|
||||
|
||||
# 3. 确定同步范围
|
||||
# 财务数据必须每次都进行对比更新,不存在"跳过"的情况
|
||||
# 同步范围:从最新季度到当前季度(包含前一季度以确保数据完整性)
|
||||
start_quarter = latest_quarter
|
||||
if start_quarter > current_quarter:
|
||||
# 如果本地数据比当前季度还新,仍然需要同步(可能包含修正数据)
|
||||
start_quarter = current_quarter
|
||||
else:
|
||||
# 正常情况:包含前一季度
|
||||
start_quarter = self.get_prev_quarter(latest_quarter)
|
||||
|
||||
if start_quarter < self.DEFAULT_START_DATE:
|
||||
start_quarter = self.DEFAULT_START_DATE
|
||||
|
||||
# 打印同步的两个季度信息
|
||||
print(f"\n[{self.__class__.__name__}] 将同步以下两个季度的财报:")
|
||||
print(f" - 前一季度: {start_quarter}")
|
||||
print(f" - 当前季度: {current_quarter}")
|
||||
print(f" (包含前一季度以确保数据完整性)")
|
||||
print()
|
||||
|
||||
return self.sync_range(start_quarter, current_quarter, dry_run)
|
||||
|
||||
def sync_full(self, dry_run: bool = False) -> List[Dict]:
|
||||
"""执行全量同步。
|
||||
|
||||
Args:
|
||||
dry_run: 是否为预览模式
|
||||
|
||||
Returns:
|
||||
各季度同步结果列表
|
||||
"""
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"[{self.__class__.__name__}] Full Sync")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
# 确保表存在
|
||||
self.ensure_table_exists()
|
||||
|
||||
current_quarter = self.get_current_quarter()
|
||||
|
||||
return self.sync_range(self.DEFAULT_START_DATE, current_quarter, dry_run)
|
||||
|
||||
# ======================================================================
|
||||
# 预览模式
|
||||
# ======================================================================
|
||||
|
||||
def preview_sync(self) -> Dict:
|
||||
"""预览同步信息(不实际同步)。
|
||||
|
||||
注意:财务数据同步必须每次都进行,因为数据可能会被修正。
|
||||
预览显示将要同步的季度范围。
|
||||
|
||||
Returns:
|
||||
预览信息字典
|
||||
"""
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"[{self.__class__.__name__}] Preview Mode")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
# 获取最新季度
|
||||
storage = Storage()
|
||||
try:
|
||||
result = storage._connection.execute(
|
||||
f'SELECT MAX(end_date) FROM "{self.table_name}"'
|
||||
).fetchone()
|
||||
latest_quarter = result[0] if result and result[0] else None
|
||||
if hasattr(latest_quarter, "strftime"):
|
||||
latest_quarter = latest_quarter.strftime("%Y%m%d")
|
||||
except Exception:
|
||||
latest_quarter = None
|
||||
|
||||
current_quarter = self.get_current_quarter()
|
||||
|
||||
if latest_quarter is None:
|
||||
# 无本地数据,需要全量同步
|
||||
start_quarter = self.DEFAULT_START_DATE
|
||||
message = "No local data, full sync required"
|
||||
else:
|
||||
# 财务数据必须每次都进行对比更新
|
||||
# 同步范围:从最新季度到当前季度(包含前一季度)
|
||||
start_quarter = self.get_prev_quarter(latest_quarter)
|
||||
if start_quarter < self.DEFAULT_START_DATE:
|
||||
start_quarter = self.DEFAULT_START_DATE
|
||||
message = f"Incremental sync from {start_quarter} to {current_quarter}"
|
||||
|
||||
preview = {
|
||||
"table_name": self.table_name,
|
||||
"api_name": self.api_name,
|
||||
"latest_quarter": latest_quarter,
|
||||
"current_quarter": current_quarter,
|
||||
"start_quarter": start_quarter,
|
||||
"end_quarter": current_quarter,
|
||||
"message": message,
|
||||
}
|
||||
|
||||
print(f"Table: {self.table_name}")
|
||||
print(f"API: {self.api_name}")
|
||||
print(f"Latest local: {latest_quarter}")
|
||||
print(f"Current quarter: {current_quarter}")
|
||||
print(f"Sync range: {start_quarter} -> {current_quarter}")
|
||||
print(f"Message: {message}")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
return preview
|
||||
|
||||
|
||||
# ======================================================================
|
||||
# 便捷函数
|
||||
# ======================================================================
|
||||
|
||||
|
||||
def sync_financial_data(
|
||||
syncer_class: type, force_full: bool = False, dry_run: bool = False
|
||||
) -> List[Dict]:
|
||||
"""通用的财务数据同步便捷函数。
|
||||
|
||||
Args:
|
||||
syncer_class: QuarterBasedSync 的子类
|
||||
force_full: 是否强制全量同步
|
||||
dry_run: 是否为预览模式
|
||||
|
||||
Returns:
|
||||
同步结果列表
|
||||
"""
|
||||
syncer = syncer_class()
|
||||
|
||||
if force_full:
|
||||
return syncer.sync_full(dry_run)
|
||||
else:
|
||||
return syncer.sync_incremental(dry_run)
|
||||
|
||||
|
||||
def preview_financial_sync(syncer_class: type) -> Dict:
|
||||
"""预览财务数据同步信息。
|
||||
|
||||
Args:
|
||||
syncer_class: QuarterBasedSync 的子类
|
||||
|
||||
Returns:
|
||||
预览信息字典
|
||||
"""
|
||||
syncer = syncer_class()
|
||||
return syncer.preview_sync()
|
||||
@@ -1,696 +1,234 @@
|
||||
"""财务数据统一同步调度中心。
|
||||
|
||||
该模块作为财务数据同步的调度中心,统一管理各类型财务数据的同步流程。
|
||||
支持全量同步和增量同步两种模式。
|
||||
该模块作为财务数据同步的调度中心,只负责任务协调和调度。
|
||||
具体的同步逻辑已下沉到各 API 文件中。
|
||||
|
||||
财务数据类型:
|
||||
支持的财务数据类型:
|
||||
- income: 利润表 (已实现)
|
||||
- balance: 资产负债表 (预留)
|
||||
- cashflow: 现金流量表 (预留)
|
||||
|
||||
同步模式:
|
||||
1. 全量同步 (force_full=True):
|
||||
- 检查表是否存在,如不存在则建表+建索引
|
||||
- 从默认开始日期 (20180101) 同步到当前季度
|
||||
|
||||
2. 增量同步 (force_full=False):
|
||||
- 获取表中最新季度 (MAX(end_date))
|
||||
- 计算当前季度(如果当前日期未到季末,则用前一季度)
|
||||
- 如果最新季度 == 当前季度,不同步(避免消耗流量)
|
||||
- 否则从最新季度+1 同步到当前季度
|
||||
|
||||
使用方式:
|
||||
# 增量同步利润表数据(推荐)
|
||||
使用方式:
|
||||
# 同步所有财务数据(增量)
|
||||
from src.data.api_wrappers.financial_data.api_financial_sync import sync_financial
|
||||
sync_financial()
|
||||
|
||||
# 全量同步利润表数据
|
||||
from src.data.api_wrappers.financial_data.api_financial_sync import sync_financial
|
||||
# 全量同步
|
||||
sync_financial(force_full=True)
|
||||
|
||||
# 只同步利润表
|
||||
sync_financial(data_types=["income"])
|
||||
|
||||
# 预览同步
|
||||
from src.data.api_wrappers.financial_data.api_financial_sync import preview_sync
|
||||
preview = preview_sync()
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict, List
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from src.data.storage import Storage, ThreadSafeStorage
|
||||
from src.data.utils import (
|
||||
get_today_date,
|
||||
get_quarters_in_range,
|
||||
date_to_quarter,
|
||||
DEFAULT_START_DATE,
|
||||
from src.data.api_wrappers.financial_data.api_income import (
|
||||
IncomeQuarterSync,
|
||||
sync_income,
|
||||
preview_income_sync,
|
||||
)
|
||||
from src.data.api_wrappers.financial_data.api_income import get_income
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 财务数据表结构定义
|
||||
# =============================================================================
|
||||
|
||||
# 各财务数据表的表名和字段定义
|
||||
FINANCIAL_TABLES = {
|
||||
# 支持的财务数据类型映射
|
||||
FINANCIAL_SYNCERS = {
|
||||
"income": {
|
||||
"table_name": "financial_income",
|
||||
"api_name": "income_vip",
|
||||
"period_field": "end_date", # 用于存储最新季度的字段
|
||||
"get_data_func": get_income,
|
||||
"syncer_class": IncomeQuarterSync,
|
||||
"sync_func": sync_income,
|
||||
"preview_func": preview_income_sync,
|
||||
"display_name": "利润表",
|
||||
},
|
||||
# 预留:资产负债表
|
||||
# "balance": {
|
||||
# "table_name": "financial_balance",
|
||||
# "api_name": "balance Sheet_vip",
|
||||
# "period_field": "end_date",
|
||||
# "get_data_func": get_balance,
|
||||
# "syncer_class": BalanceQuarterSync,
|
||||
# "sync_func": sync_balance,
|
||||
# "preview_func": preview_balance_sync,
|
||||
# "display_name": "资产负债表",
|
||||
# },
|
||||
# 预留:现金流量表
|
||||
# "cashflow": {
|
||||
# "table_name": "financial_cashflow",
|
||||
# "api_name": "cashflow_vip",
|
||||
# "period_field": "end_date",
|
||||
# "get_data_func": get_cashflow,
|
||||
# "syncer_class": CashflowQuarterSync,
|
||||
# "sync_func": sync_cashflow,
|
||||
# "preview_func": preview_cashflow_sync,
|
||||
# "display_name": "现金流量表",
|
||||
# },
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 财务数据同步核心类
|
||||
# =============================================================================
|
||||
def sync_financial(
|
||||
data_types: Optional[List[str]] = None,
|
||||
force_full: bool = False,
|
||||
dry_run: bool = False,
|
||||
) -> Dict[str, List]:
|
||||
"""同步财务数据(调度函数)。
|
||||
|
||||
根据指定的数据类型,调度对应的同步器执行同步。
|
||||
|
||||
class FinancialSync:
|
||||
"""财务数据统一同步管理器。
|
||||
Args:
|
||||
data_types: 数据类型列表,如 ["income", "balance"]
|
||||
None 表示同步所有类型
|
||||
force_full: 若为 True,强制全量同步
|
||||
dry_run: 若为 True,仅预览不写入
|
||||
|
||||
支持全量同步和增量同步,自动检测数据状态并选择最优同步策略。
|
||||
|
||||
功能特性:
|
||||
- 全量/增量同步自动切换
|
||||
- 自动建表和索引(如不存在)
|
||||
- 智能季度计算(当前季度未到季末时使用前一季度)
|
||||
- 流量保护(最新季度==当前季度时不请求API)
|
||||
Returns:
|
||||
各类型同步结果字典 {数据类型: 同步结果列表}
|
||||
|
||||
Example:
|
||||
>>> sync = FinancialSync()
|
||||
>>> sync.sync_all() # 增量同步所有财务数据
|
||||
>>> sync.sync_all(force_full=True) # 全量同步
|
||||
>>> sync.sync_income() # 只同步利润表
|
||||
>>> # 同步所有财务数据
|
||||
>>> sync_financial()
|
||||
>>>
|
||||
>>> # 只同步利润表
|
||||
>>> sync_financial(data_types=["income"])
|
||||
>>>
|
||||
>>> # 全量同步
|
||||
>>> sync_financial(force_full=True)
|
||||
"""
|
||||
if data_types is None:
|
||||
data_types = list(FINANCIAL_SYNCERS.keys())
|
||||
|
||||
# 表结构定义(按表名)
|
||||
TABLE_SCHEMAS = {
|
||||
"financial_income": {
|
||||
"columns": {
|
||||
"ts_code": "VARCHAR(16) NOT NULL",
|
||||
"ann_date": "DATE",
|
||||
"f_ann_date": "DATE",
|
||||
"end_date": "DATE NOT NULL",
|
||||
"report_type": "INTEGER",
|
||||
"comp_type": "INTEGER",
|
||||
"end_type": "VARCHAR(10)",
|
||||
"basic_eps": "DOUBLE",
|
||||
"diluted_eps": "DOUBLE",
|
||||
"total_revenue": "DOUBLE",
|
||||
"revenue": "DOUBLE",
|
||||
"int_income": "DOUBLE",
|
||||
"prem_earned": "DOUBLE",
|
||||
"comm_income": "DOUBLE",
|
||||
"n_commis_income": "DOUBLE",
|
||||
"n_oth_income": "DOUBLE",
|
||||
"n_oth_b_income": "DOUBLE",
|
||||
"prem_income": "DOUBLE",
|
||||
"out_prem": "DOUBLE",
|
||||
"une_prem_reser": "DOUBLE",
|
||||
"reins_income": "DOUBLE",
|
||||
"n_sec_tb_income": "DOUBLE",
|
||||
"n_sec_uw_income": "DOUBLE",
|
||||
"n_asset_mg_income": "DOUBLE",
|
||||
"oth_b_income": "DOUBLE",
|
||||
"fv_value_chg_gain": "DOUBLE",
|
||||
"invest_income": "DOUBLE",
|
||||
"ass_invest_income": "DOUBLE",
|
||||
"forex_gain": "DOUBLE",
|
||||
"total_cogs": "DOUBLE",
|
||||
"oper_cost": "DOUBLE",
|
||||
"int_exp": "DOUBLE",
|
||||
"comm_exp": "DOUBLE",
|
||||
"biz_tax_surchg": "DOUBLE",
|
||||
"sell_exp": "DOUBLE",
|
||||
"admin_exp": "DOUBLE",
|
||||
"fin_exp": "DOUBLE",
|
||||
"assets_impair_loss": "DOUBLE",
|
||||
"prem_refund": "DOUBLE",
|
||||
"compens_payout": "DOUBLE",
|
||||
"reser_insur_liab": "DOUBLE",
|
||||
"div_payt": "DOUBLE",
|
||||
"reins_exp": "DOUBLE",
|
||||
"oper_exp": "DOUBLE",
|
||||
"compens_payout_refu": "DOUBLE",
|
||||
"insur_reser_refu": "DOUBLE",
|
||||
"reins_cost_refund": "DOUBLE",
|
||||
"other_bus_cost": "DOUBLE",
|
||||
"operate_profit": "DOUBLE",
|
||||
"non_oper_income": "DOUBLE",
|
||||
"non_oper_exp": "DOUBLE",
|
||||
"nca_disploss": "DOUBLE",
|
||||
"total_profit": "DOUBLE",
|
||||
"income_tax": "DOUBLE",
|
||||
"n_income": "DOUBLE",
|
||||
"n_income_attr_p": "DOUBLE",
|
||||
"minority_gain": "DOUBLE",
|
||||
"oth_compr_income": "DOUBLE",
|
||||
"t_compr_income": "DOUBLE",
|
||||
"compr_inc_attr_p": "DOUBLE",
|
||||
"compr_inc_attr_m_s": "DOUBLE",
|
||||
"ebit": "DOUBLE",
|
||||
"ebitda": "DOUBLE",
|
||||
"insurance_exp": "DOUBLE",
|
||||
"undist_profit": "DOUBLE",
|
||||
"distable_profit": "DOUBLE",
|
||||
"rd_exp": "DOUBLE",
|
||||
"fin_exp_int_exp": "DOUBLE",
|
||||
"fin_exp_int_inc": "DOUBLE",
|
||||
"transfer_surplus_rese": "DOUBLE",
|
||||
"transfer_housing_imprest": "DOUBLE",
|
||||
"transfer_oth": "DOUBLE",
|
||||
"adj_lossgain": "DOUBLE",
|
||||
"withdra_legal_surplus": "DOUBLE",
|
||||
"withdra_legal_pubfund": "DOUBLE",
|
||||
"withdra_biz_devfund": "DOUBLE",
|
||||
"withdra_rese_fund": "DOUBLE",
|
||||
"withdra_oth_ersu": "DOUBLE",
|
||||
"workers_welfare": "DOUBLE",
|
||||
"distr_profit_shrhder": "DOUBLE",
|
||||
"prfshare_payable_dvd": "DOUBLE",
|
||||
"comshare_payable_dvd": "DOUBLE",
|
||||
"capit_comstock_div": "DOUBLE",
|
||||
"net_after_nr_lp_correct": "DOUBLE",
|
||||
"credit_impa_loss": "DOUBLE",
|
||||
"net_expo_hedging_benefits": "DOUBLE",
|
||||
"oth_impair_loss_assets": "DOUBLE",
|
||||
"total_opcost": "DOUBLE",
|
||||
"amodcost_fin_assets": "DOUBLE",
|
||||
"oth_income": "DOUBLE",
|
||||
"asset_disp_income": "DOUBLE",
|
||||
"continued_net_profit": "DOUBLE",
|
||||
"end_net_profit": "DOUBLE",
|
||||
"update_flag": "VARCHAR(1)",
|
||||
},
|
||||
"primary_key": ("ts_code", "end_date"),
|
||||
"indexes": [
|
||||
("idx_financial_ann", ["ts_code", "ann_date"]),
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
"""初始化同步管理器"""
|
||||
self.storage = Storage()
|
||||
self.thread_storage = ThreadSafeStorage()
|
||||
|
||||
def _create_table_if_not_exists(self, table_name: str) -> None:
|
||||
"""如果表不存在则创建表和索引。
|
||||
|
||||
Args:
|
||||
table_name: 表名
|
||||
"""
|
||||
if self.storage.exists(table_name):
|
||||
print(f"[FinancialSync] 表 {table_name} 已存在,跳过建表")
|
||||
return
|
||||
|
||||
if table_name not in self.TABLE_SCHEMAS:
|
||||
print(f"[FinancialSync] 表 {table_name} 没有定义表结构,跳过建表")
|
||||
return
|
||||
|
||||
schema = self.TABLE_SCHEMAS[table_name]
|
||||
print(f"[FinancialSync] 表 {table_name} 不存在,创建表和索引...")
|
||||
|
||||
# 构建列定义
|
||||
columns_def = []
|
||||
for col_name, col_type in schema["columns"].items():
|
||||
columns_def.append(f'"{col_name}" {col_type}')
|
||||
|
||||
# 添加主键约束
|
||||
if schema.get("primary_key"):
|
||||
pk_cols = ', '.join(f'"{col}"' for col in schema["primary_key"])
|
||||
columns_def.append(f"PRIMARY KEY ({pk_cols})")
|
||||
|
||||
columns_sql = ", ".join(columns_def)
|
||||
create_sql = f'CREATE TABLE IF NOT EXISTS "{table_name}" ({columns_sql})'
|
||||
|
||||
try:
|
||||
self.storage._connection.execute(create_sql)
|
||||
print(f"[FinancialSync] 表 {table_name} 创建完成")
|
||||
except Exception as e:
|
||||
print(f"[FinancialSync] 创建表 {table_name} 失败: {e}")
|
||||
raise
|
||||
|
||||
# 创建索引
|
||||
for idx_name, idx_cols in schema.get("indexes", []):
|
||||
try:
|
||||
idx_cols_sql = ', '.join(f'"{col}"' for col in idx_cols)
|
||||
self.storage._connection.execute(
|
||||
f'CREATE INDEX IF NOT EXISTS "{idx_name}" ON "{table_name}"({idx_cols_sql})'
|
||||
)
|
||||
print(f"[FinancialSync] 索引 {idx_name} 创建完成")
|
||||
except Exception as e:
|
||||
print(f"[FinancialSync] 创建索引 {idx_name} 失败: {e}")
|
||||
def _get_latest_quarter(
|
||||
self, table_name: str, period_field: str = "end_date"
|
||||
) -> Optional[str]:
|
||||
"""获取表中最新季度。
|
||||
|
||||
Args:
|
||||
table_name: 表名
|
||||
period_field: 季度字段名
|
||||
|
||||
Returns:
|
||||
最新季度字符串 (YYYYMMDD),如无数据返回 None
|
||||
"""
|
||||
try:
|
||||
result = self.storage._connection.execute(f"""
|
||||
SELECT MAX({period_field}) FROM {table_name}
|
||||
""").fetchone()
|
||||
|
||||
if result and result[0]:
|
||||
# 转换为字符串格式
|
||||
latest = result[0]
|
||||
if hasattr(latest, "strftime"):
|
||||
return latest.strftime("%Y%m%d")
|
||||
return str(latest)
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"[FinancialSync] 获取最新季度失败: {e}")
|
||||
return None
|
||||
|
||||
def _get_current_quarter(self) -> str:
|
||||
"""获取当前季度(考虑是否到季末)。
|
||||
|
||||
如果当前日期未到季度最后一天,则返回前一季度。
|
||||
这样可以避免获取尚无数据的未来季度。
|
||||
|
||||
Returns:
|
||||
当前季度字符串 (YYYYMMDD)
|
||||
"""
|
||||
today = get_today_date()
|
||||
current_quarter = date_to_quarter(today)
|
||||
|
||||
# 检查今天是否到了当前季度的最后一天
|
||||
if today < current_quarter:
|
||||
# 未到季末,返回前一季度
|
||||
return self._get_prev_quarter(current_quarter)
|
||||
|
||||
return current_quarter
|
||||
|
||||
def _get_prev_quarter(self, quarter: str) -> str:
|
||||
"""获取前一季度。
|
||||
|
||||
Args:
|
||||
quarter: 季度字符串 (YYYYMMDD)
|
||||
|
||||
Returns:
|
||||
前一季度字符串 (YYYYMMDD)
|
||||
"""
|
||||
year = int(quarter[:4])
|
||||
month_day = quarter[4:]
|
||||
|
||||
if month_day == "0331":
|
||||
# Q1 -> 去年 Q4
|
||||
return f"{year - 1}1231"
|
||||
elif month_day == "0630":
|
||||
# Q2 -> Q1
|
||||
return f"{year}0331"
|
||||
elif month_day == "0930":
|
||||
# Q3 -> Q2
|
||||
return f"{year}0630"
|
||||
else: # "1231"
|
||||
# Q4 -> Q3
|
||||
return f"{year}0930"
|
||||
|
||||
def _get_next_quarter(self, quarter: str) -> str:
|
||||
"""获取下一季度。
|
||||
|
||||
Args:
|
||||
quarter: 季度字符串 (YYYYMMDD)
|
||||
|
||||
Returns:
|
||||
下一季度字符串 (YYYYMMDD)
|
||||
"""
|
||||
year = int(quarter[:4])
|
||||
month_day = quarter[4:]
|
||||
|
||||
if month_day == "0331":
|
||||
# Q1 -> Q2
|
||||
return f"{year}0630"
|
||||
elif month_day == "0630":
|
||||
# Q2 -> Q3
|
||||
return f"{year}0930"
|
||||
elif month_day == "0930":
|
||||
# Q3 -> Q4
|
||||
return f"{year}1231"
|
||||
else: # "1231"
|
||||
# Q4 -> 明年 Q1
|
||||
return f"{year + 1}0331"
|
||||
|
||||
def _check_incremental_needed(
|
||||
self,
|
||||
table_name: str,
|
||||
period_field: str = "end_date",
|
||||
) -> tuple[bool, Optional[str], Optional[str]]:
|
||||
"""检查增量同步是否需要。
|
||||
|
||||
Args:
|
||||
table_name: 表名
|
||||
period_field: 季度字段名
|
||||
|
||||
Returns:
|
||||
(是否需要同步, 起始季度, 目标季度)
|
||||
- 如果不需要同步,返回 (False, None, None)
|
||||
"""
|
||||
# 获取表中最新季度
|
||||
latest_quarter = self._get_latest_quarter(table_name, period_field)
|
||||
|
||||
if latest_quarter is None:
|
||||
# 无本地数据,需要全量同步
|
||||
print(f"[FinancialSync] 表 {table_name} 无数据,需要全量同步")
|
||||
return (True, DEFAULT_START_DATE, self._get_current_quarter())
|
||||
|
||||
print(f"[FinancialSync] 表 {table_name} 最新季度: {latest_quarter}")
|
||||
|
||||
# 获取当前季度(考虑是否到季末)
|
||||
current_quarter = self._get_current_quarter()
|
||||
print(f"[FinancialSync] 当前季度: {current_quarter}")
|
||||
|
||||
# 比较:如果最新季度 >= 当前季度,不需要同步
|
||||
if latest_quarter >= current_quarter:
|
||||
print(
|
||||
f"[FinancialSync] 最新季度 {latest_quarter} >= 当前季度 {current_quarter},跳过增量同步"
|
||||
)
|
||||
return (False, None, None)
|
||||
|
||||
# 需要增量同步:从最新季度+1 到 当前季度
|
||||
start_quarter = self._get_next_quarter(latest_quarter)
|
||||
print(f"[FinancialSync] 增量同步: {start_quarter} -> {current_quarter}")
|
||||
|
||||
return (True, start_quarter, current_quarter)
|
||||
|
||||
def _sync_single_table(
|
||||
self,
|
||||
table_config: Dict,
|
||||
start_quarter: str,
|
||||
end_quarter: str,
|
||||
) -> int:
|
||||
"""同步单个财务数据表。
|
||||
|
||||
Args:
|
||||
table_config: 表配置字典
|
||||
start_quarter: 起始季度
|
||||
end_quarter: 目标季度
|
||||
|
||||
Returns:
|
||||
同步的记录数
|
||||
"""
|
||||
table_name = table_config["table_name"]
|
||||
get_data_func = table_config["get_data_func"]
|
||||
|
||||
# 获取需要同步的季度列表
|
||||
quarters = get_quarters_in_range(start_quarter, end_quarter)
|
||||
print(f"[FinancialSync] 计划同步 {len(quarters)} 个季度: {quarters}")
|
||||
|
||||
total_records = 0
|
||||
|
||||
# 对每个季度调用 API 获取数据
|
||||
for period in quarters:
|
||||
try:
|
||||
df = get_data_func(period)
|
||||
if df.empty:
|
||||
print(f"[WARN] 季度 {period} 无数据")
|
||||
continue
|
||||
|
||||
# 只保留合并报表 (report_type='1',注意是字符串)
|
||||
if "report_type" in df.columns:
|
||||
df = df[df["report_type"] == "1"]
|
||||
|
||||
if not df.empty:
|
||||
self.thread_storage.queue_save(table_name, df)
|
||||
print(f"[FinancialSync] 季度 {period} -> {len(df)} 条记录")
|
||||
total_records += len(df)
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 获取季度 {period} 数据失败: {e}")
|
||||
|
||||
# 刷新缓存到数据库
|
||||
self.thread_storage.flush()
|
||||
|
||||
return total_records
|
||||
|
||||
def sync_income(
|
||||
self,
|
||||
force_full: bool = False,
|
||||
) -> Dict:
|
||||
"""同步利润表数据。
|
||||
|
||||
Args:
|
||||
force_full: 若为 True,强制全量同步
|
||||
|
||||
Returns:
|
||||
同步结果字典
|
||||
"""
|
||||
table_config = FINANCIAL_TABLES["income"]
|
||||
table_name = table_config["table_name"]
|
||||
period_field = table_config["period_field"]
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print(f"[FinancialSync] 开始同步利润表 (force_full={force_full})")
|
||||
print("=" * 60)
|
||||
|
||||
# 1. 全量同步:建表
|
||||
if force_full:
|
||||
self._create_table_if_not_exists(table_name)
|
||||
start_quarter = DEFAULT_START_DATE
|
||||
end_quarter = self._get_current_quarter()
|
||||
else:
|
||||
# 2. 增量同步:检查是否需要
|
||||
needed, start_quarter, end_quarter = self._check_incremental_needed(
|
||||
table_name, period_field
|
||||
)
|
||||
|
||||
if not needed:
|
||||
return {
|
||||
"status": "skipped",
|
||||
"message": "数据已是最新",
|
||||
"table": table_name,
|
||||
}
|
||||
|
||||
# 检查表是否存在,不存在则创建
|
||||
if not self.storage.exists(table_name):
|
||||
self._create_table_if_not_exists(table_name)
|
||||
|
||||
# 3. 执行同步
|
||||
print(f"[FinancialSync] 同步范围: {start_quarter} -> {end_quarter}")
|
||||
total_records = self._sync_single_table(
|
||||
table_config, start_quarter, end_quarter
|
||||
)
|
||||
|
||||
result = {
|
||||
"status": "success",
|
||||
"table": table_name,
|
||||
"start_quarter": start_quarter,
|
||||
"end_quarter": end_quarter,
|
||||
"records": total_records,
|
||||
}
|
||||
|
||||
print(f"[FinancialSync] 利润表同步完成: {total_records} 条记录")
|
||||
return result
|
||||
|
||||
def sync_all(
|
||||
self,
|
||||
force_full: bool = False,
|
||||
) -> Dict[str, Dict]:
|
||||
"""同步所有财务数据表。
|
||||
|
||||
Args:
|
||||
force_full: 若为 True,强制全量同步
|
||||
|
||||
Returns:
|
||||
各表同步结果字典
|
||||
"""
|
||||
results = {}
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print(f"[FinancialSync] 开始同步所有财务数据 (force_full={force_full})")
|
||||
print("[Financial Sync] 财务数据同步调度中心")
|
||||
print("=" * 60)
|
||||
print(f"数据类型: {', '.join(data_types)}")
|
||||
print(f"同步模式: {'全量' if force_full else '增量'}")
|
||||
print(f"写入模式: {'预览' if dry_run else '实际写入'}")
|
||||
print("=" * 60)
|
||||
|
||||
# 同步各财务数据表
|
||||
for data_type, table_config in FINANCIAL_TABLES.items():
|
||||
for data_type in data_types:
|
||||
if data_type not in FINANCIAL_SYNCERS:
|
||||
print(f"[WARN] 未知的数据类型: {data_type}")
|
||||
results[data_type] = {"error": f"Unknown data type: {data_type}"}
|
||||
continue
|
||||
|
||||
config = FINANCIAL_SYNCERS[data_type]
|
||||
sync_func = config["sync_func"]
|
||||
display_name = config["display_name"]
|
||||
|
||||
print(f"\n[{display_name}] 开始同步...")
|
||||
|
||||
try:
|
||||
if data_type == "income":
|
||||
result = self.sync_income(force_full=force_full)
|
||||
else:
|
||||
# 预留其他表的同步逻辑
|
||||
print(f"[FinancialSync] {data_type} 暂未实现,跳过")
|
||||
result = {"status": "not_implemented"}
|
||||
|
||||
result = sync_func(force_full=force_full, dry_run=dry_run)
|
||||
results[data_type] = result
|
||||
|
||||
print(f"[{display_name}] 同步完成")
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 同步 {data_type} 失败: {e}")
|
||||
results[data_type] = {"status": "error", "error": str(e)}
|
||||
print(f"[ERROR] [{display_name}] 同步失败: {e}")
|
||||
results[data_type] = {"error": str(e)}
|
||||
|
||||
# 打印汇总
|
||||
print("\n" + "=" * 60)
|
||||
print("[Financial Sync] 同步汇总")
|
||||
print("=" * 60)
|
||||
for data_type, result in results.items():
|
||||
status = result.get("status", "unknown")
|
||||
records = result.get("records", 0)
|
||||
print(f" {data_type}: {status} ({records} records)")
|
||||
if "error" in result:
|
||||
status = f"失败: {result['error']}"
|
||||
elif isinstance(result, list):
|
||||
total_records = sum(
|
||||
r.get("diff_count", 0) for r in result if isinstance(r, dict)
|
||||
)
|
||||
status = f"成功 ({len(result)} 个季度, {total_records} 条差异)"
|
||||
else:
|
||||
status = "完成"
|
||||
|
||||
display_name = FINANCIAL_SYNCERS.get(data_type, {}).get(
|
||||
"display_name", data_type
|
||||
)
|
||||
print(f" {display_name}: {status}")
|
||||
print("=" * 60)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 便捷函数
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def sync_financial(
|
||||
data_type: str = "income",
|
||||
force_full: bool = False,
|
||||
) -> Dict:
|
||||
"""同步财务数据(便捷函数)。
|
||||
def preview_sync(data_types: Optional[List[str]] = None) -> Dict[str, Dict]:
|
||||
"""预览财务数据同步信息。
|
||||
|
||||
Args:
|
||||
data_type: 财务数据类型 ('income', 'balance', 'cashflow')
|
||||
force_full: 若为 True,强制全量同步
|
||||
data_types: 数据类型列表,None 表示所有类型
|
||||
|
||||
Returns:
|
||||
同步结果字典
|
||||
各类型预览信息字典
|
||||
|
||||
Example:
|
||||
>>> # 增量同步利润表
|
||||
>>> sync_financial()
|
||||
>>> # 全量同步
|
||||
>>> sync_financial(force_full=True)
|
||||
>>> preview = preview_sync()
|
||||
>>> print(preview)
|
||||
"""
|
||||
syncer = FinancialSync()
|
||||
if data_types is None:
|
||||
data_types = list(FINANCIAL_SYNCERS.keys())
|
||||
|
||||
if data_type == "income":
|
||||
return syncer.sync_income(force_full=force_full)
|
||||
else:
|
||||
raise ValueError(f"不支持的财务数据类型: {data_type}")
|
||||
previews = {}
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("[Financial Sync] 同步预览")
|
||||
print("=" * 60)
|
||||
|
||||
def sync_all_financial(force_full: bool = False) -> Dict[str, Dict]:
|
||||
"""同步所有财务数据(便捷函数)。
|
||||
|
||||
Args:
|
||||
force_full: 若为 True,强制全量同步
|
||||
|
||||
Returns:
|
||||
各表同步结果字典
|
||||
|
||||
Example:
|
||||
>>> # 增量同步所有财务数据
|
||||
>>> sync_all_financial()
|
||||
>>> # 全量同步
|
||||
>>> sync_all_financial(force_full=True)
|
||||
"""
|
||||
syncer = FinancialSync()
|
||||
return syncer.sync_all(force_full=force_full)
|
||||
|
||||
|
||||
def preview_sync() -> Dict:
|
||||
"""预览同步信息(不实际同步)。
|
||||
|
||||
Returns:
|
||||
预览信息字典:
|
||||
{
|
||||
'income': {
|
||||
'sync_needed': bool,
|
||||
'latest_quarter': str,
|
||||
'current_quarter': str,
|
||||
'start_quarter': str,
|
||||
'end_quarter': str,
|
||||
},
|
||||
...
|
||||
}
|
||||
"""
|
||||
syncer = FinancialSync()
|
||||
preview = {}
|
||||
|
||||
for data_type, table_config in FINANCIAL_TABLES.items():
|
||||
if data_type != "income":
|
||||
for data_type in data_types:
|
||||
if data_type not in FINANCIAL_SYNCERS:
|
||||
continue
|
||||
|
||||
table_name = table_config["table_name"]
|
||||
period_field = table_config["period_field"]
|
||||
preview_func = FINANCIAL_SYNCERS[data_type]["preview_func"]
|
||||
previews[data_type] = preview_func()
|
||||
|
||||
# 获取最新季度
|
||||
latest_quarter = syncer._get_latest_quarter(table_name, period_field)
|
||||
current_quarter = syncer._get_current_quarter()
|
||||
return previews
|
||||
|
||||
# 检查是否需要同步
|
||||
needed, start_quarter, end_quarter = syncer._check_incremental_needed(
|
||||
table_name, period_field
|
||||
)
|
||||
|
||||
preview[data_type] = {
|
||||
"sync_needed": needed,
|
||||
"latest_quarter": latest_quarter,
|
||||
"current_quarter": current_quarter,
|
||||
"start_quarter": start_quarter,
|
||||
"end_quarter": end_quarter,
|
||||
def list_financial_types() -> List[Dict]:
|
||||
"""列出所有支持的财务数据类型。
|
||||
|
||||
Returns:
|
||||
数据类型信息列表
|
||||
"""
|
||||
return [
|
||||
{
|
||||
"name": name,
|
||||
"display_name": config["display_name"],
|
||||
}
|
||||
|
||||
return preview
|
||||
for name, config in FINANCIAL_SYNCERS.items()
|
||||
]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 主程序入口
|
||||
# =============================================================================
|
||||
# 保持向后兼容的别名
|
||||
sync_all_financial = sync_financial
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
print("=" * 60)
|
||||
print("财务数据同步模块")
|
||||
print("财务数据同步调度中心")
|
||||
print("=" * 60)
|
||||
print("\n支持的财务数据类型:")
|
||||
print("-" * 60)
|
||||
|
||||
for info in list_financial_types():
|
||||
print(f" - {info['name']}: {info['display_name']}")
|
||||
|
||||
print("-" * 60)
|
||||
print("\n使用方式:")
|
||||
print(" # 预览同步信息")
|
||||
print(
|
||||
" from src.data.api_wrappers.financial_data.api_financial_sync import preview_sync"
|
||||
)
|
||||
print(" preview = preview_sync()")
|
||||
print(" # 同步所有财务数据(默认)")
|
||||
print(" python api_financial_sync.py")
|
||||
print("")
|
||||
print(" # 增量同步(推荐)")
|
||||
print(" # 全量同步")
|
||||
print(" python api_financial_sync.py --full")
|
||||
print("")
|
||||
print(" # 预览模式")
|
||||
print(" python api_financial_sync.py --preview")
|
||||
print("")
|
||||
print(" # Python 代码调用")
|
||||
print(
|
||||
" from src.data.api_wrappers.financial_data.api_financial_sync import sync_financial"
|
||||
)
|
||||
print(" sync_financial()")
|
||||
print("")
|
||||
print(" # 全量同步")
|
||||
print(" sync_financial(force_full=True)")
|
||||
print("")
|
||||
print(" # 同步所有财务数据")
|
||||
print(
|
||||
" from src.data.api_wrappers.financial_data.api_financial_sync import sync_all_financial"
|
||||
)
|
||||
print(" sync_all_financial()")
|
||||
print("=" * 60)
|
||||
|
||||
# 默认执行增量同步
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "--full":
|
||||
print("\n[Main] 执行全量同步...")
|
||||
result = sync_all_financial(force_full=True)
|
||||
# 默认执行同步
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "--preview":
|
||||
print("\n执行预览...")
|
||||
preview_sync()
|
||||
else:
|
||||
print("\n[Main] 执行增量同步...")
|
||||
result = sync_financial()
|
||||
|
||||
print("\n[Main] 执行完成!")
|
||||
print(f"结果: {result}")
|
||||
print("\n执行同步...")
|
||||
force_full = "--full" in sys.argv
|
||||
sync_financial(force_full=force_full)
|
||||
|
||||
@@ -7,133 +7,252 @@
|
||||
- income_vip: 获取某一季度全部上市公司利润表数据
|
||||
- 需要 5000 积分才能调用
|
||||
- period 参数为报告期(季度最后一天,如 20231231)
|
||||
|
||||
使用方式:
|
||||
# 同步利润表数据
|
||||
from src.data.api_wrappers.financial_data.api_income import IncomeQuarterSync, sync_income
|
||||
|
||||
# 方式1: 使用类
|
||||
syncer = IncomeQuarterSync()
|
||||
syncer.sync_incremental() # 增量同步
|
||||
syncer.sync_full() # 全量同步
|
||||
|
||||
# 方式2: 使用便捷函数
|
||||
sync_income() # 增量同步
|
||||
sync_income(force_full=True) # 全量同步
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
import pandas as pd
|
||||
from typing import Optional, List
|
||||
from tqdm import tqdm
|
||||
|
||||
from src.data.client import TushareClient
|
||||
from src.data.storage import ThreadSafeStorage
|
||||
from src.data.utils import get_today_date, get_quarters_in_range
|
||||
from src.data.api_wrappers.base_financial_sync import (
|
||||
QuarterBasedSync,
|
||||
sync_financial_data,
|
||||
preview_financial_sync,
|
||||
)
|
||||
|
||||
|
||||
def get_income(
|
||||
period: str,
|
||||
fields: Optional[str] = None,
|
||||
) -> pd.DataFrame:
|
||||
"""获取利润表数据 (VIP 接口)
|
||||
class IncomeQuarterSync(QuarterBasedSync):
|
||||
"""利润表季度同步实现。
|
||||
|
||||
从 Tushare 获取指定季度的全部上市公司利润表数据。
|
||||
使用 income_vip 接口按季度获取全部上市公司利润表数据。
|
||||
|
||||
Args:
|
||||
period: 报告期,季度最后一天日期 (如 '20231231', '20230930')
|
||||
- 0331: 一季报
|
||||
- 0630: 半年报
|
||||
- 0930: 三季报
|
||||
- 1231: 年报
|
||||
fields: 指定返回字段,默认返回全部字段
|
||||
|
||||
Returns:
|
||||
pd.DataFrame 包含利润表数据:
|
||||
- ts_code: 股票代码
|
||||
- ann_date: 公告日期
|
||||
- end_date: 报告期
|
||||
- basic_eps: 基本每股收益
|
||||
- report_type: 报告类型 (1=合并报表)
|
||||
|
||||
Example:
|
||||
>>> data = get_income('20231231')
|
||||
>>> print(data[['ts_code', 'ann_date', 'basic_eps']].head())
|
||||
表结构: financial_income
|
||||
主键: (ts_code, end_date)
|
||||
"""
|
||||
client = TushareClient()
|
||||
# 默认字段:返回全部字段(利润表有100+字段)
|
||||
if fields is None:
|
||||
fields = "ts_code,ann_date,f_ann_date,end_date,report_type,comp_type,end_type,basic_eps,diluted_eps,total_revenue,revenue,int_income,prem_earned,comm_income,n_commis_income,n_oth_income,n_oth_b_income,prem_income,out_prem,une_prem_reser,reins_income,n_sec_tb_income,n_sec_uw_income,n_asset_mg_income,oth_b_income,fv_value_chg_gain,invest_income,ass_invest_income,forex_gain,total_cogs,oper_cost,int_exp,comm_exp,biz_tax_surchg,sell_exp,admin_exp,fin_exp,assets_impair_loss,prem_refund,compens_payout,reser_insur_liab,div_payt,reins_exp,oper_exp,compens_payout_refu,insur_reser_refu,reins_cost_refund,other_bus_cost,operate_profit,non_oper_income,non_oper_exp,nca_disploss,total_profit,income_tax,n_income,n_income_attr_p,minority_gain,oth_compr_income,t_compr_income,compr_inc_attr_p,compr_inc_attr_m_s,ebit,ebitda,insurance_exp,undist_profit,distable_profit,rd_exp,fin_exp_int_exp,fin_exp_int_inc,transfer_surplus_rese,transfer_housing_imprest,transfer_oth,adj_lossgain,withdra_legal_surplus,withdra_legal_pubfund,withdra_biz_devfund,withdra_rese_fund,withdra_oth_ersu,workers_welfare,distr_profit_shrhder,prfshare_payable_dvd,comshare_payable_dvd,capit_comstock_div,net_after_nr_lp_correct,credit_impa_loss,net_expo_hedging_benefits,oth_impair_loss_assets,total_opcost,amodcost_fin_assets,oth_income,asset_disp_income,continued_net_profit,end_net_profit,update_flag"
|
||||
|
||||
params = {"fields": fields, "period": period}
|
||||
return client.query("income_vip", **params)
|
||||
table_name = "financial_income"
|
||||
api_name = "income_vip"
|
||||
|
||||
# 目标报表类型:默认只同步合并报表
|
||||
TARGET_REPORT_TYPE = "1"
|
||||
|
||||
# =============================================================================
|
||||
# IncomeSync - 利润表数据批量同步类
|
||||
# =============================================================================
|
||||
# 表结构定义
|
||||
TABLE_SCHEMA = {
|
||||
"ts_code": "VARCHAR(16) NOT NULL",
|
||||
"ann_date": "DATE",
|
||||
"f_ann_date": "DATE",
|
||||
"end_date": "DATE NOT NULL",
|
||||
"report_type": "INTEGER",
|
||||
"comp_type": "INTEGER",
|
||||
"end_type": "VARCHAR(10)",
|
||||
"basic_eps": "DOUBLE",
|
||||
"diluted_eps": "DOUBLE",
|
||||
"total_revenue": "DOUBLE",
|
||||
"revenue": "DOUBLE",
|
||||
"int_income": "DOUBLE",
|
||||
"prem_earned": "DOUBLE",
|
||||
"comm_income": "DOUBLE",
|
||||
"n_commis_income": "DOUBLE",
|
||||
"n_oth_income": "DOUBLE",
|
||||
"n_oth_b_income": "DOUBLE",
|
||||
"prem_income": "DOUBLE",
|
||||
"out_prem": "DOUBLE",
|
||||
"une_prem_reser": "DOUBLE",
|
||||
"reins_income": "DOUBLE",
|
||||
"n_sec_tb_income": "DOUBLE",
|
||||
"n_sec_uw_income": "DOUBLE",
|
||||
"n_asset_mg_income": "DOUBLE",
|
||||
"oth_b_income": "DOUBLE",
|
||||
"fv_value_chg_gain": "DOUBLE",
|
||||
"invest_income": "DOUBLE",
|
||||
"ass_invest_income": "DOUBLE",
|
||||
"forex_gain": "DOUBLE",
|
||||
"total_cogs": "DOUBLE",
|
||||
"oper_cost": "DOUBLE",
|
||||
"int_exp": "DOUBLE",
|
||||
"comm_exp": "DOUBLE",
|
||||
"biz_tax_surchg": "DOUBLE",
|
||||
"sell_exp": "DOUBLE",
|
||||
"admin_exp": "DOUBLE",
|
||||
"fin_exp": "DOUBLE",
|
||||
"assets_impair_loss": "DOUBLE",
|
||||
"prem_refund": "DOUBLE",
|
||||
"compens_payout": "DOUBLE",
|
||||
"reser_insur_liab": "DOUBLE",
|
||||
"div_payt": "DOUBLE",
|
||||
"reins_exp": "DOUBLE",
|
||||
"oper_exp": "DOUBLE",
|
||||
"compens_payout_refu": "DOUBLE",
|
||||
"insur_reser_refu": "DOUBLE",
|
||||
"reins_cost_refund": "DOUBLE",
|
||||
"other_bus_cost": "DOUBLE",
|
||||
"operate_profit": "DOUBLE",
|
||||
"non_oper_income": "DOUBLE",
|
||||
"non_oper_exp": "DOUBLE",
|
||||
"nca_disploss": "DOUBLE",
|
||||
"total_profit": "DOUBLE",
|
||||
"income_tax": "DOUBLE",
|
||||
"n_income": "DOUBLE",
|
||||
"n_income_attr_p": "DOUBLE",
|
||||
"minority_gain": "DOUBLE",
|
||||
"oth_compr_income": "DOUBLE",
|
||||
"t_compr_income": "DOUBLE",
|
||||
"compr_inc_attr_p": "DOUBLE",
|
||||
"compr_inc_attr_m_s": "DOUBLE",
|
||||
"ebit": "DOUBLE",
|
||||
"ebitda": "DOUBLE",
|
||||
"insurance_exp": "DOUBLE",
|
||||
"undist_profit": "DOUBLE",
|
||||
"distable_profit": "DOUBLE",
|
||||
"rd_exp": "DOUBLE",
|
||||
"fin_exp_int_exp": "DOUBLE",
|
||||
"fin_exp_int_inc": "DOUBLE",
|
||||
"transfer_surplus_rese": "DOUBLE",
|
||||
"transfer_housing_imprest": "DOUBLE",
|
||||
"transfer_oth": "DOUBLE",
|
||||
"adj_lossgain": "DOUBLE",
|
||||
"withdra_legal_surplus": "DOUBLE",
|
||||
"withdra_legal_pubfund": "DOUBLE",
|
||||
"withdra_biz_devfund": "DOUBLE",
|
||||
"withdra_rese_fund": "DOUBLE",
|
||||
"withdra_oth_ersu": "DOUBLE",
|
||||
"workers_welfare": "DOUBLE",
|
||||
"distr_profit_shrhder": "DOUBLE",
|
||||
"prfshare_payable_dvd": "DOUBLE",
|
||||
"comshare_payable_dvd": "DOUBLE",
|
||||
"capit_comstock_div": "DOUBLE",
|
||||
"net_after_nr_lp_correct": "DOUBLE",
|
||||
"credit_impa_loss": "DOUBLE",
|
||||
"net_expo_hedging_benefits": "DOUBLE",
|
||||
"oth_impair_loss_assets": "DOUBLE",
|
||||
"total_opcost": "DOUBLE",
|
||||
"amodcost_fin_assets": "DOUBLE",
|
||||
"oth_income": "DOUBLE",
|
||||
"asset_disp_income": "DOUBLE",
|
||||
"continued_net_profit": "DOUBLE",
|
||||
"end_net_profit": "DOUBLE",
|
||||
"update_flag": "VARCHAR(1)",
|
||||
}
|
||||
|
||||
|
||||
class IncomeSync:
|
||||
"""利润表数据批量同步管理器 (VIP 版本)
|
||||
|
||||
功能特性:
|
||||
- 按季度同步,每次请求获取该季度全部上市公司数据
|
||||
- 使用 income_vip 接口
|
||||
- 只保留合并报表(report_type=1)
|
||||
- 使用 ThreadSafeStorage 安全写入
|
||||
|
||||
Example:
|
||||
>>> sync = IncomeSync()
|
||||
>>> sync.sync(start_date='20200101', end_date='20231231')
|
||||
"""
|
||||
# 索引定义(不要创建唯一索引)
|
||||
# 注意:财务数据可能发生多次修正,不设置主键和唯一索引
|
||||
TABLE_INDEXES = [
|
||||
("idx_financial_income_ts_code", ["ts_code"]),
|
||||
("idx_financial_income_end_date", ["end_date"]),
|
||||
("idx_financial_income_ts_period", ["ts_code", "end_date", "report_type"]),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""初始化同步管理器"""
|
||||
self.storage = ThreadSafeStorage()
|
||||
self.client = TushareClient()
|
||||
"""初始化利润表同步器。"""
|
||||
super().__init__()
|
||||
self._fields = None # 默认返回全部字段
|
||||
|
||||
def sync(
|
||||
self,
|
||||
start_date: str,
|
||||
end_date: Optional[str] = None,
|
||||
) -> None:
|
||||
"""同步利润表数据
|
||||
def fetch_single_quarter(self, period: str) -> pd.DataFrame:
|
||||
"""获取单季度的全部上市公司利润表数据。
|
||||
|
||||
Args:
|
||||
start_date: 开始日期 YYYYMMDD
|
||||
end_date: 结束日期 YYYYMMDD(默认为今天)
|
||||
period: 报告期,季度最后一天日期(如 '20231231')
|
||||
|
||||
Returns:
|
||||
包含该季度全部上市公司利润表数据的 DataFrame
|
||||
"""
|
||||
if end_date is None:
|
||||
end_date = get_today_date()
|
||||
params = {"period": period}
|
||||
|
||||
# 获取日期范围内的所有季度
|
||||
quarters = get_quarters_in_range(start_date, end_date)
|
||||
print(f"[IncomeSync] 计划同步 {len(quarters)} 个季度: {quarters}")
|
||||
if self._fields:
|
||||
params["fields"] = self._fields
|
||||
|
||||
# 对每个季度调用 income_vip 获取全部股票数据
|
||||
for period in tqdm(quarters, desc="Syncing income by quarter"):
|
||||
try:
|
||||
df = get_income(period)
|
||||
if df.empty:
|
||||
print(f"[WARN] 季度 {period} 无数据")
|
||||
continue
|
||||
return self.client.query(self.api_name, **params)
|
||||
|
||||
# 只保留合并报表 (report_type='1',注意是字符串)
|
||||
if "report_type" in df.columns:
|
||||
df = df[df["report_type"] == "1"]
|
||||
|
||||
if not df.empty:
|
||||
self.storage.queue_save("financial_income", df)
|
||||
print(f"[IncomeSync] 季度 {period} -> {len(df)} 条记录")
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 获取季度 {period} 数据失败: {e}")
|
||||
|
||||
# 刷新缓存到数据库
|
||||
self.storage.flush()
|
||||
print(f"[IncomeSync] 同步完成,共处理 {len(quarters)} 个季度")
|
||||
# =============================================================================
|
||||
# 便捷函数
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def sync_income(
|
||||
start_date: str,
|
||||
end_date: Optional[str] = None,
|
||||
) -> None:
|
||||
"""同步利润表数据(便捷函数)
|
||||
force_full: bool = False,
|
||||
dry_run: bool = False,
|
||||
) -> list:
|
||||
"""同步利润表数据(便捷函数)。
|
||||
|
||||
Args:
|
||||
start_date: 开始日期 YYYYMMDD
|
||||
end_date: 结束日期 YYYYMMDD(默认为今天)
|
||||
force_full: 若为 True,强制全量同步
|
||||
dry_run: 若为 True,仅预览不写入
|
||||
|
||||
Returns:
|
||||
同步结果列表
|
||||
|
||||
Example:
|
||||
>>> sync_income('20200101')
|
||||
>>> sync_income('20200101', '20231231')
|
||||
>>> # 增量同步
|
||||
>>> sync_income()
|
||||
>>>
|
||||
>>> # 全量同步
|
||||
>>> sync_income(force_full=True)
|
||||
>>>
|
||||
>>> # 预览
|
||||
>>> sync_income(dry_run=True)
|
||||
"""
|
||||
syncer = IncomeSync()
|
||||
syncer.sync(start_date, end_date)
|
||||
return sync_financial_data(IncomeQuarterSync, force_full, dry_run)
|
||||
|
||||
|
||||
def preview_income_sync() -> dict:
|
||||
"""预览利润表同步信息。
|
||||
|
||||
Returns:
|
||||
预览信息字典
|
||||
"""
|
||||
return preview_financial_sync(IncomeQuarterSync)
|
||||
|
||||
|
||||
def get_income(period: str, fields: Optional[str] = None) -> pd.DataFrame:
|
||||
"""获取利润表数据(原始接口,单季度)。
|
||||
|
||||
用于直接获取某个季度的数据,不进行同步管理。
|
||||
|
||||
Args:
|
||||
period: 报告期,季度最后一天日期(如 '20231231')
|
||||
fields: 指定返回字段,默认返回全部字段
|
||||
|
||||
Returns:
|
||||
包含利润表数据的 DataFrame
|
||||
"""
|
||||
client = TushareClient()
|
||||
|
||||
if fields is None:
|
||||
fields = (
|
||||
"ts_code,ann_date,f_ann_date,end_date,report_type,comp_type,end_type,"
|
||||
"basic_eps,diluted_eps,total_revenue,revenue,int_income,prem_earned,"
|
||||
"comm_income,n_commis_income,n_oth_income,n_oth_b_income,prem_income,"
|
||||
"out_prem,une_prem_reser,reins_income,n_sec_tb_income,n_sec_uw_income,"
|
||||
"n_asset_mg_income,oth_b_income,fv_value_chg_gain,invest_income,"
|
||||
"ass_invest_income,forex_gain,total_cogs,oper_cost,int_exp,comm_exp,"
|
||||
"biz_tax_surchg,sell_exp,admin_exp,fin_exp,assets_impair_loss,prem_refund,"
|
||||
"compens_payout,reser_insur_liab,div_payt,reins_exp,oper_exp,"
|
||||
"compens_payout_refu,insur_reser_refu,reins_cost_refund,other_bus_cost,"
|
||||
"operate_profit,non_oper_income,non_oper_exp,nca_disploss,total_profit,"
|
||||
"income_tax,n_income,n_income_attr_p,minority_gain,oth_compr_income,"
|
||||
"t_compr_income,compr_inc_attr_p,compr_inc_attr_m_s,ebit,ebitda,"
|
||||
"insurance_exp,undist_profit,distable_profit,rd_exp,fin_exp_int_exp,"
|
||||
"fin_exp_int_inc,transfer_surplus_rese,transfer_housing_imprest,"
|
||||
"transfer_oth,adj_lossgain,withdra_legal_surplus,withdra_legal_pubfund,"
|
||||
"withdra_biz_devfund,withdra_rese_fund,withdra_oth_ersu,workers_welfare,"
|
||||
"distr_profit_shrhder,prfshare_payable_dvd,comshare_payable_dvd,"
|
||||
"capit_comstock_div,net_after_nr_lp_correct,credit_impa_loss,"
|
||||
"net_expo_hedging_benefits,oth_impair_loss_assets,total_opcost,"
|
||||
"amodcost_fin_assets,oth_income,asset_disp_income,continued_net_profit,"
|
||||
"end_net_profit,update_flag"
|
||||
)
|
||||
|
||||
return client.query("income_vip", period=period, fields=fields)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""DuckDB storage for data persistence."""
|
||||
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import duckdb
|
||||
@@ -73,13 +74,22 @@ class Storage:
|
||||
- api_financial_sync.py: FinancialSync.TABLE_SCHEMAS
|
||||
"""
|
||||
self._connection = duckdb.connect(str(self.db_path))
|
||||
def save(self, name: str, data: pd.DataFrame, mode: str = "append") -> dict:
|
||||
|
||||
def save(
|
||||
self,
|
||||
name: str,
|
||||
data: pd.DataFrame,
|
||||
mode: str = "append",
|
||||
use_upsert: bool = True,
|
||||
) -> dict:
|
||||
"""Save data to DuckDB.
|
||||
|
||||
Args:
|
||||
name: Table name
|
||||
data: DataFrame to save
|
||||
mode: 'append' (UPSERT) or 'replace' (DELETE + INSERT)
|
||||
mode: 'append' or 'replace' (DELETE + INSERT)
|
||||
use_upsert: 若为 True 使用 INSERT OR REPLACE (upsert),
|
||||
若为 False 使用普通 INSERT (依赖外部删除逻辑)
|
||||
|
||||
Returns:
|
||||
Dict with save result
|
||||
@@ -123,15 +133,18 @@ class Storage:
|
||||
if mode == "replace":
|
||||
self._connection.execute(f"DELETE FROM {name}")
|
||||
|
||||
# UPSERT: INSERT OR REPLACE
|
||||
columns = ', '.join(f'"{col}"' for col in data.columns)
|
||||
columns = ", ".join(f'"{col}"' for col in data.columns)
|
||||
|
||||
if use_upsert:
|
||||
# UPSERT: INSERT OR REPLACE (需要主键约束)
|
||||
self._connection.execute(f"""
|
||||
INSERT OR REPLACE INTO {name} ({columns})
|
||||
SELECT {columns} FROM temp_data
|
||||
""")
|
||||
columns = ", ".join(data.columns)
|
||||
else:
|
||||
# 普通 INSERT (依赖外部删除逻辑确保无重复)
|
||||
self._connection.execute(f"""
|
||||
INSERT OR REPLACE INTO {name} ({columns})
|
||||
INSERT INTO {name} ({columns})
|
||||
SELECT {columns} FROM temp_data
|
||||
""")
|
||||
|
||||
@@ -220,8 +233,8 @@ class Storage:
|
||||
# Build query
|
||||
conditions = []
|
||||
if start_date and end_date:
|
||||
start = pd.to_datetime(start_date, format='%Y%m%d').date()
|
||||
end = pd.to_datetime(end_date, format='%Y%m%d').date()
|
||||
start = pd.to_datetime(start_date, format="%Y%m%d").date()
|
||||
end = pd.to_datetime(end_date, format="%Y%m%d").date()
|
||||
conditions.append(f"trade_date BETWEEN '{start}' AND '{end}'")
|
||||
if ts_code:
|
||||
conditions.append(f"ts_code = '{ts_code}'")
|
||||
@@ -295,12 +308,18 @@ class ThreadSafeStorage:
|
||||
|
||||
def __init__(self):
|
||||
self.storage = Storage()
|
||||
self._pending_writes: List[tuple] = [] # [(name, data), ...]
|
||||
self._pending_writes: List[tuple] = [] # [(name, data, use_upsert), ...]
|
||||
|
||||
def queue_save(self, name: str, data: pd.DataFrame):
|
||||
"""将数据放入写入队列(不立即写入)"""
|
||||
def queue_save(self, name: str, data: pd.DataFrame, use_upsert: bool = True):
|
||||
"""将数据放入写入队列(不立即写入)
|
||||
|
||||
Args:
|
||||
name: 表名
|
||||
data: DataFrame 数据
|
||||
use_upsert: 若为 True 使用 INSERT OR REPLACE,若为 False 使用普通 INSERT
|
||||
"""
|
||||
if not data.empty:
|
||||
self._pending_writes.append((name, data))
|
||||
self._pending_writes.append((name, data, use_upsert))
|
||||
|
||||
def flush(self):
|
||||
"""批量写入所有队列数据。
|
||||
@@ -310,21 +329,22 @@ class ThreadSafeStorage:
|
||||
if not self._pending_writes:
|
||||
return
|
||||
|
||||
# 合并相同表的数据
|
||||
# 按表名和 use_upsert 分组
|
||||
table_data = defaultdict(list)
|
||||
|
||||
for name, data in self._pending_writes:
|
||||
table_data[name].append(data)
|
||||
for name, data, use_upsert in self._pending_writes:
|
||||
key = (name, use_upsert)
|
||||
table_data[key].append(data)
|
||||
|
||||
# 批量写入每个表
|
||||
for name, data_list in table_data.items():
|
||||
for (name, use_upsert), data_list in table_data.items():
|
||||
combined = pd.concat(data_list, ignore_index=True)
|
||||
# 在批量数据中先去重
|
||||
if "ts_code" in combined.columns and "trade_date" in combined.columns:
|
||||
combined = combined.drop_duplicates(
|
||||
subset=["ts_code", "trade_date"], keep="last"
|
||||
)
|
||||
self.storage.save(name, combined, mode="append")
|
||||
self.storage.save(name, combined, mode="append", use_upsert=use_upsert)
|
||||
|
||||
self._pending_writes.clear()
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user