refactor: 代码审查修复 - 日期过滤、性能优化、数据泄露防护

- 修复 data_loader.py 中财务数据的日期过滤,支持按日期范围加载
- 优化 MADClipper:使用窗口函数替代 join,提升性能
- 修复训练日期边界问题:添加 1 天间隔,以避免数据泄露
- 新增 .gitignore 规则忽略训练输出目录
This commit is contained in:
2026-02-25 21:11:19 +08:00
parent 593ec99466
commit a9e4746239
24 changed files with 3597 additions and 56 deletions

View File

@@ -90,6 +90,113 @@ class Storage:
CREATE INDEX IF NOT EXISTS idx_daily_date_code ON daily(trade_date, ts_code)
""")
# Create financial_income table for income statement data
# 完整的利润表字段(共 94 列)
self._connection.execute("""
CREATE TABLE IF NOT EXISTS financial_income (
ts_code VARCHAR(16) NOT NULL,
ann_date DATE,
f_ann_date DATE,
end_date DATE NOT NULL,
report_type INTEGER,
comp_type INTEGER,
end_type VARCHAR(10),
basic_eps DOUBLE,
diluted_eps DOUBLE,
total_revenue DOUBLE,
revenue DOUBLE,
int_income DOUBLE,
prem_earned DOUBLE,
comm_income DOUBLE,
n_commis_income DOUBLE,
n_oth_income DOUBLE,
n_oth_b_income DOUBLE,
prem_income DOUBLE,
out_prem DOUBLE,
une_prem_reser DOUBLE,
reins_income DOUBLE,
n_sec_tb_income DOUBLE,
n_sec_uw_income DOUBLE,
n_asset_mg_income DOUBLE,
oth_b_income DOUBLE,
fv_value_chg_gain DOUBLE,
invest_income DOUBLE,
ass_invest_income DOUBLE,
forex_gain DOUBLE,
total_cogs DOUBLE,
oper_cost DOUBLE,
int_exp DOUBLE,
comm_exp DOUBLE,
biz_tax_surchg DOUBLE,
sell_exp DOUBLE,
admin_exp DOUBLE,
fin_exp DOUBLE,
assets_impair_loss DOUBLE,
prem_refund DOUBLE,
compens_payout DOUBLE,
reser_insur_liab DOUBLE,
div_payt DOUBLE,
reins_exp DOUBLE,
oper_exp DOUBLE,
compens_payout_refu DOUBLE,
insur_reser_refu DOUBLE,
reins_cost_refund DOUBLE,
other_bus_cost DOUBLE,
operate_profit DOUBLE,
non_oper_income DOUBLE,
non_oper_exp DOUBLE,
nca_disploss DOUBLE,
total_profit DOUBLE,
income_tax DOUBLE,
n_income DOUBLE,
n_income_attr_p DOUBLE,
minority_gain DOUBLE,
oth_compr_income DOUBLE,
t_compr_income DOUBLE,
compr_inc_attr_p DOUBLE,
compr_inc_attr_m_s DOUBLE,
ebit DOUBLE,
ebitda DOUBLE,
insurance_exp DOUBLE,
undist_profit DOUBLE,
distable_profit DOUBLE,
rd_exp DOUBLE,
fin_exp_int_exp DOUBLE,
fin_exp_int_inc DOUBLE,
transfer_surplus_rese DOUBLE,
transfer_housing_imprest DOUBLE,
transfer_oth DOUBLE,
adj_lossgain DOUBLE,
withdra_legal_surplus DOUBLE,
withdra_legal_pubfund DOUBLE,
withdra_biz_devfund DOUBLE,
withdra_rese_fund DOUBLE,
withdra_oth_ersu DOUBLE,
workers_welfare DOUBLE,
distr_profit_shrhder DOUBLE,
prfshare_payable_dvd DOUBLE,
comshare_payable_dvd DOUBLE,
capit_comstock_div DOUBLE,
net_after_nr_lp_correct DOUBLE,
credit_impa_loss DOUBLE,
net_expo_hedging_benefits DOUBLE,
oth_impair_loss_assets DOUBLE,
total_opcost DOUBLE,
amodcost_fin_assets DOUBLE,
oth_income DOUBLE,
asset_disp_income DOUBLE,
continued_net_profit DOUBLE,
end_net_profit DOUBLE,
update_flag VARCHAR(1),
PRIMARY KEY (ts_code, end_date)
)
""")
# Create index for financial_income
self._connection.execute("""
CREATE INDEX IF NOT EXISTS idx_financial_ann ON financial_income(ts_code, ann_date)
""")
def save(self, name: str, data: pd.DataFrame, mode: str = "append") -> dict:
"""Save data to DuckDB.
@@ -104,13 +211,35 @@ class Storage:
if data.empty:
return {"status": "skipped", "rows": 0}
# Ensure date column is proper type
# 确保日期列是正确的类型 (YYYYMMDD -> date)
# trade_date: 日线数据日期
if "trade_date" in data.columns:
data = data.copy()
data["trade_date"] = pd.to_datetime(
data["trade_date"], format="%Y%m%d"
).dt.date
# ann_date: 公告日期
if "ann_date" in data.columns:
data = data.copy()
data["ann_date"] = pd.to_datetime(
data["ann_date"], format="%Y%m%d", errors="coerce"
).dt.date
# f_ann_date: 最终公告日期
if "f_ann_date" in data.columns:
data = data.copy()
data["f_ann_date"] = pd.to_datetime(
data["f_ann_date"], format="%Y%m%d", errors="coerce"
).dt.date
# end_date: 报告期/期末日期
if "end_date" in data.columns:
data = data.copy()
data["end_date"] = pd.to_datetime(
data["end_date"], format="%Y%m%d", errors="coerce"
).dt.date
# Register DataFrame as temporary view
self._connection.register("temp_data", data)