refactor: 代码审查修复 - 日期过滤、性能优化、数据泄露防护
- 修复 data_loader.py 财务数据日期过滤,支持按范围加载
- 优化 MADClipper 使用窗口函数替代 join,提升性能
- 修复训练日期边界问题,添加 1 天间隔避免数据泄露
- 新增 .gitignore 规则忽略训练输出目录
This commit is contained in:
@@ -90,6 +90,113 @@ class Storage:
|
||||
CREATE INDEX IF NOT EXISTS idx_daily_date_code ON daily(trade_date, ts_code)
|
||||
""")
|
||||
|
||||
# Create financial_income table for income statement data
|
||||
# 完整的利润表字段(94列全部)
|
||||
self._connection.execute("""
|
||||
CREATE TABLE IF NOT EXISTS financial_income (
|
||||
ts_code VARCHAR(16) NOT NULL,
|
||||
ann_date DATE,
|
||||
f_ann_date DATE,
|
||||
end_date DATE NOT NULL,
|
||||
report_type INTEGER,
|
||||
comp_type INTEGER,
|
||||
end_type VARCHAR(10),
|
||||
basic_eps DOUBLE,
|
||||
diluted_eps DOUBLE,
|
||||
total_revenue DOUBLE,
|
||||
revenue DOUBLE,
|
||||
int_income DOUBLE,
|
||||
prem_earned DOUBLE,
|
||||
comm_income DOUBLE,
|
||||
n_commis_income DOUBLE,
|
||||
n_oth_income DOUBLE,
|
||||
n_oth_b_income DOUBLE,
|
||||
prem_income DOUBLE,
|
||||
out_prem DOUBLE,
|
||||
une_prem_reser DOUBLE,
|
||||
reins_income DOUBLE,
|
||||
n_sec_tb_income DOUBLE,
|
||||
n_sec_uw_income DOUBLE,
|
||||
n_asset_mg_income DOUBLE,
|
||||
oth_b_income DOUBLE,
|
||||
fv_value_chg_gain DOUBLE,
|
||||
invest_income DOUBLE,
|
||||
ass_invest_income DOUBLE,
|
||||
forex_gain DOUBLE,
|
||||
total_cogs DOUBLE,
|
||||
oper_cost DOUBLE,
|
||||
int_exp DOUBLE,
|
||||
comm_exp DOUBLE,
|
||||
biz_tax_surchg DOUBLE,
|
||||
sell_exp DOUBLE,
|
||||
admin_exp DOUBLE,
|
||||
fin_exp DOUBLE,
|
||||
assets_impair_loss DOUBLE,
|
||||
prem_refund DOUBLE,
|
||||
compens_payout DOUBLE,
|
||||
reser_insur_liab DOUBLE,
|
||||
div_payt DOUBLE,
|
||||
reins_exp DOUBLE,
|
||||
oper_exp DOUBLE,
|
||||
compens_payout_refu DOUBLE,
|
||||
insur_reser_refu DOUBLE,
|
||||
reins_cost_refund DOUBLE,
|
||||
other_bus_cost DOUBLE,
|
||||
operate_profit DOUBLE,
|
||||
non_oper_income DOUBLE,
|
||||
non_oper_exp DOUBLE,
|
||||
nca_disploss DOUBLE,
|
||||
total_profit DOUBLE,
|
||||
income_tax DOUBLE,
|
||||
n_income DOUBLE,
|
||||
n_income_attr_p DOUBLE,
|
||||
minority_gain DOUBLE,
|
||||
oth_compr_income DOUBLE,
|
||||
t_compr_income DOUBLE,
|
||||
compr_inc_attr_p DOUBLE,
|
||||
compr_inc_attr_m_s DOUBLE,
|
||||
ebit DOUBLE,
|
||||
ebitda DOUBLE,
|
||||
insurance_exp DOUBLE,
|
||||
undist_profit DOUBLE,
|
||||
distable_profit DOUBLE,
|
||||
rd_exp DOUBLE,
|
||||
fin_exp_int_exp DOUBLE,
|
||||
fin_exp_int_inc DOUBLE,
|
||||
transfer_surplus_rese DOUBLE,
|
||||
transfer_housing_imprest DOUBLE,
|
||||
transfer_oth DOUBLE,
|
||||
adj_lossgain DOUBLE,
|
||||
withdra_legal_surplus DOUBLE,
|
||||
withdra_legal_pubfund DOUBLE,
|
||||
withdra_biz_devfund DOUBLE,
|
||||
withdra_rese_fund DOUBLE,
|
||||
withdra_oth_ersu DOUBLE,
|
||||
workers_welfare DOUBLE,
|
||||
distr_profit_shrhder DOUBLE,
|
||||
prfshare_payable_dvd DOUBLE,
|
||||
comshare_payable_dvd DOUBLE,
|
||||
capit_comstock_div DOUBLE,
|
||||
net_after_nr_lp_correct DOUBLE,
|
||||
credit_impa_loss DOUBLE,
|
||||
net_expo_hedging_benefits DOUBLE,
|
||||
oth_impair_loss_assets DOUBLE,
|
||||
total_opcost DOUBLE,
|
||||
amodcost_fin_assets DOUBLE,
|
||||
oth_income DOUBLE,
|
||||
asset_disp_income DOUBLE,
|
||||
continued_net_profit DOUBLE,
|
||||
end_net_profit DOUBLE,
|
||||
update_flag VARCHAR(1),
|
||||
PRIMARY KEY (ts_code, end_date)
|
||||
)
|
||||
""")
|
||||
|
||||
# Create index for financial_income
|
||||
self._connection.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_financial_ann ON financial_income(ts_code, ann_date)
|
||||
""")
|
||||
|
||||
def save(self, name: str, data: pd.DataFrame, mode: str = "append") -> dict:
|
||||
"""Save data to DuckDB.
|
||||
|
||||
@@ -104,13 +211,35 @@ class Storage:
|
||||
if data.empty:
|
||||
return {"status": "skipped", "rows": 0}
|
||||
|
||||
# Ensure date column is proper type
|
||||
# 确保日期列是正确的类型 (YYYYMMDD -> date)
|
||||
# trade_date: 日线数据日期
|
||||
if "trade_date" in data.columns:
|
||||
data = data.copy()
|
||||
data["trade_date"] = pd.to_datetime(
|
||||
data["trade_date"], format="%Y%m%d"
|
||||
).dt.date
|
||||
|
||||
# ann_date: 公告日期
|
||||
if "ann_date" in data.columns:
|
||||
data = data.copy()
|
||||
data["ann_date"] = pd.to_datetime(
|
||||
data["ann_date"], format="%Y%m%d", errors="coerce"
|
||||
).dt.date
|
||||
|
||||
# f_ann_date: 最终公告日期
|
||||
if "f_ann_date" in data.columns:
|
||||
data = data.copy()
|
||||
data["f_ann_date"] = pd.to_datetime(
|
||||
data["f_ann_date"], format="%Y%m%d", errors="coerce"
|
||||
).dt.date
|
||||
|
||||
# end_date: 报告期/期末日期
|
||||
if "end_date" in data.columns:
|
||||
data = data.copy()
|
||||
data["end_date"] = pd.to_datetime(
|
||||
data["end_date"], format="%Y%m%d", errors="coerce"
|
||||
).dt.date
|
||||
|
||||
# Register DataFrame as temporary view
|
||||
self._connection.register("temp_data", data)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user