This commit is contained in:
2025-06-10 15:22:25 +08:00
parent 15f327b8ae
commit 0c12e6c2b1
25 changed files with 8157 additions and 5583 deletions

View File

@@ -16,7 +16,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"e:\\PyProject\\NewStock\\main\\factor\n"
"/mnt/d/PyProject/NewStock\n"
]
}
],
@@ -62,8 +62,8 @@
"cyq perf\n",
"left merge on ['ts_code', 'trade_date']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 5123740 entries, 0 to 5123739\n",
"Data columns (total 31 columns):\n",
"RangeIndex: 8713571 entries, 0 to 8713570\n",
"Data columns (total 33 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object \n",
@@ -74,57 +74,248 @@
" 5 low float64 \n",
" 6 vol float64 \n",
" 7 pct_chg float64 \n",
" 8 turnover_rate float64 \n",
" 9 pe_ttm float64 \n",
" 10 circ_mv float64 \n",
" 11 volume_ratio float64 \n",
" 12 is_st bool \n",
" 13 up_limit float64 \n",
" 14 down_limit float64 \n",
" 15 buy_sm_vol float64 \n",
" 16 sell_sm_vol float64 \n",
" 17 buy_lg_vol float64 \n",
" 18 sell_lg_vol float64 \n",
" 19 buy_elg_vol float64 \n",
" 20 sell_elg_vol float64 \n",
" 21 net_mf_vol float64 \n",
" 22 his_low float64 \n",
" 23 his_high float64 \n",
" 24 cost_5pct float64 \n",
" 25 cost_15pct float64 \n",
" 26 cost_50pct float64 \n",
" 27 cost_85pct float64 \n",
" 28 cost_95pct float64 \n",
" 29 weight_avg float64 \n",
" 30 winner_rate float64 \n",
"dtypes: bool(1), datetime64[ns](1), float64(28), object(1)\n",
"memory usage: 1.2+ GB\n",
"None\n",
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate']\n",
" 8 amount float64 \n",
" 9 turnover_rate float64 \n",
" 10 pe_ttm float64 \n",
" 11 circ_mv float64 \n",
" 12 total_mv float64 \n",
" 13 volume_ratio float64 \n",
" 14 is_st bool \n",
" 15 up_limit float64 \n",
" 16 down_limit float64 \n",
" 17 buy_sm_vol float64 \n",
" 18 sell_sm_vol float64 \n",
" 19 buy_lg_vol float64 \n",
" 20 sell_lg_vol float64 \n",
" 21 buy_elg_vol float64 \n",
" 22 sell_elg_vol float64 \n",
" 23 net_mf_vol float64 \n",
" 24 his_low float64 \n",
" 25 his_high float64 \n",
" 26 cost_5pct float64 \n",
" 27 cost_15pct float64 \n",
" 28 cost_50pct float64 \n",
" 29 cost_85pct float64 \n",
" 30 cost_95pct float64 \n",
" 31 weight_avg float64 \n",
" 32 winner_rate float64 \n",
"dtypes: bool(1), datetime64[ns](1), float64(30), object(1)\n",
"memory usage: 2.1+ GB\n",
"None\n"
]
}
],
"source": [
"from main.utils.utils import read_and_merge_h5_data\n",
"\n",
"print('daily data')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/daily_data.h5', key='daily_data',\n",
" columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount'],\n",
" df=None)\n",
"\n",
"print('daily basic')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/daily_basic.h5', key='daily_basic',\n",
" columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio',\n",
" 'is_st'], df=df, join='inner')\n",
"\n",
"print('stk limit')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/stk_limit.h5', key='stk_limit',\n",
" columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],\n",
" df=df)\n",
"print('money flow')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/money_flow.h5', key='money_flow',\n",
" columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol',\n",
" 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],\n",
" df=df)\n",
"print('cyq perf')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/cyq_perf.h5', key='cyq_perf',\n",
" columns=['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
" 'cost_50pct',\n",
" 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],\n",
" df=df)\n",
"print(df.info())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0acb6625",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate']\n"
]
}
],
"source": [
"\n",
"origin_columns = df.columns.tolist()\n",
"origin_columns = [col for col in origin_columns if 'cyq' not in col]\n",
"print(origin_columns)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "820a6b50",
"metadata": {},
"outputs": [],
"source": [
"fina_indicator_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/fina_indicator.h5', key='fina_indicator',\n",
" columns=['ts_code', 'ann_date', 'undist_profit_ps', 'ocfps', 'bps'],\n",
" df=None)\n",
"cashflow_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/cashflow.h5', key='cashflow',\n",
" columns=['ts_code', 'ann_date', 'n_cashflow_act'],\n",
" df=None)\n",
"balancesheet_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/balancesheet.h5', key='balancesheet',\n",
" columns=['ts_code', 'ann_date', 'money_cap', 'total_liab'],\n",
" df=None)\n",
"top_list_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/top_list.h5', key='top_list',\n",
" columns=['ts_code', 'trade_date', 'reason'],\n",
" df=None)\n",
"\n",
"top_list_df = top_list_df.sort_values(by='trade_date', ascending=False).drop_duplicates(subset=['ts_code', 'trade_date'], keep='first').sort_values(by='trade_date')\n",
"\n",
"stk_holdertrade_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/stk_holdertrade.h5', key='stk_holdertrade',\n",
" columns=['ts_code', 'ann_date', 'in_de', 'change_ratio'],\n",
" df=None)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "903469a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ 成功从 Redis Hash 'concept_stocks_daily_lists_pickle' 读取 1794 条每日概念股票数据。\n"
]
}
],
"source": [
"import redis\n",
"import pickle\n",
"from datetime import date, datetime\n",
"\n",
"# --- 配置 Redis 连接 ---\n",
"REDIS_HOST = '140.143.91.66'\n",
"REDIS_PORT = 6389\n",
"REDIS_DB = 0\n",
"\n",
"# --- 定义 Redis 键名 ---\n",
"HASH_KEY = \"concept_stocks_daily_lists_pickle\" # 区分之前的 JSON 版本\n",
"MAX_DATE_KEY = \"concept_stocks_max_date_pickle\" # 区分之前的 JSON 版本\n",
"\n",
"concept_dict = {}\n",
"\n",
"# --- 连接 Redis ---\n",
"try:\n",
" r = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password='Redis520102')\n",
"\n",
" all_data_from_redis = r.hgetall(HASH_KEY) # 返回的是字典,键是字节,值是字节\n",
" \n",
" if all_data_from_redis:\n",
" for date_bytes, stocks_bytes in all_data_from_redis.items(): # 将变量名改为 date_bytes 更清晰\n",
" try:\n",
" # *** 修正点:将日期字节解码为字符串 ***\n",
" date_str = date_bytes.decode('utf-8') \n",
" date_obj = datetime.strptime(date_str, '%Y%m%d').date()\n",
" \n",
" stocks_list = pickle.loads(stocks_bytes)\n",
" concept_dict[date_obj] = stocks_list\n",
" except (ValueError, pickle.UnpicklingError) as e:\n",
" print(f\"⚠️ 警告: 解析 Redis 数据时出错 (日期键: '{date_bytes.decode('utf-8', errors='ignore')}'),跳过此条数据: {e}\") # 打印警告时也解码一下\n",
" print(f\"✅ 成功从 Redis Hash '{HASH_KEY}' 读取 {len(concept_dict)} 条每日概念股票数据。\")\n",
" else:\n",
" print(f\" Redis Hash '{HASH_KEY}' 中没有找到任何数据。\")\n",
"\n",
"except redis.exceptions.ConnectionError as e:\n",
" print(f\"❌ 错误: 无法连接到 Redis 服务器,请检查 Redis 是否正在运行或连接配置: {e}\")\n",
"except Exception as e:\n",
" print(f\"❌ 从 Redis 读取数据时发生未知错误: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "afb8da3d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4566757\n",
"开始生成概念相关因子...\n",
"开始计算概念内截面排序因子,基于: ['pct_chg', 'turnover_rate', 'volume_ratio']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Ranking Features in Concepts: 100%|██████████| 3/3 [00:00<00:00, 15.82it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"概念相关因子生成完毕。\n",
"4566757\n",
"开始计算股东增减持因子...\n",
"警告: 'in_de' 列中存在未映射的值,可能导致 _direction 列出现NaN。\n",
"股东增减持因子计算完成。\n",
"Calculating cat_senti_mom_vol_spike...\n",
"Finished cat_senti_mom_vol_spike.\n",
"Calculating cat_senti_pre_breakout...\n",
"Calculating atr_10 as it's missing...\n",
"Calculating atr_40 as it's missing...\n",
"Finished cat_senti_pre_breakout.\n",
"计算因子 ts_turnover_rate_acceleration_5_20\n",
"计算因子 ts_vol_sustain_10_30\n",
"计算因子 cs_amount_outlier_10\n",
"计算因子 ts_ff_to_total_turnover_ratio\n",
"计算因子 ts_price_volume_trend_coherence_5_20\n",
"计算因子 ts_ff_turnover_rate_surge_10\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"开始计算因子: AR, BR (原地修改)...\n",
"因子 AR, BR 计算成功。\n",
"因子 AR, BR 计算流程结束。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"计算 BBI...\n",
"--- 计算日级别偏离度 (使用 pct_chg) ---\n",
"--- 计算日级别动量基准 (使用 pct_chg) ---\n",
"日级别动量基准计算完成 (使用 pct_chg)。\n",
"日级别偏离度计算完成 (使用 pct_chg)。\n",
"--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---\n",
"--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---\n",
"错误: 计算日级别行业动量基准需要以下列: ['pct_chg', 'cat_l2_code', 'trade_date', 'ts_code']。\n",
"错误: 计算日级别行业偏离度需要以下列: ['pct_chg', 'daily_industry_positive_benchmark', 'daily_industry_negative_benchmark']。请先运行 daily_industry_momentum_benchmark(df)。\n",
"Index(['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol',\n",
" 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',\n",
" 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol',\n",
" 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol',\n",
" 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
" 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate',\n",
" 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol',\n",
" 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol',\n",
" 'lg_elg_buy_prop', 'flow_struct_buy_change',\n",
" 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel',\n",
" 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy',\n",
" 'cost_support_15pct_change', 'cat_winner_price_zone',\n",
" 'flow_chip_consistency', 'profit_taking_vs_absorb', '_is_positive',\n",
" '_is_negative', 'cat_is_positive', '_pos_returns', '_neg_returns',\n",
" '_pos_returns_sq', '_neg_returns_sq', 'upside_vol', 'downside_vol',\n",
" 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate',\n",
" 'pct_chg', 'amount', 'turnover_rate',\n",
" ...\n",
" 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike',\n",
" 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike',\n",
" 'vol_std_5', 'atr_14', 'atr_6', 'obv'],\n",
" dtype='object')\n",
" dtype='object', length=103)\n",
"Calculating senti_strong_inflow...\n",
"Finished senti_strong_inflow.\n",
"Calculating lg_flow_mom_corr_20_60...\n",
"Finished lg_flow_mom_corr_20_60.\n",
"Calculating lg_buy_consolidation_20...\n",
"Finished lg_buy_consolidation_20.\n",
"Calculating lg_flow_accel...\n",
"Finished lg_flow_accel.\n",
"Calculating profit_pressure...\n",
@@ -155,58 +346,73 @@
"Finished vol_wgt_hist_pos_20.\n",
"Calculating vol_adj_roc_20...\n",
"Finished vol_adj_roc_20.\n",
"Calculating intraday_lg_flow_corr_20 (Placeholder - complex implementation)...\n",
"Finished intraday_lg_flow_corr_20 (Placeholder).\n",
"Calculating cap_neutral_cost_metric (Placeholder - requires statsmodels)...\n",
"Finished cap_neutral_cost_metric (Placeholder).\n"
"Calculating cs_rank_net_lg_flow_val...\n",
"Finished cs_rank_net_lg_flow_val.\n",
"Calculating cs_rank_flow_divergence...\n",
"Finished cs_rank_flow_divergence.\n",
"Calculating cs_rank_ind_adj_lg_flow...\n",
"Error calculating cs_rank_ind_adj_lg_flow: Missing 'cat_l2_code' column. Assigning NaN.\n",
"Calculating cs_rank_elg_buy_ratio...\n",
"Finished cs_rank_elg_buy_ratio.\n",
"Calculating cs_rank_rel_profit_margin...\n",
"Finished cs_rank_rel_profit_margin.\n",
"Calculating cs_rank_cost_breadth...\n",
"Finished cs_rank_cost_breadth.\n",
"Calculating cs_rank_dist_to_upper_cost...\n",
"Finished cs_rank_dist_to_upper_cost.\n",
"Calculating cs_rank_winner_rate...\n",
"Finished cs_rank_winner_rate.\n",
"Calculating cs_rank_intraday_range...\n",
"Finished cs_rank_intraday_range.\n",
"Calculating cs_rank_close_pos_in_range...\n",
"Finished cs_rank_close_pos_in_range.\n",
"Calculating cs_rank_opening_gap...\n",
"Error calculating cs_rank_opening_gap: Missing 'pre_close' column. Assigning NaN.\n",
"Calculating cs_rank_pos_in_hist_range...\n",
"Finished cs_rank_pos_in_hist_range.\n",
"Calculating cs_rank_vol_x_profit_margin...\n",
"Finished cs_rank_vol_x_profit_margin.\n",
"Calculating cs_rank_lg_flow_price_concordance...\n",
"Finished cs_rank_lg_flow_price_concordance.\n",
"Calculating cs_rank_turnover_per_winner...\n",
"Finished cs_rank_turnover_per_winner.\n",
"Calculating cs_rank_ind_cap_neutral_pe (Placeholder - requires statsmodels)...\n",
"Finished cs_rank_ind_cap_neutral_pe (Placeholder).\n",
"Calculating cs_rank_volume_ratio...\n",
"Finished cs_rank_volume_ratio.\n",
"Calculating cs_rank_elg_buy_sell_sm_ratio...\n",
"Finished cs_rank_elg_buy_sell_sm_ratio.\n",
"Calculating cs_rank_cost_dist_vol_ratio...\n",
"Finished cs_rank_cost_dist_vol_ratio.\n",
"Calculating cs_rank_size...\n",
"Finished cs_rank_size.\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 4566757 entries, 0 to 4566756\n",
"Columns: 197 entries, ts_code to cs_rank_size\n",
"dtypes: bool(10), datetime64[ns](1), float64(176), int64(6), int8(1), object(3)\n",
"memory usage: 6.4+ GB\n",
"None\n",
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'cat_hot_concept_stock', 'concept_rank_pct_chg', 'concept_rank_turnover_rate', 'concept_rank_volume_ratio', 'holder_net_change_sum_10d', 'holder_increase_days_10d', 'holder_decrease_days_10d', 'holder_any_increase_flag_10d', 'holder_any_decrease_flag_10d', 'holder_direction_score_10d', 'cat_senti_mom_vol_spike', 'cat_senti_pre_breakout', 'ts_turnover_rate_acceleration_5_20', 'ts_vol_sustain_10_30', 'cs_amount_outlier_10', 'ts_ff_to_total_turnover_ratio', 'ts_price_volume_trend_coherence_5_20', 'ts_ff_turnover_rate_surge_10', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'price_cost_divergence', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'senti_strong_inflow', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_flow_divergence', 'cs_rank_ind_adj_lg_flow', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_opening_gap', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_ind_cap_neutral_pe', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size']\n"
]
}
],
"source": [
"print('daily data')\n",
"df = read_and_merge_h5_data('../../data/daily_data.h5', key='daily_data',\n",
" columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg'],\n",
" df=None)\n",
"\n",
"print('daily basic')\n",
"df = read_and_merge_h5_data('../../data/daily_basic.h5', key='daily_basic',\n",
" columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',\n",
" 'is_st'], df=df, join='inner')\n",
"df = df[df['trade_date'] >= '2021-01-01']\n",
"\n",
"print('stk limit')\n",
"df = read_and_merge_h5_data('../../data/stk_limit.h5', key='stk_limit',\n",
" columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],\n",
" df=df)\n",
"print('money flow')\n",
"df = read_and_merge_h5_data('../../data/money_flow.h5', key='money_flow',\n",
" columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol',\n",
" 'sell_lg_vol',\n",
" 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],\n",
" df=df)\n",
"print('cyq perf')\n",
"df = read_and_merge_h5_data('../../data/cyq_perf.h5', key='cyq_perf',\n",
" columns=['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
" 'cost_50pct',\n",
" 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],\n",
" df=df)\n",
"print(df.info())\n",
"\n",
"origin_columns = df.columns.tolist()\n",
"origin_columns = [col for col in origin_columns if 'cyq' not in col]\n",
"print(origin_columns)\n",
"import numpy as np\n",
"from main.factor.factor import *\n",
"from main.factor.money_factor import * \n",
"from main.factor.concept_factor import * \n",
"\n",
"\n",
"def filter_data(df):\n",
" # df = df.groupby('trade_date').apply(lambda x: x.nlargest(1000, 'act_factor1'))\n",
" df = df[~df['is_st']]\n",
" df = df[~df['ts_code'].str.endswith('BJ')]\n",
" df = df[~df['ts_code'].str.startswith('30')]\n",
" df = df[~df['ts_code'].str.startswith('68')]\n",
" df = df[~df['ts_code'].str.startswith('8')]\n",
" df = df[df['trade_date'] >= '2022-01-01']\n",
" if 'in_date' in df.columns:\n",
" df = df.drop(columns=['in_date'])\n",
" df = df[~df[\"is_st\"]]\n",
" df = df[~df[\"ts_code\"].str.endswith(\"BJ\")]\n",
" df = df[~df[\"ts_code\"].str.startswith(\"30\")]\n",
" df = df[~df[\"ts_code\"].str.startswith(\"68\")]\n",
" df = df[~df[\"ts_code\"].str.startswith(\"8\")]\n",
" df = df[df[\"trade_date\"] >= \"2019-01-01\"]\n",
" if \"in_date\" in df.columns:\n",
" df = df.drop(columns=[\"in_date\"])\n",
" df = df.reset_index(drop=True)\n",
" return df\n",
"\n",
@@ -214,11 +420,70 @@
"gc.collect()\n",
"\n",
"df = filter_data(df)\n",
"df = df.sort_values(by=[\"ts_code\", \"trade_date\"])\n",
"\n",
"# df = price_minus_deduction_price(df, n=120)\n",
"# df = price_deduction_price_diff_ratio_to_sma(df, n=120)\n",
"# df = cat_price_vs_sma_vs_deduction_price(df, n=120)\n",
"# df = cat_reason(df, top_list_df)\n",
"# df = cat_is_on_top_list(df, top_list_df)\n",
"print(len(df))\n",
"df = generate_concept_factors(df, concept_dict)\n",
"print(len(df))\n",
"\n",
"df = holder_trade_factors(df, stk_holdertrade_df)\n",
"\n",
"df = cat_senti_mom_vol_spike(\n",
" df,\n",
" return_period=3,\n",
" return_threshold=0.03, # 近3日涨幅超3%\n",
" volume_ratio_threshold=1.3,\n",
" current_pct_chg_min=0.0, # 当日必须收红\n",
" current_pct_chg_max=0.05,\n",
") # 当日涨幅不宜过大\n",
"\n",
"df = cat_senti_pre_breakout(\n",
" df,\n",
" atr_short_N=10,\n",
" atr_long_M=40,\n",
" vol_atrophy_N=10,\n",
" vol_atrophy_M=40,\n",
" price_stab_N=5,\n",
" price_stab_threshold=0.06,\n",
" current_pct_chg_min_signal=0.002,\n",
" current_pct_chg_max_signal=0.05,\n",
" volume_ratio_signal_threshold=1.1,\n",
")\n",
"\n",
"df = ts_turnover_rate_acceleration_5_20(df)\n",
"df = ts_vol_sustain_10_30(df)\n",
"# df = cs_turnover_rate_relative_strength_20(df)\n",
"df = cs_amount_outlier_10(df)\n",
"df = ts_ff_to_total_turnover_ratio(df)\n",
"df = ts_price_volume_trend_coherence_5_20(df)\n",
"# df = ts_turnover_rate_trend_strength_5(df)\n",
"df = ts_ff_turnover_rate_surge_10(df)\n",
"\n",
"df = add_financial_factor(df, fina_indicator_df, factor_value_col=\"undist_profit_ps\")\n",
"df = add_financial_factor(df, fina_indicator_df, factor_value_col=\"ocfps\")\n",
"calculate_arbr(df, N=26)\n",
"df[\"log_circ_mv\"] = np.log(df[\"circ_mv\"])\n",
"df = calculate_cashflow_to_ev_factor(df, cashflow_df, balancesheet_df)\n",
"df = caculate_book_to_price_ratio(df, fina_indicator_df)\n",
"df = turnover_rate_n(df, n=5)\n",
"df = variance_n(df, n=20)\n",
"df = bbi_ratio_factor(df)\n",
"df = daily_deviation(df)\n",
"df = daily_industry_deviation(df)\n",
"df, _ = get_rolling_factor(df)\n",
"df, _ = get_simple_factor(df)\n",
"from main.factor.factor import *\n",
"\n",
"df = calculate_strong_inflow_signal(df)\n",
"\n",
"df = df.rename(columns={\"l1_code\": \"cat_l1_code\"})\n",
"df = df.rename(columns={\"l2_code\": \"cat_l2_code\"})\n",
"\n",
"lg_flow_mom_corr(df, N=20, M=60)\n",
"lg_buy_consolidation(df, N=20)\n",
"lg_flow_accel(df)\n",
"profit_pressure(df)\n",
"underwater_resistance(df)\n",
@@ -234,12 +499,57 @@
"pullback_strong(df, N=20, M=20)\n",
"vol_wgt_hist_pos(df, N=20)\n",
"vol_adj_roc(df, N=20)\n",
"intraday_lg_flow_corr(df, N=20) # Placeholder\n",
"cap_neutral_cost_metric(df) # Placeholder\n",
"# hurst_exponent_flow(df, N=60) # Placeholder\n",
"# df['test'] = 1\n",
"# df['test2'] = 2\n",
"# df = df.merge(industry_df, on=['l2_code', 'trade_date'], how='left')\n",
"\n",
"cs_rank_net_lg_flow_val(df)\n",
"cs_rank_flow_divergence(df)\n",
"cs_rank_industry_adj_lg_flow(df) # Needs cat_l2_code\n",
"cs_rank_elg_buy_ratio(df)\n",
"cs_rank_rel_profit_margin(df)\n",
"cs_rank_cost_breadth(df)\n",
"cs_rank_dist_to_upper_cost(df)\n",
"cs_rank_winner_rate(df)\n",
"cs_rank_intraday_range(df)\n",
"cs_rank_close_pos_in_range(df)\n",
"cs_rank_opening_gap(df) # Needs pre_close\n",
"cs_rank_pos_in_hist_range(df) # Needs his_low, his_high\n",
"cs_rank_vol_x_profit_margin(df)\n",
"cs_rank_lg_flow_price_concordance(df)\n",
"cs_rank_turnover_per_winner(df)\n",
"cs_rank_ind_cap_neutral_pe(df) # Placeholder - needs external libraries\n",
"cs_rank_volume_ratio(df) # Needs volume_ratio\n",
"cs_rank_elg_buy_sell_sm_ratio(df)\n",
"cs_rank_cost_dist_vol_ratio(df) # Needs volume_ratio\n",
"cs_rank_size(df) # Needs circ_mv\n",
"\n",
"\n",
"# df = df.merge(index_data, on='trade_date', how='left')\n",
"\n",
"print(df.info())\n",
"print(df.columns.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "48712034",
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "File ../../data/industry_data.h5 does not exist",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m l2_df = \u001b[43mread_and_merge_h5_data\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m../../data/industry_data.h5\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mindustry_data\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mts_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43ml2_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43min_date\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mts_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjoin\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mleft\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m df = merge_with_industry_data(df, l2_df)\n\u001b[32m 5\u001b[39m df = df.rename(columns={\u001b[33m'\u001b[39m\u001b[33ml2_code\u001b[39m\u001b[33m'\u001b[39m: \u001b[33m'\u001b[39m\u001b[33mcat_l2_code\u001b[39m\u001b[33m'\u001b[39m})\n",
"\u001b[36mFile \u001b[39m\u001b[32m/mnt/d/PyProject/NewStock/main/utils/utils.py:14\u001b[39m, in \u001b[36mread_and_merge_h5_data\u001b[39m\u001b[34m(h5_filename, key, columns, df, join, on, prefix)\u001b[39m\n\u001b[32m 11\u001b[39m processed_columns.append(col)\n\u001b[32m 13\u001b[39m \u001b[38;5;66;03m# 从 HDF5 文件读取数据,选择需要的列\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m data = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_hdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mh5_filename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprocessed_columns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 16\u001b[39m \u001b[38;5;66;03m# 修改列名,如果列名以前有 _加上 _\u001b[39;00m\n\u001b[32m 17\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m data.columns:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/stock/lib/python3.13/site-packages/pandas/io/pytables.py:424\u001b[39m, in \u001b[36mread_hdf\u001b[39m\u001b[34m(path_or_buf, key, mode, errors, where, start, stop, columns, iterator, chunksize, **kwargs)\u001b[39m\n\u001b[32m 421\u001b[39m exists = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 423\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m exists:\n\u001b[32m--> \u001b[39m\u001b[32m424\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFile \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_buf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m does not exist\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 426\u001b[39m store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)\n\u001b[32m 427\u001b[39m \u001b[38;5;66;03m# can't auto open/close if we are using an iterator\u001b[39;00m\n\u001b[32m 428\u001b[39m \u001b[38;5;66;03m# so delegate to the iterator\u001b[39;00m\n",
"\u001b[31mFileNotFoundError\u001b[39m: File ../../data/industry_data.h5 does not exist"
]
}
],
"source": [
"\n",
"l2_df = read_and_merge_h5_data('../../data/industry_data.h5', key='industry_data',\n",
" columns=['ts_code', 'l2_code', 'in_date'],\n",
" df=None, on=['ts_code'], join='left')\n",
@@ -247,7 +557,7 @@
"df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
"# df = df.merge(index_data, on='trade_date', how='left')\n",
"\n",
"days = 2\n",
"days = 5\n",
"df = df.sort_values(by=['ts_code', 'trade_date'])\n",
"# df['future_return'] = df.groupby('ts_code', group_keys=False)['close'].apply(lambda x: x.shift(-days) / x - 1)\n",
"df['future_return'] = (df.groupby('ts_code')['close'].shift(-days) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
@@ -265,7 +575,7 @@
"\n",
"def select_pre_zt_stocks_dynamic(stock_df):\n",
" def select_stocks(group):\n",
" return group.nlargest(1000, 'return_5') # 如果循环结束仍未找到足够标签,则返回最大数量的股票\n",
" return group.nsmallest(1000, 'total_mv') # 如果循环结束仍未找到足够标签,则返回最大数量的股票\n",
"\n",
" stock_df = stock_df.groupby('trade_date', group_keys=False).apply(select_stocks)\n",
" return stock_df\n",
@@ -281,7 +591,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "1c1dd3d6",
"metadata": {},
"outputs": [
@@ -316,7 +626,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "2c60c1ea",
"metadata": {},
"outputs": [
@@ -541,7 +851,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "e088bd8a357e815a",
"metadata": {
"ExecuteTime": {
@@ -785,7 +1095,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "a0b3d7551ef0c81f",
"metadata": {
"ExecuteTime": {
@@ -1006,7 +1316,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "new_trader",
"display_name": "stock",
"language": "python",
"name": "python3"
},
@@ -1020,7 +1330,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
"version": "3.13.2"
}
},
"nbformat": 4,