{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T07:26:19.000054Z",
"start_time": "2025-04-13T07:26:18.895713Z"
},
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/mnt/d/PyProject/NewStock\n"
]
}
],
"source": [
"import gc\n",
"import os\n",
"import sys\n",
"sys.path.append('../../')\n",
"print(os.getcwd())\n",
"import pandas as pd\n",
"from main.factor.factor import get_rolling_factor, get_simple_factor\n",
"from main.utils.factor import read_industry_data\n",
"from main.utils.factor_processor import calculate_score\n",
"from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data\n",
"\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f1623b04c7a366af",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T07:30:48.534271Z",
"start_time": "2025-04-13T07:26:19.005576Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"daily data\n",
"daily basic\n",
"inner merge on ['ts_code', 'trade_date']\n",
"stk limit\n",
"left merge on ['ts_code', 'trade_date']\n",
"money flow\n",
"left merge on ['ts_code', 'trade_date']\n",
"cyq perf\n",
"left merge on ['ts_code', 'trade_date']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 8713571 entries, 0 to 8713570\n",
"Data columns (total 33 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object \n",
" 1 trade_date datetime64[ns]\n",
" 2 open float64 \n",
" 3 close float64 \n",
" 4 high float64 \n",
" 5 low float64 \n",
" 6 vol float64 \n",
" 7 pct_chg float64 \n",
" 8 amount float64 \n",
" 9 turnover_rate float64 \n",
" 10 pe_ttm float64 \n",
" 11 circ_mv float64 \n",
" 12 total_mv float64 \n",
" 13 volume_ratio float64 \n",
" 14 is_st bool \n",
" 15 up_limit float64 \n",
" 16 down_limit float64 \n",
" 17 buy_sm_vol float64 \n",
" 18 sell_sm_vol float64 \n",
" 19 buy_lg_vol float64 \n",
" 20 sell_lg_vol float64 \n",
" 21 buy_elg_vol float64 \n",
" 22 sell_elg_vol float64 \n",
" 23 net_mf_vol float64 \n",
" 24 his_low float64 \n",
" 25 his_high float64 \n",
" 26 cost_5pct float64 \n",
" 27 cost_15pct float64 \n",
" 28 cost_50pct float64 \n",
" 29 cost_85pct float64 \n",
" 30 cost_95pct float64 \n",
" 31 weight_avg float64 \n",
" 32 winner_rate float64 \n",
"dtypes: bool(1), datetime64[ns](1), float64(30), object(1)\n",
"memory usage: 2.1+ GB\n",
"None\n"
]
}
],
"source": [
"from main.utils.utils import read_and_merge_h5_data\n",
"\n",
"print('daily data')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/daily_data.h5', key='daily_data',\n",
" columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount'],\n",
" df=None)\n",
"\n",
"print('daily basic')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/daily_basic.h5', key='daily_basic',\n",
" columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio',\n",
" 'is_st'], df=df, join='inner')\n",
"\n",
"print('stk limit')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/stk_limit.h5', key='stk_limit',\n",
" columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],\n",
" df=df)\n",
"print('money flow')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/money_flow.h5', key='money_flow',\n",
" columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol',\n",
" 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],\n",
" df=df)\n",
"print('cyq perf')\n",
"df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/cyq_perf.h5', key='cyq_perf',\n",
" columns=['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
" 'cost_50pct',\n",
" 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],\n",
" df=df)\n",
"print(df.info())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0acb6625",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate']\n"
]
}
],
"source": [
"\n",
"origin_columns = df.columns.tolist()\n",
"origin_columns = [col for col in origin_columns if 'cyq' not in col]\n",
"print(origin_columns)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "820a6b50",
"metadata": {},
"outputs": [],
"source": [
"fina_indicator_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/fina_indicator.h5', key='fina_indicator',\n",
" columns=['ts_code', 'ann_date', 'undist_profit_ps', 'ocfps', 'bps'],\n",
" df=None)\n",
"cashflow_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/cashflow.h5', key='cashflow',\n",
" columns=['ts_code', 'ann_date', 'n_cashflow_act'],\n",
" df=None)\n",
"balancesheet_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/balancesheet.h5', key='balancesheet',\n",
" columns=['ts_code', 'ann_date', 'money_cap', 'total_liab'],\n",
" df=None)\n",
"top_list_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/top_list.h5', key='top_list',\n",
" columns=['ts_code', 'trade_date', 'reason'],\n",
" df=None)\n",
"\n",
"top_list_df = top_list_df.sort_values(by='trade_date', ascending=False).drop_duplicates(subset=['ts_code', 'trade_date'], keep='first').sort_values(by='trade_date')\n",
"\n",
"stk_holdertrade_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/stk_holdertrade.h5', key='stk_holdertrade',\n",
" columns=['ts_code', 'ann_date', 'in_de', 'change_ratio'],\n",
" df=None)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "903469a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ 成功从 Redis Hash 'concept_stocks_daily_lists_pickle' 读取 1794 条每日概念股票数据。\n"
]
}
],
"source": [
"import redis\n",
"import pickle\n",
"from datetime import date, datetime\n",
"\n",
"# --- 配置 Redis 连接 ---\n",
"REDIS_HOST = '140.143.91.66'\n",
"REDIS_PORT = 6389\n",
"REDIS_DB = 0\n",
"\n",
"# --- 定义 Redis 键名 ---\n",
"HASH_KEY = \"concept_stocks_daily_lists_pickle\" # 区分之前的 JSON 版本\n",
"MAX_DATE_KEY = \"concept_stocks_max_date_pickle\" # 区分之前的 JSON 版本\n",
"\n",
"concept_dict = {}\n",
"\n",
"# --- 连接 Redis ---\n",
"try:\n",
" r = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password='Redis520102')\n",
"\n",
" all_data_from_redis = r.hgetall(HASH_KEY) # 返回的是字典,键是字节,值是字节\n",
" \n",
" if all_data_from_redis:\n",
" for date_bytes, stocks_bytes in all_data_from_redis.items(): # 将变量名改为 date_bytes 更清晰\n",
" try:\n",
" # *** 修正点:将日期字节解码为字符串 ***\n",
" date_str = date_bytes.decode('utf-8') \n",
" date_obj = datetime.strptime(date_str, '%Y%m%d').date()\n",
" \n",
" stocks_list = pickle.loads(stocks_bytes)\n",
" concept_dict[date_obj] = stocks_list\n",
" except (ValueError, pickle.UnpicklingError) as e:\n",
" print(f\"⚠️ 警告: 解析 Redis 数据时出错 (日期键: '{date_bytes.decode('utf-8', errors='ignore')}'),跳过此条数据: {e}\") # 打印警告时也解码一下\n",
" print(f\"✅ 成功从 Redis Hash '{HASH_KEY}' 读取 {len(concept_dict)} 条每日概念股票数据。\")\n",
" else:\n",
" print(f\"ℹ ️ Redis Hash '{HASH_KEY}' 中没有找到任何数据。\")\n",
"\n",
"except redis.exceptions.ConnectionError as e:\n",
" print(f\"❌ 错误: 无法连接到 Redis 服务器,请检查 Redis 是否正在运行或连接配置: {e}\")\n",
"except Exception as e:\n",
" print(f\"❌ 从 Redis 读取数据时发生未知错误: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "afb8da3d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4566757\n",
"开始生成概念相关因子...\n",
"开始计算概念内截面排序因子,基于: ['pct_chg', 'turnover_rate', 'volume_ratio']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Ranking Features in Concepts: 100%|██████████| 3/3 [00:00<00:00, 15.82it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"概念相关因子生成完毕。\n",
"4566757\n",
"开始计算股东增减持因子...\n",
"警告: 'in_de' 列中存在未映射的值,可能导致 _direction 列出现NaN。\n",
"股东增减持因子计算完成。\n",
"Calculating cat_senti_mom_vol_spike...\n",
"Finished cat_senti_mom_vol_spike.\n",
"Calculating cat_senti_pre_breakout...\n",
"Calculating atr_10 as it's missing...\n",
"Calculating atr_40 as it's missing...\n",
"Finished cat_senti_pre_breakout.\n",
"计算因子 ts_turnover_rate_acceleration_5_20\n",
"计算因子 ts_vol_sustain_10_30\n",
"计算因子 cs_amount_outlier_10\n",
"计算因子 ts_ff_to_total_turnover_ratio\n",
"计算因子 ts_price_volume_trend_coherence_5_20\n",
"计算因子 ts_ff_turnover_rate_surge_10\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"开始计算因子: AR, BR (原地修改)...\n",
"因子 AR, BR 计算成功。\n",
"因子 AR, BR 计算流程结束。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"计算 BBI...\n",
"--- 计算日级别偏离度 (使用 pct_chg) ---\n",
"--- 计算日级别动量基准 (使用 pct_chg) ---\n",
"日级别动量基准计算完成 (使用 pct_chg)。\n",
"日级别偏离度计算完成 (使用 pct_chg)。\n",
"--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---\n",
"--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---\n",
"错误: 计算日级别行业动量基准需要以下列: ['pct_chg', 'cat_l2_code', 'trade_date', 'ts_code']。\n",
"错误: 计算日级别行业偏离度需要以下列: ['pct_chg', 'daily_industry_positive_benchmark', 'daily_industry_negative_benchmark']。请先运行 daily_industry_momentum_benchmark(df)。\n",
"Index(['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol',\n",
" 'pct_chg', 'amount', 'turnover_rate',\n",
" ...\n",
" 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike',\n",
" 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike',\n",
" 'vol_std_5', 'atr_14', 'atr_6', 'obv'],\n",
" dtype='object', length=103)\n",
"Calculating senti_strong_inflow...\n",
"Finished senti_strong_inflow.\n",
"Calculating lg_flow_mom_corr_20_60...\n",
"Finished lg_flow_mom_corr_20_60.\n",
"Calculating lg_flow_accel...\n",
"Finished lg_flow_accel.\n",
"Calculating profit_pressure...\n",
"Finished profit_pressure.\n",
"Calculating underwater_resistance...\n",
"Finished underwater_resistance.\n",
"Calculating cost_conc_std_20...\n",
"Finished cost_conc_std_20.\n",
"Calculating profit_decay_20...\n",
"Finished profit_decay_20.\n",
"Calculating vol_amp_loss_20...\n",
"Finished vol_amp_loss_20.\n",
"Calculating vol_drop_profit_cnt_5...\n",
"Finished vol_drop_profit_cnt_5.\n",
"Calculating lg_flow_vol_interact_20...\n",
"Finished lg_flow_vol_interact_20.\n",
"Calculating cost_break_confirm_cnt_5...\n",
"Finished cost_break_confirm_cnt_5.\n",
"Calculating atr_norm_channel_pos_14...\n",
"Finished atr_norm_channel_pos_14.\n",
"Calculating turnover_diff_skew_20...\n",
"Finished turnover_diff_skew_20.\n",
"Calculating lg_sm_flow_diverge_20...\n",
"Finished lg_sm_flow_diverge_20.\n",
"Calculating pullback_strong_20_20...\n",
"Finished pullback_strong_20_20.\n",
"Calculating vol_wgt_hist_pos_20...\n",
"Finished vol_wgt_hist_pos_20.\n",
"Calculating vol_adj_roc_20...\n",
"Finished vol_adj_roc_20.\n",
"Calculating cs_rank_net_lg_flow_val...\n",
"Finished cs_rank_net_lg_flow_val.\n",
"Calculating cs_rank_flow_divergence...\n",
"Finished cs_rank_flow_divergence.\n",
"Calculating cs_rank_ind_adj_lg_flow...\n",
"Error calculating cs_rank_ind_adj_lg_flow: Missing 'cat_l2_code' column. Assigning NaN.\n",
"Calculating cs_rank_elg_buy_ratio...\n",
"Finished cs_rank_elg_buy_ratio.\n",
"Calculating cs_rank_rel_profit_margin...\n",
"Finished cs_rank_rel_profit_margin.\n",
"Calculating cs_rank_cost_breadth...\n",
"Finished cs_rank_cost_breadth.\n",
"Calculating cs_rank_dist_to_upper_cost...\n",
"Finished cs_rank_dist_to_upper_cost.\n",
"Calculating cs_rank_winner_rate...\n",
"Finished cs_rank_winner_rate.\n",
"Calculating cs_rank_intraday_range...\n",
"Finished cs_rank_intraday_range.\n",
"Calculating cs_rank_close_pos_in_range...\n",
"Finished cs_rank_close_pos_in_range.\n",
"Calculating cs_rank_opening_gap...\n",
"Error calculating cs_rank_opening_gap: Missing 'pre_close' column. Assigning NaN.\n",
"Calculating cs_rank_pos_in_hist_range...\n",
"Finished cs_rank_pos_in_hist_range.\n",
"Calculating cs_rank_vol_x_profit_margin...\n",
"Finished cs_rank_vol_x_profit_margin.\n",
"Calculating cs_rank_lg_flow_price_concordance...\n",
"Finished cs_rank_lg_flow_price_concordance.\n",
"Calculating cs_rank_turnover_per_winner...\n",
"Finished cs_rank_turnover_per_winner.\n",
"Calculating cs_rank_ind_cap_neutral_pe (Placeholder - requires statsmodels)...\n",
"Finished cs_rank_ind_cap_neutral_pe (Placeholder).\n",
"Calculating cs_rank_volume_ratio...\n",
"Finished cs_rank_volume_ratio.\n",
"Calculating cs_rank_elg_buy_sell_sm_ratio...\n",
"Finished cs_rank_elg_buy_sell_sm_ratio.\n",
"Calculating cs_rank_cost_dist_vol_ratio...\n",
"Finished cs_rank_cost_dist_vol_ratio.\n",
"Calculating cs_rank_size...\n",
"Finished cs_rank_size.\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 4566757 entries, 0 to 4566756\n",
"Columns: 197 entries, ts_code to cs_rank_size\n",
"dtypes: bool(10), datetime64[ns](1), float64(176), int64(6), int8(1), object(3)\n",
"memory usage: 6.4+ GB\n",
"None\n",
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'cat_hot_concept_stock', 'concept_rank_pct_chg', 'concept_rank_turnover_rate', 'concept_rank_volume_ratio', 'holder_net_change_sum_10d', 'holder_increase_days_10d', 'holder_decrease_days_10d', 'holder_any_increase_flag_10d', 'holder_any_decrease_flag_10d', 'holder_direction_score_10d', 'cat_senti_mom_vol_spike', 'cat_senti_pre_breakout', 'ts_turnover_rate_acceleration_5_20', 'ts_vol_sustain_10_30', 'cs_amount_outlier_10', 'ts_ff_to_total_turnover_ratio', 'ts_price_volume_trend_coherence_5_20', 'ts_ff_turnover_rate_surge_10', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 
'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'price_cost_divergence', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'senti_strong_inflow', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_flow_divergence', 'cs_rank_ind_adj_lg_flow', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_opening_gap', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_ind_cap_neutral_pe', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size']\n"
]
}
],
"source": [
"import numpy as np\n",
"from main.factor.factor import *\n",
"from main.factor.money_factor import * \n",
"from main.factor.concept_factor import * \n",
"\n",
"\n",
"def filter_data(df):\n",
" # df = df.groupby('trade_date').apply(lambda x: x.nlargest(1000, 'act_factor1'))\n",
" df = df[~df[\"is_st\"]]\n",
" df = df[~df[\"ts_code\"].str.endswith(\"BJ\")]\n",
" df = df[~df[\"ts_code\"].str.startswith(\"30\")]\n",
" df = df[~df[\"ts_code\"].str.startswith(\"68\")]\n",
" df = df[~df[\"ts_code\"].str.startswith(\"8\")]\n",
" df = df[df[\"trade_date\"] >= \"2019-01-01\"]\n",
" if \"in_date\" in df.columns:\n",
" df = df.drop(columns=[\"in_date\"])\n",
" df = df.reset_index(drop=True)\n",
" return df\n",
"\n",
"\n",
"gc.collect()\n",
"\n",
"df = filter_data(df)\n",
"df = df.sort_values(by=[\"ts_code\", \"trade_date\"])\n",
"\n",
"# df = price_minus_deduction_price(df, n=120)\n",
"# df = price_deduction_price_diff_ratio_to_sma(df, n=120)\n",
"# df = cat_price_vs_sma_vs_deduction_price(df, n=120)\n",
"# df = cat_reason(df, top_list_df)\n",
"# df = cat_is_on_top_list(df, top_list_df)\n",
"print(len(df))\n",
"df = generate_concept_factors(df, concept_dict)\n",
"print(len(df))\n",
"\n",
"df = holder_trade_factors(df, stk_holdertrade_df)\n",
"\n",
"df = cat_senti_mom_vol_spike(\n",
" df,\n",
" return_period=3,\n",
" return_threshold=0.03, # 近3日涨幅超3%\n",
" volume_ratio_threshold=1.3,\n",
" current_pct_chg_min=0.0, # 当日必须收红\n",
" current_pct_chg_max=0.05,\n",
") # 当日涨幅不宜过大\n",
"\n",
"df = cat_senti_pre_breakout(\n",
" df,\n",
" atr_short_N=10,\n",
" atr_long_M=40,\n",
" vol_atrophy_N=10,\n",
" vol_atrophy_M=40,\n",
" price_stab_N=5,\n",
" price_stab_threshold=0.06,\n",
" current_pct_chg_min_signal=0.002,\n",
" current_pct_chg_max_signal=0.05,\n",
" volume_ratio_signal_threshold=1.1,\n",
")\n",
"\n",
"df = ts_turnover_rate_acceleration_5_20(df)\n",
"df = ts_vol_sustain_10_30(df)\n",
"# df = cs_turnover_rate_relative_strength_20(df)\n",
"df = cs_amount_outlier_10(df)\n",
"df = ts_ff_to_total_turnover_ratio(df)\n",
"df = ts_price_volume_trend_coherence_5_20(df)\n",
"# df = ts_turnover_rate_trend_strength_5(df)\n",
"df = ts_ff_turnover_rate_surge_10(df)\n",
"\n",
"df = add_financial_factor(df, fina_indicator_df, factor_value_col=\"undist_profit_ps\")\n",
"df = add_financial_factor(df, fina_indicator_df, factor_value_col=\"ocfps\")\n",
"calculate_arbr(df, N=26)\n",
"df[\"log_circ_mv\"] = np.log(df[\"circ_mv\"])\n",
"df = calculate_cashflow_to_ev_factor(df, cashflow_df, balancesheet_df)\n",
"df = caculate_book_to_price_ratio(df, fina_indicator_df)\n",
"df = turnover_rate_n(df, n=5)\n",
"df = variance_n(df, n=20)\n",
"df = bbi_ratio_factor(df)\n",
"df = daily_deviation(df)\n",
"df = daily_industry_deviation(df)\n",
"df, _ = get_rolling_factor(df)\n",
"df, _ = get_simple_factor(df)\n",
"\n",
"df = calculate_strong_inflow_signal(df)\n",
"\n",
"df = df.rename(columns={\"l1_code\": \"cat_l1_code\"})\n",
"df = df.rename(columns={\"l2_code\": \"cat_l2_code\"})\n",
"\n",
"lg_flow_mom_corr(df, N=20, M=60)\n",
"lg_flow_accel(df)\n",
"profit_pressure(df)\n",
"underwater_resistance(df)\n",
"cost_conc_std(df, N=20)\n",
"profit_decay(df, N=20)\n",
"vol_amp_loss(df, N=20)\n",
"vol_drop_profit_cnt(df, N=20, M=5)\n",
"lg_flow_vol_interact(df, N=20)\n",
"cost_break_confirm_cnt(df, M=5)\n",
"atr_norm_channel_pos(df, N=14)\n",
"turnover_diff_skew(df, N=20)\n",
"lg_sm_flow_diverge(df, N=20)\n",
"pullback_strong(df, N=20, M=20)\n",
"vol_wgt_hist_pos(df, N=20)\n",
"vol_adj_roc(df, N=20)\n",
"\n",
"cs_rank_net_lg_flow_val(df)\n",
"cs_rank_flow_divergence(df)\n",
"cs_rank_industry_adj_lg_flow(df) # Needs cat_l2_code\n",
"cs_rank_elg_buy_ratio(df)\n",
"cs_rank_rel_profit_margin(df)\n",
"cs_rank_cost_breadth(df)\n",
"cs_rank_dist_to_upper_cost(df)\n",
"cs_rank_winner_rate(df)\n",
"cs_rank_intraday_range(df)\n",
"cs_rank_close_pos_in_range(df)\n",
"cs_rank_opening_gap(df) # Needs pre_close\n",
"cs_rank_pos_in_hist_range(df) # Needs his_low, his_high\n",
"cs_rank_vol_x_profit_margin(df)\n",
"cs_rank_lg_flow_price_concordance(df)\n",
"cs_rank_turnover_per_winner(df)\n",
"cs_rank_ind_cap_neutral_pe(df) # Placeholder - needs external libraries\n",
"cs_rank_volume_ratio(df) # Needs volume_ratio\n",
"cs_rank_elg_buy_sell_sm_ratio(df)\n",
"cs_rank_cost_dist_vol_ratio(df) # Needs volume_ratio\n",
"cs_rank_size(df) # Needs circ_mv\n",
"\n",
"\n",
"# df = df.merge(index_data, on='trade_date', how='left')\n",
"\n",
"print(df.info())\n",
"print(df.columns.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "48712034",
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "File ../../data/industry_data.h5 does not exist",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m l2_df = \u001b[43mread_and_merge_h5_data\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m../../data/industry_data.h5\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mindustry_data\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mts_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43ml2_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43min_date\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mts_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjoin\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mleft\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m df = merge_with_industry_data(df, l2_df)\n\u001b[32m 5\u001b[39m df = df.rename(columns={\u001b[33m'\u001b[39m\u001b[33ml2_code\u001b[39m\u001b[33m'\u001b[39m: \u001b[33m'\u001b[39m\u001b[33mcat_l2_code\u001b[39m\u001b[33m'\u001b[39m})\n",
"\u001b[36mFile \u001b[39m\u001b[32m/mnt/d/PyProject/NewStock/main/utils/utils.py:14\u001b[39m, in \u001b[36mread_and_merge_h5_data\u001b[39m\u001b[34m(h5_filename, key, columns, df, join, on, prefix)\u001b[39m\n\u001b[32m 11\u001b[39m processed_columns.append(col)\n\u001b[32m 13\u001b[39m \u001b[38;5;66;03m# 从 HDF5 文件读取数据,选择需要的列\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m data = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_hdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mh5_filename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprocessed_columns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 16\u001b[39m \u001b[38;5;66;03m# 修改列名,如果列名以前有 _, 加上 _\u001b[39;00m\n\u001b[32m 17\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m data.columns:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/stock/lib/python3.13/site-packages/pandas/io/pytables.py:424\u001b[39m, in \u001b[36mread_hdf\u001b[39m\u001b[34m(path_or_buf, key, mode, errors, where, start, stop, columns, iterator, chunksize, **kwargs)\u001b[39m\n\u001b[32m 421\u001b[39m exists = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 423\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m exists:\n\u001b[32m--> \u001b[39m\u001b[32m424\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFile \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_buf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m does not exist\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 426\u001b[39m store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)\n\u001b[32m 427\u001b[39m \u001b[38;5;66;03m# can't auto open/close if we are using an iterator\u001b[39;00m\n\u001b[32m 428\u001b[39m \u001b[38;5;66;03m# so delegate to the iterator\u001b[39;00m\n",
"\u001b[31mFileNotFoundError\u001b[39m: File ../../data/industry_data.h5 does not exist"
]
}
],
"source": [
"\n",
"l2_df = read_and_merge_h5_data('../../data/industry_data.h5', key='industry_data',\n",
" columns=['ts_code', 'l2_code', 'in_date'],\n",
" df=None, on=['ts_code'], join='left')\n",
"df = merge_with_industry_data(df, l2_df)\n",
"df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
"# df = df.merge(index_data, on='trade_date', how='left')\n",
"\n",
"days = 5\n",
"df = df.sort_values(by=['ts_code', 'trade_date'])\n",
"# df['future_return'] = df.groupby('ts_code', group_keys=False)['close'].apply(lambda x: x.shift(-days) / x - 1)\n",
"df['future_return'] = (df.groupby('ts_code')['close'].shift(-days) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
" df.groupby('ts_code')['open'].shift(-1)\n",
"# df['future_return'] = df.groupby('ts_code')['pct_chg'].shift(-1)\n",
"df['future_return2'] = (df.groupby('ts_code')['close'].shift(-1) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
" df.groupby('ts_code')['open'].shift(-1)\n",
"\n",
"df['future_volatility'] = (\n",
" df.groupby('ts_code')['pct_chg']\n",
" .transform(lambda x: x.rolling(days).std().shift(-days))\n",
")\n",
"df['future_score'] = calculate_score(df, days=days, lambda_param=0.3)\n",
"\n",
"\n",
"def select_pre_zt_stocks_dynamic(stock_df):\n",
" def select_stocks(group):\n",
" return group.nsmallest(1000, 'total_mv') # 如果循环结束仍未找到足够标签,则返回最大数量的股票\n",
"\n",
" stock_df = stock_df.groupby('trade_date', group_keys=False).apply(select_stocks)\n",
" return stock_df\n",
"\n",
"\n",
"gc.collect()\n",
"\n",
"# df = select_pre_zt_stocks_dynamic(df[(df['trade_date'] >= '2022-01-01') & (df['trade_date'] <= '2029-04-07')])\n",
"\n",
"industry_df = read_industry_data('../../data/sw_daily.h5')\n",
"df = df.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c1dd3d6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 
'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_buy_consolidation_20', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'intraday_lg_flow_corr_20', 'cap_neutral_cost_metric', 'in_date', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
]
}
],
"source": [
"feature_columns = [col for col in df.columns if col in df.columns]\n",
"feature_columns = [col for col in feature_columns if col not in ['trade_date',\n",
" 'ts_code',\n",
" 'label']]\n",
"feature_columns = [col for col in feature_columns if 'future' not in col]\n",
"feature_columns = [col for col in feature_columns if 'label' not in col]\n",
"feature_columns = [col for col in feature_columns if 'score' not in col]\n",
"feature_columns = [col for col in feature_columns if 'gen' not in col]\n",
"feature_columns = [col for col in feature_columns if 'is_st' not in col]\n",
"feature_columns = [col for col in feature_columns if 'pe_ttm' not in col]\n",
"feature_columns = [col for col in feature_columns if 'cat_l2_code' not in col]\n",
"# feature_columns = [col for col in feature_columns if col not in origin_columns]\n",
"feature_columns = [col for col in feature_columns if not col.startswith('_')]\n",
"# feature_columns = [col for col in feature_columns if col not in ['ts_code', 'trade_date', 'vol_std_5', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_007', 'consecutive_up_limit', 'mv_volatility', 'volume_growth', 'mv_growth', 'arbr']]\n",
"\n",
"print(feature_columns)\n",
"numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
"numeric_columns = [col for col in numeric_columns if col in feature_columns]"
]
},
{
"cell_type": "code",
   "execution_count": null,
"id": "2c60c1ea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"每个特征列中的 NaN 值数量(字典形式):\n",
"ts_code: 0\n",
"trade_date: 0\n",
"open: 0\n",
"close: 0\n",
"high: 0\n",
"low: 0\n",
"vol: 0\n",
"pct_chg: 0\n",
"turnover_rate: 0\n",
"pe_ttm: 499616\n",
"circ_mv: 0\n",
"volume_ratio: 791\n",
"is_st: 0\n",
"up_limit: 0\n",
"down_limit: 0\n",
"buy_sm_vol: 7\n",
"sell_sm_vol: 7\n",
"buy_lg_vol: 7\n",
"sell_lg_vol: 7\n",
"buy_elg_vol: 7\n",
"sell_elg_vol: 7\n",
"net_mf_vol: 7\n",
"his_low: 24695\n",
"his_high: 24695\n",
"cost_5pct: 24695\n",
"cost_15pct: 24695\n",
"cost_50pct: 24695\n",
"cost_85pct: 24695\n",
"cost_95pct: 24695\n",
"weight_avg: 24695\n",
"winner_rate: 24695\n",
"lg_elg_net_buy_vol: 7\n",
"flow_lg_elg_intensity: 7\n",
"sm_net_buy_vol: 7\n",
"flow_divergence_diff: 7\n",
"flow_divergence_ratio: 7\n",
"total_buy_vol: 7\n",
"lg_elg_buy_prop: 7\n",
"flow_struct_buy_change: 3287\n",
"lg_elg_net_buy_vol_change: 3287\n",
"flow_lg_elg_accel: 6567\n",
"chip_concentration_range: 24695\n",
"chip_skewness: 24695\n",
"floating_chip_proxy: 24695\n",
"cost_support_15pct_change: 27855\n",
"cat_winner_price_zone: 0\n",
"flow_chip_consistency: 7\n",
"profit_taking_vs_absorb: 7\n",
"cat_is_positive: 0\n",
"upside_vol: 29581\n",
"downside_vol: 29655\n",
"vol_ratio: 0\n",
"return_skew: 13096\n",
"return_kurtosis: 13096\n",
"volume_change_rate: 29466\n",
"cat_volume_breakout: 0\n",
"turnover_deviation: 6548\n",
"cat_turnover_spike: 0\n",
"avg_volume_ratio: 7341\n",
"cat_volume_ratio_breakout: 0\n",
"vol_spike: 62074\n",
"vol_std_5: 16370\n",
"atr_14: 45836\n",
"atr_6: 19644\n",
"obv: 0\n",
"maobv_6: 16370\n",
"rsi_3: 9822\n",
"return_5: 16370\n",
"return_20: 65315\n",
"std_return_5: 16370\n",
"std_return_90: 291770\n",
"std_return_90_2: 323906\n",
"act_factor1: 16370\n",
"act_factor2: 42562\n",
"act_factor3: 65315\n",
"act_factor4: 194886\n",
"rank_act_factor1: 16370\n",
"rank_act_factor2: 42562\n",
"rank_act_factor3: 65315\n",
"log_circ_mv: 0\n",
"cov: 13096\n",
"delta_cov: 29466\n",
"alpha_22_improved: 62074\n",
"alpha_003: 0\n",
"alpha_007: 13120\n",
"alpha_013: 62074\n",
"cat_up_limit: 0\n",
"cat_down_limit: 0\n",
"up_limit_count_10d: 0\n",
"down_limit_count_10d: 0\n",
"consecutive_up_limit: 0\n",
"vol_break: 0\n",
"weight_roc5: 40531\n",
"price_cost_divergence: 93280\n",
"smallcap_concentration: 24695\n",
"cost_stability: 85077\n",
"high_cost_break_days: 13096\n",
"liquidity_risk: 53215\n",
"turnover_std: 62074\n",
"mv_volatility: 62074\n",
"volume_growth: 65315\n",
"mv_growth: 65315\n",
"arbr: 9822\n",
"momentum_factor: 29466\n",
"resonance_factor: 791\n",
"log_close: 0\n",
"cat_vol_spike: 0\n",
"up: 0\n",
"down: 0\n",
"obv_maobv_6: 16370\n",
"std_return_5_over_std_return_90: 291770\n",
"std_return_90_minus_std_return_90_2: 323906\n",
"cat_af2: 0\n",
"cat_af3: 0\n",
"cat_af4: 0\n",
"act_factor5: 194886\n",
"act_factor6: 42562\n",
"active_buy_volume_large: 13\n",
"active_buy_volume_big: 79\n",
"active_buy_volume_small: 7\n",
"buy_lg_vol_minus_sell_lg_vol: 8\n",
"buy_elg_vol_minus_sell_elg_vol: 69\n",
"ctrl_strength: 24695\n",
"low_cost_dev: 24695\n",
"asymmetry: 24701\n",
"lock_factor: 24695\n",
"cat_vol_break: 0\n",
"cost_atr_adj: 69060\n",
"cat_golden_resonance: 0\n",
"mv_turnover_ratio: 0\n",
"mv_adjusted_volume: 0\n",
"mv_weighted_turnover: 0\n",
"nonlinear_mv_volume: 0\n",
"mv_volume_ratio: 791\n",
"mv_momentum: 791\n",
"lg_flow_mom_corr_20_60: 1186\n",
"lg_buy_consolidation_20: 1950902\n",
"lg_flow_accel: 6567\n",
"profit_pressure: 24695\n",
"underwater_resistance: 24695\n",
"cost_conc_std_20: 29466\n",
"profit_decay_20: 0\n",
"vol_amp_loss_20: 53215\n",
"vol_drop_profit_cnt_5: 0\n",
"lg_flow_vol_interact_20: 29466\n",
"cost_break_confirm_cnt_5: 0\n",
"atr_norm_channel_pos_14: 0\n",
"turnover_diff_skew_20: 32740\n",
"lg_sm_flow_diverge_20: 29466\n",
"pullback_strong_20_20: 0\n",
"vol_wgt_hist_pos_20: 0\n",
"vol_adj_roc_20: 0\n",
"intraday_lg_flow_corr_20: 2431461\n",
"cap_neutral_cost_metric: 2431461\n",
"cat_l2_code: 290\n",
"in_date: 65486\n",
"future_return: 6548\n",
"future_return2: 3274\n",
"future_volatility: 6548\n",
"score: 6548\n",
"future_score: 6548\n",
"industry_obv: 11272\n",
"industry_return_5: 11272\n",
"industry_return_20: 11272\n",
"industry__ema_5: 11272\n",
"industry__ema_13: 11272\n",
"industry__ema_20: 11272\n",
"industry__ema_60: 11272\n",
"industry_act_factor1: 11272\n",
"industry_act_factor2: 11272\n",
"industry_act_factor3: 11272\n",
"industry_act_factor4: 11272\n",
"industry_act_factor5: 11272\n",
"industry_act_factor6: 11272\n",
"industry_rank_act_factor1: 11272\n",
"industry_rank_act_factor2: 11272\n",
"industry_rank_act_factor3: 11272\n",
"industry_return_5_percentile: 11272\n",
"industry_return_20_percentile: 11272\n",
"['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_std_5', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'std_return_5', 'act_factor1', 'rank_act_factor1', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_003', 'alpha_007', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'smallcap_concentration', 'high_cost_break_days', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'cat_af2', 'cat_af3', 'cat_af4', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 
'atr_norm_channel_pos_14', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
]
}
],
"source": [
"def count_nan_and_inf_per_feature(df: pd.DataFrame):\n",
" \"\"\"\n",
" 计算 DataFrame 中每个特征列的 NaN 和 Inf 值数量。\n",
"\n",
" Args:\n",
" df: 要分析的 pandas DataFrame。\n",
"\n",
" Returns:\n",
" 一个字典,包含两个 pandas Series: \n",
" - 'NaN_Count': 索引是列名,值是该列中 NaN 的数量。\n",
" - 'Inf_Count': 索引是列名,值是该列中 Inf 的数量。\n",
" \"\"\"\n",
" nan_counts = df.isna().sum()\n",
" # inf_counts = np.isinf(df).sum()\n",
" return nan_counts\n",
"\n",
"\n",
"nan_counts_series = count_nan_and_inf_per_feature(df)\n",
"\n",
"# 或者,如果您想以字典的形式获取结果:\n",
"nan_counts_dict = nan_counts_series.to_dict()\n",
"print(\"\\n每个特征列中的 NaN 值数量(字典形式):\")\n",
"for k, v in nan_counts_dict.items():\n",
" print(f'{k}: {v}')\n",
" if v > 30000 and k in feature_columns:\n",
" feature_columns.remove(k)\n",
"print(feature_columns)"
]
},
{
"cell_type": "code",
   "execution_count": null,
"id": "e088bd8a357e815a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T15:39:47.461434Z",
"start_time": "2025-04-13T15:39:44.369664Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gen\tnevals\tavg \tstd \tmin\tmax \n",
"0 \t64 \t-0.387605\t0.492269\t-1 \t0.84339\n",
"1 \t52 \t-0.0280574\t0.328787\t-1 \t0.84339\n",
"2 \t56 \t-0.0442643\t0.498012\t-1 \t0.84339\n",
"3 \t50 \t0.0843881 \t0.506873\t-1 \t0.84339\n",
"4 \t56 \t0.128797 \t0.586781\t-1 \t0.84339\n",
"5 \t52 \t0.107366 \t0.586957\t-1 \t0.918103\n",
"6 \t52 \t0.0602483 \t0.666345\t-1 \t0.918103\n",
"7 \t54 \t0.177717 \t0.561644\t-1 \t0.918103\n",
"8 \t57 \t0.206183 \t0.620791\t-1 \t0.957887\n",
"9 \t51 \t0.253306 \t0.667259\t-1 \t1.07875 \n",
"10 \t53 \t0.19914 \t0.681541\t-1 \t1.14356 \n",
"11 \t54 \t0.173093 \t0.752007\t-1 \t1.24408 \n",
"12 \t53 \t0.429303 \t0.636249\t-1.07606\t1.24408 \n",
"13 \t46 \t0.443469 \t0.754764\t-1.17052\t1.24408 \n",
"14 \t57 \t0.412168 \t0.719715\t-1.2066 \t1.24408 \n",
"15 \t47 \t0.420095 \t0.833547\t-1.20899\t1.25608 \n",
"16 \t46 \t0.516075 \t0.916347\t-1.16556\t1.25765 \n",
"17 \t48 \t0.52129 \t0.872883\t-1 \t1.30663 \n",
"18 \t53 \t0.530992 \t0.923366\t-1 \t1.3677 \n",
"19 \t54 \t0.569299 \t0.861833\t-1.39138\t1.3677 \n",
"20 \t51 \t0.538589 \t0.883032\t-1.12472\t1.3677 \n",
"21 \t49 \t0.684813 \t0.874059\t-1 \t1.3677 \n",
"22 \t46 \t0.659823 \t0.86879 \t-1.17051\t1.3677 \n",
"23 \t42 \t0.678971 \t0.886044\t-1.39138\t1.3677 \n",
"24 \t55 \t0.639381 \t0.905808\t-1.39138\t1.37645 \n",
"25 \t42 \t0.721136 \t0.915513\t-1.30205\t1.39372 \n",
"26 \t56 \t0.695918 \t0.849837\t-1.0437 \t1.39372 \n",
"27 \t56 \t0.465007 \t0.934313\t-1 \t1.39372 \n",
"28 \t51 \t0.714563 \t0.88635 \t-1.13547\t1.43745 \n",
"29 \t49 \t0.687478 \t0.84568 \t-1 \t1.43745 \n",
"30 \t50 \t0.646657 \t0.835957\t-1 \t1.43745 \n",
"31 \t49 \t0.615978 \t0.939622\t-1.04846\t1.43745 \n",
"32 \t49 \t0.654171 \t0.973861\t-1.12771\t1.43745 \n",
"\n",
"Best Factors Found:\n",
"Fitness: 1.4375, Factor 1: protected_div_torch(mul(protected_div_torch(add(return_kurtosis, profit_pressure), mul(cost_85pct, buy_elg_vol_minus_sell_elg_vol)), protected_div_torch(cost_break_confirm_cnt_5, pow(cos(lg_flow_vol_interact_20), cos(chip_concentration_range)))), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
"Fitness: 1.3937, Factor 2: protected_div_torch(mul(protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
"Fitness: 1.3843, Factor 3: protected_div_torch(mul(protected_div_torch(protected_div_torch(profit_pressure, pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n"
]
}
],
"source": [
"from deap import creator, gp, tools, base, algorithms\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from scipy.stats import spearmanr\n",
"import operator\n",
"\n",
    "# Protected division (PyTorch version).\n",
    "def protected_div_torch(left, right):\n",
    "    \"\"\"Element-wise left / right, substituting 1.0 wherever right == 0.\"\"\"\n",
    "    return torch.where(right != 0, left / right, torch.ones_like(left))\n",
"\n",
    "def generate_deap_factors_pytorch_v2(df: pd.DataFrame, numeric_columns: list, target_column: str = 'future_return', date_column: str = 'trade_date', params: dict = None, random_state: int = 42):\n",
    "    \"\"\"\n",
    "    Generate new candidate factors via genetic programming (deap), evaluating\n",
    "    candidate expressions with PyTorch operators and filtering NaN daily ICs.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): DataFrame holding the input factors and the target.\n",
    "        numeric_columns (list): Numeric factor column names used as GP terminals.\n",
    "        target_column (str): Target column name, defaults to 'future_return'.\n",
    "        date_column (str): Trade-date column used to group daily ICs.\n",
    "        params (dict): Parameters for the deap evolutionary algorithm.\n",
    "        random_state (int): Random seed for reproducibility.\n",
    "\n",
    "    Returns:\n",
    "        tuple: (HallOfFame with the best factor expressions, Statistics object).\n",
    "    \"\"\"\n",
    "    if params is None:\n",
    "        params = {}\n",
    "\n",
    "    # Seed both RNGs so runs are reproducible.\n",
    "    np.random.seed(random_state)\n",
    "    torch.manual_seed(random_state)\n",
    "\n",
    "    # 1. Define the primitive set, using PyTorch operators.\n",
    "    pset_torch = gp.PrimitiveSet(\"PYTORCH\", arity=len(numeric_columns))\n",
    "    pset_torch.addPrimitive(torch.add, 2)\n",
    "    pset_torch.addPrimitive(torch.sub, 2)\n",
    "    pset_torch.addPrimitive(torch.mul, 2)\n",
    "    pset_torch.addPrimitive(protected_div_torch, 2) # protected division (PyTorch version)\n",
    "    # Additional, more complex operators\n",
    "    pset_torch.addPrimitive(torch.sin, 1) # sine (unary)\n",
    "    pset_torch.addPrimitive(torch.cos, 1) # cosine (unary)\n",
    "    # pset_torch.addPrimitive(torch.abs, 1) # absolute value (unary)\n",
    "    # pset_torch.addPrimitive(torch.sqrt, 1) # square root (unary)\n",
    "    pset_torch.addPrimitive(torch.pow, 2) # exponentiation (binary, e.g. x to the power y)\n",
    "    # pset_torch.addPrimitive(torch.tanh, 1) # hyperbolic tangent (unary)\n",
    "\n",
    "    # def rate_of_change_torch(x, y): # rate of change of y relative to x\n",
    "    #     return (y - x) / (x + 1e-8)\n",
    "    # pset_torch.addPrimitive(rate_of_change_torch, 2)\n",
    "\n",
    "    # def covariance_like_torch(x, y):\n",
    "    #     mean_x = torch.mean(x, dim=0, keepdim=True) # keep dims for broadcasting\n",
    "    #     mean_y = torch.mean(y, dim=0, keepdim=True)\n",
    "    #     return (x - mean_x) * (y - mean_y)\n",
    "\n",
    "    # pset_torch.addPrimitive(covariance_like_torch, 2)\n",
    "\n",
    "    # Register numeric_columns as the terminals of the primitive set.\n",
    "    pset_torch.renameArguments(**{f\"ARG{i}\": col for i, col in enumerate(numeric_columns)})\n",
    "\n",
    "    # 2. Define the fitness and individual types.\n",
    "    # The objective is to maximize the IC Sharpe ratio.\n",
    "    creator.create(\"FitnessMax\", base.Fitness, weights=(1.0,))\n",
    "    creator.create(\"Individual\", gp.PrimitiveTree, fitness=creator.FitnessMax)\n",
    "\n",
    "    # 3. Build the toolbox.\n",
    "    toolbox = base.Toolbox()\n",
    "    toolbox.register(\"expr_torch\", gp.genHalfAndHalf, pset=pset_torch, min_=1, max_=3) # tune min_/max_ to control expression complexity\n",
    "    toolbox.register(\"individual\", tools.initIterate, creator.Individual, toolbox.expr_torch)\n",
    "    toolbox.register(\"population\", tools.initRepeat, list, toolbox.individual)\n",
    "    toolbox.register(\"compile_torch\", gp.compile, pset=pset_torch) # compiles a tree into a PyTorch function\n",
    "\n",
    "    # Prepare PyTorch tensors (all dates, all stocks at once).\n",
    "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "    data_tensor_all = torch.from_numpy(df[numeric_columns].values).float().to(device)\n",
    "    target_tensor_all = torch.from_numpy(df[target_column].values).float().to(device)\n",
    "    dates_all = df[date_column].values # dates as a numpy array\n",
    "\n",
    "    # 4. Fitness function: PyTorch evaluation + IC Sharpe ratio.\n",
    "    def evaluate_torch_cuda_ic_sharpe(individual, data_tensor_all, target_tensor_all, dates_all):\n",
    "        # Compile the individual (expression tree) into an executable PyTorch function.\n",
    "        func_torch = toolbox.compile_torch(expr=individual)\n",
    "\n",
    "        try:\n",
    "            # Apply the compiled function to the tensors (factor values for all dates/stocks in one shot).\n",
    "            # Guard against dimension mismatch; the output must be 1-D or (N, 1).\n",
    "            factor_values_tensor = func_torch(*torch.split(data_tensor_all, 1, dim=1))\n",
    "            if factor_values_tensor.ndim > 1 and factor_values_tensor.shape[1] != 1:\n",
    "                # (N, M) with M > 1 would need an aggregation step; return negative fitness for now.\n",
    "                print(f\"警告: 因子表达式输出张量维度为 {factor_values_tensor.shape},期望 (N, 1)。\")\n",
    "                return (-1.0,)\n",
    "            factor_values_tensor = factor_values_tensor.flatten() # ensure a flat 1-D tensor\n",
    "\n",
    "            # Move results back to CPU and convert to NumPy arrays.\n",
    "            factor_values_np = factor_values_tensor.cpu().numpy()\n",
    "            target_np = target_tensor_all.cpu().numpy().flatten() # flatten the target as well\n",
    "            dates_np = dates_all # already a numpy array\n",
    "\n",
    "            # Temporary DataFrame so daily ICs can be computed via groupby(date).\n",
    "            temp_df = pd.DataFrame({\n",
    "                'date': dates_np,\n",
    "                'factor_value': factor_values_np,\n",
    "                'target_value': target_np\n",
    "            })\n",
    "\n",
    "            # Daily Rank IC.\n",
    "            # Guard against NaNs and too-few data points inside each group.\n",
    "            daily_ics = temp_df.groupby('date').apply(\n",
    "                lambda x: spearmanr(x['factor_value'], x['target_value'])[0]\n",
    "                if len(x) >= 2 and x['factor_value'].notna().sum() >= 2 and x['target_value'].notna().sum() >= 2 # need >= 2 valid points per group\n",
    "                else np.nan # NaN when there is not enough data or the computation fails\n",
    "            ).dropna() # drop days whose IC is NaN\n",
    "\n",
    "            # IC Sharpe ratio.\n",
    "            if len(daily_ics) < 5: # need at least a handful of valid daily ICs\n",
    "                # print(f\"警告: 有效日 IC 数量不足 ({len(daily_ics)}),无法计算夏普比率。\")\n",
    "                return (-1.0,) # too few valid daily ICs -> negative fitness\n",
    "\n",
    "            ic_mean = daily_ics.mean()\n",
    "            ic_std = daily_ics.std()\n",
    "\n",
    "            # Handle zero standard deviation (very rare; means the daily IC is constant).\n",
    "            if ic_std == 0:\n",
    "                ic_sharpe = ic_mean * 1e6 if ic_mean > 0 else -1.0 # large positive value when mean > 0 and std == 0\n",
    "            else:\n",
    "                ic_sharpe = ic_mean / ic_std\n",
    "\n",
    "            # Return the IC Sharpe ratio as the fitness (to be maximized).\n",
    "            # Return a negative value if the result is NaN (e.g. mean/std produced NaN).\n",
    "            return (ic_sharpe if not np.isnan(ic_sharpe) else -1.0,)\n",
    "\n",
    "        except (ValueError, TypeError, ZeroDivisionError, RuntimeError) as e:\n",
    "            # Log the error and the offending individual for debugging.\n",
    "            print(f\"Error during evaluation for individual {individual}: {e}\")\n",
    "            return (-1.0,) # small negative fitness when evaluation fails\n",
    "\n",
    "    # Bind the precomputed tensors into the evaluation function.\n",
    "    toolbox.register(\"evaluate\", evaluate_torch_cuda_ic_sharpe, data_tensor_all=data_tensor_all, target_tensor_all=target_tensor_all, dates_all=dates_all)\n",
    "    toolbox.register(\"select\", tools.selTournament, tournsize=params.get('tournament_size', 3))\n",
    "    toolbox.register(\"mate\", gp.cxOnePointLeafBiased, termpb=0.2) # pset is not needed here\n",
    "    toolbox.register(\"mutate\", gp.mutUniform, expr=toolbox.expr_torch, pset=pset_torch) # use the PyTorch primitive set\n",
    "\n",
    "    # Cap tree height to keep expressions cheap and interpretable.\n",
    "    MAX_TREE_DEPTH = 5\n",
    "\n",
    "    toolbox.decorate(\"mate\", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))\n",
    "    toolbox.decorate(\"mutate\", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))\n",
    "\n",
    "    # 5. Evolution parameters.\n",
    "    population_size = params.get('population_size', 100)\n",
    "    generations = params.get('generations', 10)\n",
    "    crossover_probability = params.get('crossover_probability', 0.7) # tuned to increase exploration\n",
    "    mutation_probability = params.get('mutation_probability', 0.3) # tuned to increase exploration\n",
    "\n",
    "    # 6. Initialize the population.\n",
    "    pop = toolbox.population(n=population_size)\n",
    "    hof = tools.HallOfFame(params.get('hall_of_fame_size', 5)) # keep the few best individuals\n",
    "    stats = tools.Statistics(lambda ind: ind.fitness.values)\n",
    "    stats.register(\"avg\", np.mean)\n",
    "    stats.register(\"std\", np.std)\n",
    "    stats.register(\"min\", np.min)\n",
    "    stats.register(\"max\", np.max)\n",
    "\n",
    "    # 7. Run the evolutionary algorithm.\n",
    "    algorithms.eaSimple(pop, toolbox, cxpb=crossover_probability, mutpb=mutation_probability, ngen=generations,\n",
    "                        stats=stats, halloffame=hof, verbose=True)\n",
    "\n",
    "    # 8. Return the best factor expressions and the run statistics.\n",
    "    return hof, stats\n",
"\n",
    "# Hyperparameters for the GP search.\n",
    "params = {\n",
    "    'population_size': 64,\n",
    "    'generations': 32,\n",
    "    'crossover_probability': 0.7,\n",
    "    'mutation_probability': 0.3,\n",
    "    'tournament_size': 4,\n",
    "    'hall_of_fame_size': 3\n",
    "}\n",
    "\n",
    "best_factors_hof, stats = generate_deap_factors_pytorch_v2(df.copy(), numeric_columns, params=params)\n",
    "\n",
    "print(\"\\nBest Factors Found:\")\n",
    "for i, ind in enumerate(best_factors_hof):\n",
    "    fitness_value = ind.fitness.values[0] # fitness value of this individual\n",
    "    print(f\"Fitness: {fitness_value:.4f}, Factor {i+1}: {ind}\")"
]
},
{
"cell_type": "code",
   "execution_count": null,
"id": "a0b3d7551ef0c81f",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T15:39:47.502867Z",
"start_time": "2025-04-13T15:39:47.461434Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"全面因子分析报告 - 特征因子: 'generated_factor'\n",
"------------------------------------------------------------\n",
"整体 Rank IC: 0.0817\n",
"整体 P-value: 0.0000\n",
"------------------------------------------------------------\n",
"计算滚动 Rank IC (按 'D' 窗口)...\n",
"滚动 Rank IC 统计量 (D):\n",
" 均值: 0.0124\n",
" 标准差: 0.2330\n",
" 夏普比率 (IC Mean / IC Std): 0.0531\n",
" T-statistic: 1.4577\n",
" T-statistic P-value: 0.1453\n",
"------------------------------------------------------------\n",
"Hit Ratio (正向 Rank IC 比例): 0.5060\n",
"------------------------------------------------------------\n",
"因子 10 分位数分析 (按因子值从小到大排序):\n",
" 第 1 分位数: 平均 'future_return' = -0.0004\n",
" 第 2 分位数: 平均 'future_return' = -0.0008\n",
" 第 3 分位数: 平均 'future_return' = -0.0004\n",
" 第 4 分位数: 平均 'future_return' = 0.0005\n",
" 第 5 分位数: 平均 'future_return' = 0.0007\n",
" 第 6 分位数: 平均 'future_return' = 0.0015\n",
" 第 7 分位数: 平均 'future_return' = 0.0021\n",
" 第 8 分位数: 平均 'future_return' = 0.0033\n",
" 第 9 分位数: 平均 'future_return' = 0.0054\n",
" 第 10 分位数: 平均 'future_return' = 0.0135\n",
"\n",
"因子值的分位数范围:\n",
" 第 1 分位数: [-1.0490, 0.0581]\n",
" 第 2 分位数: [0.0581, 0.1051]\n",
" 第 3 分位数: [0.1051, 0.1458]\n",
" 第 4 分位数: [0.1458, 0.1881]\n",
" 第 5 分位数: [0.1881, 0.2354]\n",
" 第 6 分位数: [0.2354, 0.2909]\n",
" 第 7 分位数: [0.2909, 0.3594]\n",
" 第 8 分位数: [0.3594, 0.4505]\n",
" 第 9 分位数: [0.4505, 0.5880]\n",
" 第 10 分位数: [0.5880, 1.9782]\n",
"------------------------------------------------------------\n",
"分析完成。\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import torch\n",
"\n",
    "target_column = 'future_return'\n",
    "# Protected-division helpers (protected_div_torch mirrors the GP primitive).\n",
    "def protected_div_torch(left, right):\n",
    "    \"\"\"Element-wise left / right, substituting 1.0 wherever right == 0.\"\"\"\n",
    "    return torch.where(right != 0, left / right, torch.ones_like(left))\n",
    "\n",
    "# NOTE(review): unlike the torch version above (which yields 1.0), this one\n",
    "# returns NaN on zero division — confirm the discrepancy is intended.\n",
    "def protected_div_np(left, right):\n",
    "    \"\"\"Safe division that avoids divide-by-zero errors.\"\"\"\n",
    "    return np.where(right != 0, left / right, np.ones_like(left) * np.nan) # returns NaN where the divisor is 0\n",
"\n",
    "def calculate_factor_4(df: pd.DataFrame) -> pd.Series:\n",
    "    \"\"\"\n",
    "    Compute the factor: sub(add(add(protected_div_torch(pow(pct_chg, std_return_90), cost_95pct), protected_div_torch(industry_act_factor6, cost_95pct)), pow(protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct), protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct))), cos(industry_act_factor1)).\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): DataFrame containing the required input columns.\n",
    "\n",
    "    Returns:\n",
    "        pd.Series: The computed factor values, aligned with df's index.\n",
    "    \"\"\"\n",
    "    pct_chg = df['pct_chg']\n",
    "    std_return_90 = df['std_return_90']\n",
    "    cost_95pct = df['cost_95pct']\n",
    "    industry_act_factor6 = df['industry_act_factor6']\n",
    "    act_factor6 = df['act_factor6']\n",
    "    industry_act_factor1 = df['industry_act_factor1']\n",
    "\n",
    "    # Term 1: protected_div_torch(pow(pct_chg, std_return_90), cost_95pct)\n",
    "    # NOTE(review): np.power with a negative base and fractional exponent yields NaN — confirm intended.\n",
    "    term1_num = np.power(pct_chg, std_return_90)\n",
    "    term1 = protected_div_np(term1_num, cost_95pct)\n",
    "\n",
    "    # Term 2: protected_div_torch(industry_act_factor6, cost_95pct)\n",
    "    term2 = protected_div_np(industry_act_factor6, cost_95pct)\n",
    "\n",
    "    # Term 3: pow(protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct), protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct))\n",
    "    term3_base_inner = protected_div_np(act_factor6, cost_95pct)\n",
    "    term3_base = protected_div_np(term3_base_inner, cost_95pct)\n",
    "    term3_exponent_inner = protected_div_np(act_factor6, cost_95pct)\n",
    "    term3_exponent = protected_div_np(term3_exponent_inner, cost_95pct)\n",
    "    term3 = np.power(term3_base, term3_exponent)\n",
    "\n",
    "\n",
    "    # Sum of the first three terms\n",
    "    add_terms = term1 + term2 + term3\n",
    "\n",
    "    # Term 4: cos(industry_act_factor1)\n",
    "    term4 = np.cos(industry_act_factor1)\n",
    "\n",
    "    # Final factor\n",
    "    factor4 = add_terms - term4\n",
    "\n",
    "    return factor4\n",
    "\n",
    "df['generated_factor'] = calculate_factor_4(df)\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.stats import spearmanr, ttest_1samp\n",
"\n",
    "def comprehensive_factor_analysis(df: pd.DataFrame, factor_column: str, target_column: str = 'future_return', date_column: str = 'trade_date', rolling_window: str = 'D', n_deciles: int = 10):\n",
    "    \"\"\"\n",
    "    Run a comprehensive analysis of one feature factor in the DataFrame.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): DataFrame containing the factor and the target.\n",
    "        factor_column (str): Name of the factor column to analyse.\n",
    "        target_column (str): Target column name, defaults to 'future_return'.\n",
    "        date_column (str): Date column name, defaults to 'trade_date'.\n",
    "        rolling_window (str): Period for rolling Rank IC (e.g. 'D' daily, 'W' weekly).\n",
    "        n_deciles (int): Number of quantile buckets, defaults to 10.\n",
    "    \"\"\"\n",
    "    if factor_column not in df.columns:\n",
    "        print(f\"错误: 特征因子列 '{factor_column}' 不存在于 DataFrame 中。\")\n",
    "        return\n",
    "    if target_column not in df.columns:\n",
    "        print(f\"错误: 目标列 '{target_column}' 不存在于 DataFrame 中。\")\n",
    "        return\n",
    "    if date_column not in df.columns:\n",
    "        print(f\"错误: 日期列 '{date_column}' 不存在于 DataFrame 中。\")\n",
    "        return\n",
    "\n",
    "    # Ensure the date column is datetime and use it as the index.\n",
    "    df_analy = df.copy()\n",
    "    df_analy[date_column] = pd.to_datetime(df_analy[date_column])\n",
    "    df_analy = df_analy.set_index(date_column)\n",
    "\n",
    "    # Drop rows where either the factor or the target is NaN.\n",
    "    df_analy = df_analy.dropna(subset=[factor_column, target_column])\n",
    "\n",
    "    if len(df_analy) < 2:\n",
    "        print(\"警告: 有效数据点太少,无法进行分析。\")\n",
    "        return\n",
    "\n",
    "    print(f\"全面因子分析报告 - 特征因子: '{factor_column}'\")\n",
    "    print(\"-\" * 60)\n",
    "\n",
    "    # 1. Overall Rank IC.\n",
    "    overall_rank_ic, overall_p_value = spearmanr(df_analy[factor_column], df_analy[target_column])\n",
    "    print(f\"整体 Rank IC: {overall_rank_ic:.4f}\")\n",
    "    print(f\"整体 P-value: {overall_p_value:.4f}\")\n",
    "    print(\"-\" * 60)\n",
    "\n",
    "    # 2. Rolling Rank IC over the requested period.\n",
    "    print(f\"计算滚动 Rank IC (按 '{rolling_window}' 窗口)...\")\n",
    "    rolling_ics = df_analy.groupby(df_analy.index.to_period(rolling_window)).apply(\n",
    "        lambda x: spearmanr(x[factor_column], x[target_column])[0] if len(x) >= 2 else np.nan\n",
    "    ).dropna()\n",
    "\n",
    "    if len(rolling_ics) < 2:\n",
    "        print(\"警告: 滚动 Rank IC 有效周期太少,无法计算统计量。\")\n",
    "    else:\n",
    "        # 3. Rolling IC statistics.\n",
    "        ic_mean = rolling_ics.mean()\n",
    "        ic_std = rolling_ics.std()\n",
    "        ic_sharpe = ic_mean / ic_std if ic_std != 0 else np.nan\n",
    "        t_statistic, p_value_t = ttest_1samp(rolling_ics, 0) # test whether the mean IC differs from zero\n",
    "\n",
    "        print(f\"滚动 Rank IC 统计量 ({rolling_window}):\")\n",
    "        print(f\" 均值: {ic_mean:.4f}\")\n",
    "        print(f\" 标准差: {ic_std:.4f}\")\n",
    "        print(f\" 夏普比率 (IC Mean / IC Std): {ic_sharpe:.4f}\")\n",
    "        print(f\" T-statistic: {t_statistic:.4f}\")\n",
    "        print(f\" T-statistic P-value: {p_value_t:.4f}\")\n",
    "        print(\"-\" * 60)\n",
    "\n",
    "        # 4. Hit ratio (share of positive rolling Rank ICs).\n",
    "        hit_ratio = (rolling_ics > 0).sum() / len(rolling_ics)\n",
    "        print(f\"Hit Ratio (正向 Rank IC 比例): {hit_ratio:.4f}\")\n",
    "        print(\"-\" * 60)\n",
    "\n",
    "    # 5. Decile analysis over the whole dataset.\n",
    "    print(f\"因子 {n_deciles} 分位数分析 (按因子值从小到大排序):\")\n",
    "    df_analy['decile'] = pd.qcut(df_analy[factor_column], q=n_deciles, labels=False, duplicates='drop')\n",
    "    decile_analysis = df_analy.groupby('decile')[target_column].mean().sort_index()\n",
    "\n",
    "    if len(decile_analysis) > 0:\n",
    "        for decile, avg_return in decile_analysis.items():\n",
    "            print(f\" 第 {decile + 1} 分位数: 平均 '{target_column}' = {avg_return:.4f}\")\n",
    "\n",
    "        # Print the factor-value range of each decile.\n",
    "        percentiles = np.linspace(0, 100, n_deciles + 1)\n",
    "        factor_percentiles = df_analy[factor_column].quantile(percentiles / 100)\n",
    "        print(\"\\n因子值的分位数范围:\")\n",
    "        # Avoid the KeyError when qcut dropped duplicate bin edges.\n",
    "        for i in range(len(decile_analysis)): # only print deciles that actually exist\n",
    "            lower_bound = factor_percentiles[percentiles[i] / 100]\n",
    "            upper_bound = factor_percentiles[percentiles[i+1] / 100]\n",
    "            print(f\" 第 {i + 1} 分位数: [{lower_bound:.4f}, {upper_bound:.4f}]\")\n",
    "    else:\n",
    "        print(\"警告: 分位数分析无法执行,可能是因为数据点不足或因子值分布问题。\")\n",
    "\n",
    "\n",
    "    print(\"-\" * 60)\n",
    "    print(\"分析完成。\")\n",
    "\n",
    "# Run the full analysis on the freshly generated factor.\n",
    "comprehensive_factor_analysis(df, factor_column='generated_factor', target_column='future_return', date_column='trade_date', rolling_window='D', n_deciles=10)\n"
]
}
],
"metadata": {
"kernelspec": {
   "display_name": "stock",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
   "version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}