Files
NewStock/main/factor/generate_factor.ipynb
2025-06-10 15:22:25 +08:00

1339 lines
73 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T07:26:19.000054Z",
"start_time": "2025-04-13T07:26:18.895713Z"
},
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/mnt/d/PyProject/NewStock\n"
]
}
],
"source": [
"import gc\n",
"import os\n",
"import sys\n",
"sys.path.append('../../')\n",
"print(os.getcwd())\n",
"import pandas as pd\n",
"from main.factor.factor import get_rolling_factor, get_simple_factor\n",
"from main.utils.factor import read_industry_data\n",
"from main.utils.factor_processor import calculate_score\n",
"from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data\n",
"\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f1623b04c7a366af",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T07:30:48.534271Z",
"start_time": "2025-04-13T07:26:19.005576Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"daily data\n",
"daily basic\n",
"inner merge on ['ts_code', 'trade_date']\n",
"stk limit\n",
"left merge on ['ts_code', 'trade_date']\n",
"money flow\n",
"left merge on ['ts_code', 'trade_date']\n",
"cyq perf\n",
"left merge on ['ts_code', 'trade_date']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 8713571 entries, 0 to 8713570\n",
"Data columns (total 33 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object \n",
" 1 trade_date datetime64[ns]\n",
" 2 open float64 \n",
" 3 close float64 \n",
" 4 high float64 \n",
" 5 low float64 \n",
" 6 vol float64 \n",
" 7 pct_chg float64 \n",
" 8 amount float64 \n",
" 9 turnover_rate float64 \n",
" 10 pe_ttm float64 \n",
" 11 circ_mv float64 \n",
" 12 total_mv float64 \n",
" 13 volume_ratio float64 \n",
" 14 is_st bool \n",
" 15 up_limit float64 \n",
" 16 down_limit float64 \n",
" 17 buy_sm_vol float64 \n",
" 18 sell_sm_vol float64 \n",
" 19 buy_lg_vol float64 \n",
" 20 sell_lg_vol float64 \n",
" 21 buy_elg_vol float64 \n",
" 22 sell_elg_vol float64 \n",
" 23 net_mf_vol float64 \n",
" 24 his_low float64 \n",
" 25 his_high float64 \n",
" 26 cost_5pct float64 \n",
" 27 cost_15pct float64 \n",
" 28 cost_50pct float64 \n",
" 29 cost_85pct float64 \n",
" 30 cost_95pct float64 \n",
" 31 weight_avg float64 \n",
" 32 winner_rate float64 \n",
"dtypes: bool(1), datetime64[ns](1), float64(30), object(1)\n",
"memory usage: 2.1+ GB\n",
"None\n"
]
}
],
"source": [
"from main.utils.utils import read_and_merge_h5_data\n",
"\n",
"# Ordered load plan, one entry per HDF5 table:\n",
"#   (progress label, file path, store key, columns to read, extra keyword arguments).\n",
"# The loop starts with df=None and feeds each call's result into the next call as df.\n",
"h5_load_plan = [\n",
"    ('daily data', '/mnt/d/PyProject/NewStock/data/daily_data.h5', 'daily_data',\n",
"     ['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount'],\n",
"     {}),\n",
"    ('daily basic', '/mnt/d/PyProject/NewStock/data/daily_basic.h5', 'daily_basic',\n",
"     ['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio',\n",
"      'is_st'],\n",
"     {'join': 'inner'}),\n",
"    ('stk limit', '/mnt/d/PyProject/NewStock/data/stk_limit.h5', 'stk_limit',\n",
"     ['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],\n",
"     {}),\n",
"    ('money flow', '/mnt/d/PyProject/NewStock/data/money_flow.h5', 'money_flow',\n",
"     ['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol',\n",
"      'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],\n",
"     {}),\n",
"    ('cyq perf', '/mnt/d/PyProject/NewStock/data/cyq_perf.h5', 'cyq_perf',\n",
"     ['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
"      'cost_50pct',\n",
"      'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],\n",
"     {}),\n",
"]\n",
"\n",
"df = None\n",
"for h5_label, h5_path, h5_key, h5_columns, h5_extra in h5_load_plan:\n",
"    print(h5_label)\n",
"    df = read_and_merge_h5_data(h5_path, key=h5_key, columns=h5_columns, df=df, **h5_extra)\n",
"print(df.info())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0acb6625",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate']\n"
]
}
],
"source": [
"\n",
"# Names of the columns loaded so far, leaving out any name that contains 'cyq'.\n",
"origin_columns = [name for name in df.columns.tolist() if 'cyq' not in name]\n",
"print(origin_columns)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "820a6b50",
"metadata": {},
"outputs": [],
"source": [
"def _load_h5_table(h5_path, key, columns):\n",
"    \"\"\"Read the requested columns of one HDF5 table on its own (df=None, nothing passed in to merge).\"\"\"\n",
"    return read_and_merge_h5_data(h5_path, key=key, columns=columns, df=None)\n",
"\n",
"\n",
"fina_indicator_df = _load_h5_table(\n",
"    '/mnt/d/PyProject/NewStock/data/fina_indicator.h5', 'fina_indicator',\n",
"    ['ts_code', 'ann_date', 'undist_profit_ps', 'ocfps', 'bps'])\n",
"cashflow_df = _load_h5_table(\n",
"    '/mnt/d/PyProject/NewStock/data/cashflow.h5', 'cashflow',\n",
"    ['ts_code', 'ann_date', 'n_cashflow_act'])\n",
"balancesheet_df = _load_h5_table(\n",
"    '/mnt/d/PyProject/NewStock/data/balancesheet.h5', 'balancesheet',\n",
"    ['ts_code', 'ann_date', 'money_cap', 'total_liab'])\n",
"top_list_df = _load_h5_table(\n",
"    '/mnt/d/PyProject/NewStock/data/top_list.h5', 'top_list',\n",
"    ['ts_code', 'trade_date', 'reason'])\n",
"\n",
"# Keep one row per (ts_code, trade_date), then put the rows back in ascending trade_date order.\n",
"top_list_df = (\n",
"    top_list_df\n",
"    .sort_values(by='trade_date', ascending=False)\n",
"    .drop_duplicates(subset=['ts_code', 'trade_date'], keep='first')\n",
"    .sort_values(by='trade_date')\n",
")\n",
"\n",
"stk_holdertrade_df = _load_h5_table(\n",
"    '/mnt/d/PyProject/NewStock/data/stk_holdertrade.h5', 'stk_holdertrade',\n",
"    ['ts_code', 'ann_date', 'in_de', 'change_ratio'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "903469a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ 成功从 Redis Hash 'concept_stocks_daily_lists_pickle' 读取 1794 条每日概念股票数据。\n"
]
}
],
"source": [
"import os\n",
"import pickle\n",
"from datetime import date, datetime\n",
"\n",
"import redis\n",
"\n",
"# --- Redis connection settings ---\n",
"# Taken from the environment so the server password is not stored in the notebook.\n",
"# Host, port and db fall back to the values this notebook used before; the password has no\n",
"# fallback and must be supplied through the REDIS_PASSWORD environment variable.\n",
"REDIS_HOST = os.environ.get('REDIS_HOST', '140.143.91.66')\n",
"REDIS_PORT = int(os.environ.get('REDIS_PORT', '6389'))\n",
"REDIS_DB = int(os.environ.get('REDIS_DB', '0'))\n",
"REDIS_PASSWORD = os.environ.get('REDIS_PASSWORD')\n",
"\n",
"# --- Redis key names ---\n",
"HASH_KEY = \"concept_stocks_daily_lists_pickle\"  # pickle variant, kept apart from the earlier JSON version\n",
"MAX_DATE_KEY = \"concept_stocks_max_date_pickle\"  # pickle variant, kept apart from the earlier JSON version\n",
"\n",
"# Maps datetime.date -> the unpickled value stored for that day in the Redis hash.\n",
"concept_dict = {}\n",
"\n",
"# --- Connect to Redis and read the whole hash ---\n",
"try:\n",
"    r = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PASSWORD)\n",
"\n",
"    # hgetall returns a dict whose keys and values are both bytes.\n",
"    all_data_from_redis = r.hgetall(HASH_KEY)\n",
"\n",
"    if all_data_from_redis:\n",
"        for date_bytes, stocks_bytes in all_data_from_redis.items():\n",
"            try:\n",
"                # The hash field is a YYYYMMDD date held as bytes; decode it before parsing.\n",
"                date_str = date_bytes.decode('utf-8')\n",
"                date_obj = datetime.strptime(date_str, '%Y%m%d').date()\n",
"\n",
"                # SECURITY: pickle.loads can execute arbitrary code while deserialising.\n",
"                # Only run this against a Redis instance whose contents you trust.\n",
"                stocks_list = pickle.loads(stocks_bytes)\n",
"                concept_dict[date_obj] = stocks_list\n",
"            except (ValueError, pickle.UnpicklingError) as e:\n",
"                # Skip the malformed entry and keep reading the remaining dates.\n",
"                print(f\"⚠️ 警告: 解析 Redis 数据时出错 (日期键: '{date_bytes.decode('utf-8', errors='ignore')}'),跳过此条数据: {e}\")\n",
"        print(f\"✅ 成功从 Redis Hash '{HASH_KEY}' 读取 {len(concept_dict)} 条每日概念股票数据。\")\n",
"    else:\n",
"        print(f\" Redis Hash '{HASH_KEY}' 中没有找到任何数据。\")\n",
"\n",
"except redis.exceptions.ConnectionError as e:\n",
"    print(f\"❌ 错误: 无法连接到 Redis 服务器,请检查 Redis 是否正在运行或连接配置: {e}\")\n",
"except Exception as e:\n",
"    print(f\"❌ 从 Redis 读取数据时发生未知错误: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "afb8da3d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4566757\n",
"开始生成概念相关因子...\n",
"开始计算概念内截面排序因子,基于: ['pct_chg', 'turnover_rate', 'volume_ratio']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Ranking Features in Concepts: 100%|██████████| 3/3 [00:00<00:00, 15.82it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"概念相关因子生成完毕。\n",
"4566757\n",
"开始计算股东增减持因子...\n",
"警告: 'in_de' 列中存在未映射的值,可能导致 _direction 列出现NaN。\n",
"股东增减持因子计算完成。\n",
"Calculating cat_senti_mom_vol_spike...\n",
"Finished cat_senti_mom_vol_spike.\n",
"Calculating cat_senti_pre_breakout...\n",
"Calculating atr_10 as it's missing...\n",
"Calculating atr_40 as it's missing...\n",
"Finished cat_senti_pre_breakout.\n",
"计算因子 ts_turnover_rate_acceleration_5_20\n",
"计算因子 ts_vol_sustain_10_30\n",
"计算因子 cs_amount_outlier_10\n",
"计算因子 ts_ff_to_total_turnover_ratio\n",
"计算因子 ts_price_volume_trend_coherence_5_20\n",
"计算因子 ts_ff_turnover_rate_surge_10\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"开始计算因子: AR, BR (原地修改)...\n",
"因子 AR, BR 计算成功。\n",
"因子 AR, BR 计算流程结束。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"计算 BBI...\n",
"--- 计算日级别偏离度 (使用 pct_chg) ---\n",
"--- 计算日级别动量基准 (使用 pct_chg) ---\n",
"日级别动量基准计算完成 (使用 pct_chg)。\n",
"日级别偏离度计算完成 (使用 pct_chg)。\n",
"--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---\n",
"--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---\n",
"错误: 计算日级别行业动量基准需要以下列: ['pct_chg', 'cat_l2_code', 'trade_date', 'ts_code']。\n",
"错误: 计算日级别行业偏离度需要以下列: ['pct_chg', 'daily_industry_positive_benchmark', 'daily_industry_negative_benchmark']。请先运行 daily_industry_momentum_benchmark(df)。\n",
"Index(['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol',\n",
" 'pct_chg', 'amount', 'turnover_rate',\n",
" ...\n",
" 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike',\n",
" 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike',\n",
" 'vol_std_5', 'atr_14', 'atr_6', 'obv'],\n",
" dtype='object', length=103)\n",
"Calculating senti_strong_inflow...\n",
"Finished senti_strong_inflow.\n",
"Calculating lg_flow_mom_corr_20_60...\n",
"Finished lg_flow_mom_corr_20_60.\n",
"Calculating lg_flow_accel...\n",
"Finished lg_flow_accel.\n",
"Calculating profit_pressure...\n",
"Finished profit_pressure.\n",
"Calculating underwater_resistance...\n",
"Finished underwater_resistance.\n",
"Calculating cost_conc_std_20...\n",
"Finished cost_conc_std_20.\n",
"Calculating profit_decay_20...\n",
"Finished profit_decay_20.\n",
"Calculating vol_amp_loss_20...\n",
"Finished vol_amp_loss_20.\n",
"Calculating vol_drop_profit_cnt_5...\n",
"Finished vol_drop_profit_cnt_5.\n",
"Calculating lg_flow_vol_interact_20...\n",
"Finished lg_flow_vol_interact_20.\n",
"Calculating cost_break_confirm_cnt_5...\n",
"Finished cost_break_confirm_cnt_5.\n",
"Calculating atr_norm_channel_pos_14...\n",
"Finished atr_norm_channel_pos_14.\n",
"Calculating turnover_diff_skew_20...\n",
"Finished turnover_diff_skew_20.\n",
"Calculating lg_sm_flow_diverge_20...\n",
"Finished lg_sm_flow_diverge_20.\n",
"Calculating pullback_strong_20_20...\n",
"Finished pullback_strong_20_20.\n",
"Calculating vol_wgt_hist_pos_20...\n",
"Finished vol_wgt_hist_pos_20.\n",
"Calculating vol_adj_roc_20...\n",
"Finished vol_adj_roc_20.\n",
"Calculating cs_rank_net_lg_flow_val...\n",
"Finished cs_rank_net_lg_flow_val.\n",
"Calculating cs_rank_flow_divergence...\n",
"Finished cs_rank_flow_divergence.\n",
"Calculating cs_rank_ind_adj_lg_flow...\n",
"Error calculating cs_rank_ind_adj_lg_flow: Missing 'cat_l2_code' column. Assigning NaN.\n",
"Calculating cs_rank_elg_buy_ratio...\n",
"Finished cs_rank_elg_buy_ratio.\n",
"Calculating cs_rank_rel_profit_margin...\n",
"Finished cs_rank_rel_profit_margin.\n",
"Calculating cs_rank_cost_breadth...\n",
"Finished cs_rank_cost_breadth.\n",
"Calculating cs_rank_dist_to_upper_cost...\n",
"Finished cs_rank_dist_to_upper_cost.\n",
"Calculating cs_rank_winner_rate...\n",
"Finished cs_rank_winner_rate.\n",
"Calculating cs_rank_intraday_range...\n",
"Finished cs_rank_intraday_range.\n",
"Calculating cs_rank_close_pos_in_range...\n",
"Finished cs_rank_close_pos_in_range.\n",
"Calculating cs_rank_opening_gap...\n",
"Error calculating cs_rank_opening_gap: Missing 'pre_close' column. Assigning NaN.\n",
"Calculating cs_rank_pos_in_hist_range...\n",
"Finished cs_rank_pos_in_hist_range.\n",
"Calculating cs_rank_vol_x_profit_margin...\n",
"Finished cs_rank_vol_x_profit_margin.\n",
"Calculating cs_rank_lg_flow_price_concordance...\n",
"Finished cs_rank_lg_flow_price_concordance.\n",
"Calculating cs_rank_turnover_per_winner...\n",
"Finished cs_rank_turnover_per_winner.\n",
"Calculating cs_rank_ind_cap_neutral_pe (Placeholder - requires statsmodels)...\n",
"Finished cs_rank_ind_cap_neutral_pe (Placeholder).\n",
"Calculating cs_rank_volume_ratio...\n",
"Finished cs_rank_volume_ratio.\n",
"Calculating cs_rank_elg_buy_sell_sm_ratio...\n",
"Finished cs_rank_elg_buy_sell_sm_ratio.\n",
"Calculating cs_rank_cost_dist_vol_ratio...\n",
"Finished cs_rank_cost_dist_vol_ratio.\n",
"Calculating cs_rank_size...\n",
"Finished cs_rank_size.\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 4566757 entries, 0 to 4566756\n",
"Columns: 197 entries, ts_code to cs_rank_size\n",
"dtypes: bool(10), datetime64[ns](1), float64(176), int64(6), int8(1), object(3)\n",
"memory usage: 6.4+ GB\n",
"None\n",
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'cat_hot_concept_stock', 'concept_rank_pct_chg', 'concept_rank_turnover_rate', 'concept_rank_volume_ratio', 'holder_net_change_sum_10d', 'holder_increase_days_10d', 'holder_decrease_days_10d', 'holder_any_increase_flag_10d', 'holder_any_decrease_flag_10d', 'holder_direction_score_10d', 'cat_senti_mom_vol_spike', 'cat_senti_pre_breakout', 'ts_turnover_rate_acceleration_5_20', 'ts_vol_sustain_10_30', 'cs_amount_outlier_10', 'ts_ff_to_total_turnover_ratio', 'ts_price_volume_trend_coherence_5_20', 'ts_ff_turnover_rate_surge_10', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 
'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'price_cost_divergence', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'senti_strong_inflow', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_flow_divergence', 'cs_rank_ind_adj_lg_flow', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_opening_gap', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_ind_cap_neutral_pe', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size']\n"
]
}
],
"source": [
"import numpy as np\n",
"from main.factor.factor import *\n",
"from main.factor.money_factor import * \n",
"from main.factor.concept_factor import * \n",
"\n",
"\n",
"def filter_data(df):\n",
"    \"\"\"Return the rows of df that pass the universe filter, re-indexed from 0.\n",
"\n",
"    A row is kept when is_st is false, ts_code neither ends with \"BJ\" nor starts with\n",
"    \"30\", \"68\" or \"8\", and trade_date is on or after 2019-01-01. An in_date column,\n",
"    when present, is dropped.\n",
"    \"\"\"\n",
"    codes = df[\"ts_code\"]\n",
"    keep = ~df[\"is_st\"]\n",
"    keep &= ~codes.str.endswith(\"BJ\")\n",
"    for unwanted_prefix in (\"30\", \"68\", \"8\"):\n",
"        keep &= ~codes.str.startswith(unwanted_prefix)\n",
"    keep &= df[\"trade_date\"] >= \"2019-01-01\"\n",
"\n",
"    kept = df[keep]\n",
"    if \"in_date\" in kept.columns:\n",
"        kept = kept.drop(columns=[\"in_date\"])\n",
"    return kept.reset_index(drop=True)\n",
"\n",
"\n",
"gc.collect()\n",
"\n",
"df = filter_data(df).sort_values(by=[\"ts_code\", \"trade_date\"])\n",
"\n",
"# Currently disabled steps: price_minus_deduction_price, price_deduction_price_diff_ratio_to_sma,\n",
"# cat_price_vs_sma_vs_deduction_price (all n=120), cat_reason and cat_is_on_top_list (top_list_df).\n",
"print(len(df))\n",
"df = generate_concept_factors(df, concept_dict)\n",
"print(len(df))\n",
"\n",
"df = holder_trade_factors(df, stk_holdertrade_df)\n",
"\n",
"_mom_vol_spike_params = dict(\n",
"    return_period=3,\n",
"    return_threshold=0.03,  # return over the last 3 days above 3%\n",
"    volume_ratio_threshold=1.3,\n",
"    current_pct_chg_min=0.0,  # the current day must close up\n",
"    current_pct_chg_max=0.05,  # the current day's gain should not be too large\n",
")\n",
"df = cat_senti_mom_vol_spike(df, **_mom_vol_spike_params)\n",
"\n",
"_pre_breakout_params = dict(\n",
"    atr_short_N=10,\n",
"    atr_long_M=40,\n",
"    vol_atrophy_N=10,\n",
"    vol_atrophy_M=40,\n",
"    price_stab_N=5,\n",
"    price_stab_threshold=0.06,\n",
"    current_pct_chg_min_signal=0.002,\n",
"    current_pct_chg_max_signal=0.05,\n",
"    volume_ratio_signal_threshold=1.1,\n",
")\n",
"df = cat_senti_pre_breakout(df, **_pre_breakout_params)\n",
"\n",
"# Factor steps that take the frame and return it, applied in this order.\n",
"# (cs_turnover_rate_relative_strength_20 and ts_turnover_rate_trend_strength_5 are disabled.)\n",
"_chained_factor_steps = (\n",
"    ts_turnover_rate_acceleration_5_20,\n",
"    ts_vol_sustain_10_30,\n",
"    cs_amount_outlier_10,\n",
"    ts_ff_to_total_turnover_ratio,\n",
"    ts_price_volume_trend_coherence_5_20,\n",
"    ts_ff_turnover_rate_surge_10,\n",
")\n",
"for _chained_step in _chained_factor_steps:\n",
"    df = _chained_step(df)\n",
"\n",
"for _fina_col in (\"undist_profit_ps\", \"ocfps\"):\n",
"    df = add_financial_factor(df, fina_indicator_df, factor_value_col=_fina_col)\n",
"calculate_arbr(df, N=26)  # return value unused; the run log reports an in-place update\n",
"df[\"log_circ_mv\"] = np.log(df[\"circ_mv\"])\n",
"df = calculate_cashflow_to_ev_factor(df, cashflow_df, balancesheet_df)\n",
"df = caculate_book_to_price_ratio(df, fina_indicator_df)\n",
"df = turnover_rate_n(df, n=5)\n",
"df = variance_n(df, n=20)\n",
"df = bbi_ratio_factor(df)\n",
"df = daily_deviation(df)\n",
"df = daily_industry_deviation(df)\n",
"df, _ = get_rolling_factor(df)\n",
"df, _ = get_simple_factor(df)\n",
"\n",
"df = calculate_strong_inflow_signal(df)\n",
"\n",
"df = df.rename(columns={\"l1_code\": \"cat_l1_code\", \"l2_code\": \"cat_l2_code\"})\n",
"\n",
"# Factor steps called for their effect on df (return values are not used), in this order.\n",
"_inplace_factor_steps = (\n",
"    (lg_flow_mom_corr, {\"N\": 20, \"M\": 60}),\n",
"    (lg_flow_accel, {}),\n",
"    (profit_pressure, {}),\n",
"    (underwater_resistance, {}),\n",
"    (cost_conc_std, {\"N\": 20}),\n",
"    (profit_decay, {\"N\": 20}),\n",
"    (vol_amp_loss, {\"N\": 20}),\n",
"    (vol_drop_profit_cnt, {\"N\": 20, \"M\": 5}),\n",
"    (lg_flow_vol_interact, {\"N\": 20}),\n",
"    (cost_break_confirm_cnt, {\"M\": 5}),\n",
"    (atr_norm_channel_pos, {\"N\": 14}),\n",
"    (turnover_diff_skew, {\"N\": 20}),\n",
"    (lg_sm_flow_diverge, {\"N\": 20}),\n",
"    (pullback_strong, {\"N\": 20, \"M\": 20}),\n",
"    (vol_wgt_hist_pos, {\"N\": 20}),\n",
"    (vol_adj_roc, {\"N\": 20}),\n",
")\n",
"for _factor_fn, _factor_kwargs in _inplace_factor_steps:\n",
"    _factor_fn(df, **_factor_kwargs)\n",
"\n",
"# Cross-sectional rank factors, also called only for their effect on df.\n",
"_cs_rank_steps = (\n",
"    cs_rank_net_lg_flow_val,\n",
"    cs_rank_flow_divergence,\n",
"    cs_rank_industry_adj_lg_flow,  # Needs cat_l2_code\n",
"    cs_rank_elg_buy_ratio,\n",
"    cs_rank_rel_profit_margin,\n",
"    cs_rank_cost_breadth,\n",
"    cs_rank_dist_to_upper_cost,\n",
"    cs_rank_winner_rate,\n",
"    cs_rank_intraday_range,\n",
"    cs_rank_close_pos_in_range,\n",
"    cs_rank_opening_gap,  # Needs pre_close\n",
"    cs_rank_pos_in_hist_range,  # Needs his_low, his_high\n",
"    cs_rank_vol_x_profit_margin,\n",
"    cs_rank_lg_flow_price_concordance,\n",
"    cs_rank_turnover_per_winner,\n",
"    cs_rank_ind_cap_neutral_pe,  # Placeholder - needs external libraries\n",
"    cs_rank_volume_ratio,  # Needs volume_ratio\n",
"    cs_rank_elg_buy_sell_sm_ratio,\n",
"    cs_rank_cost_dist_vol_ratio,  # Needs volume_ratio\n",
"    cs_rank_size,  # Needs circ_mv\n",
")\n",
"for _rank_fn in _cs_rank_steps:\n",
"    _rank_fn(df)\n",
"\n",
"print(df.info())\n",
"print(df.columns.tolist())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "48712034",
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "File ../../data/industry_data.h5 does not exist",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m l2_df = \u001b[43mread_and_merge_h5_data\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m../../data/industry_data.h5\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mindustry_data\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mts_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43ml2_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43min_date\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mts_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjoin\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mleft\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m df = merge_with_industry_data(df, l2_df)\n\u001b[32m 5\u001b[39m df = df.rename(columns={\u001b[33m'\u001b[39m\u001b[33ml2_code\u001b[39m\u001b[33m'\u001b[39m: \u001b[33m'\u001b[39m\u001b[33mcat_l2_code\u001b[39m\u001b[33m'\u001b[39m})\n",
"\u001b[36mFile \u001b[39m\u001b[32m/mnt/d/PyProject/NewStock/main/utils/utils.py:14\u001b[39m, in \u001b[36mread_and_merge_h5_data\u001b[39m\u001b[34m(h5_filename, key, columns, df, join, on, prefix)\u001b[39m\n\u001b[32m 11\u001b[39m processed_columns.append(col)\n\u001b[32m 13\u001b[39m \u001b[38;5;66;03m# 从 HDF5 文件读取数据,选择需要的列\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m data = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_hdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mh5_filename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprocessed_columns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 16\u001b[39m \u001b[38;5;66;03m# 修改列名,如果列名以前有 _加上 _\u001b[39;00m\n\u001b[32m 17\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m data.columns:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/stock/lib/python3.13/site-packages/pandas/io/pytables.py:424\u001b[39m, in \u001b[36mread_hdf\u001b[39m\u001b[34m(path_or_buf, key, mode, errors, where, start, stop, columns, iterator, chunksize, **kwargs)\u001b[39m\n\u001b[32m 421\u001b[39m exists = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 423\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m exists:\n\u001b[32m--> \u001b[39m\u001b[32m424\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFile \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_buf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m does not exist\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 426\u001b[39m store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)\n\u001b[32m 427\u001b[39m \u001b[38;5;66;03m# can't auto open/close if we are using an iterator\u001b[39;00m\n\u001b[32m 428\u001b[39m \u001b[38;5;66;03m# so delegate to the iterator\u001b[39;00m\n",
"\u001b[31mFileNotFoundError\u001b[39m: File ../../data/industry_data.h5 does not exist"
]
}
],
"source": [
"\n",
"# The kernel's working directory is the project root, so the former '../../data/...' paths\n",
"# resolved outside the project and raised FileNotFoundError. Use the same absolute data\n",
"# directory as the other loading cells.\n",
"# NOTE(review): assumes industry_data.h5 and sw_daily.h5 sit next to the other .h5 files - confirm.\n",
"l2_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/industry_data.h5', key='industry_data',\n",
"                               columns=['ts_code', 'l2_code', 'in_date'],\n",
"                               df=None, on=['ts_code'], join='left')\n",
"df = merge_with_industry_data(df, l2_df)\n",
"df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
"\n",
"days = 5\n",
"df = df.sort_values(by=['ts_code', 'trade_date'])\n",
"\n",
"# Forward-looking targets, shifted within each ts_code.\n",
"by_code = df.groupby('ts_code')\n",
"next_open = by_code['open'].shift(-1)\n",
"next_close = by_code['close'].shift(-1)\n",
"close_after_days = by_code['close'].shift(-days)\n",
"\n",
"# Relative change from the next row's open to the close `days` rows ahead.\n",
"df['future_return'] = (close_after_days - next_open) / next_open\n",
"# Relative change from the next row's open to the next row's close.\n",
"df['future_return2'] = (next_close - next_open) / next_open\n",
"\n",
"# Rolling std of pct_chg over `days` rows, shifted back so it covers the following rows.\n",
"df['future_volatility'] = (\n",
"    df.groupby('ts_code')['pct_chg']\n",
"    .transform(lambda x: x.rolling(days).std().shift(-days))\n",
")\n",
"df['future_score'] = calculate_score(df, days=days, lambda_param=0.3)\n",
"\n",
"\n",
"def select_pre_zt_stocks_dynamic(stock_df):\n",
"    \"\"\"For each trade_date, keep the 1000 rows with the smallest total_mv.\n",
"\n",
"    Not called in this cell at the moment; kept for optional universe selection.\n",
"    \"\"\"\n",
"    def select_stocks(group):\n",
"        return group.nsmallest(1000, 'total_mv')\n",
"\n",
"    stock_df = stock_df.groupby('trade_date', group_keys=False).apply(select_stocks)\n",
"    return stock_df\n",
"\n",
"\n",
"gc.collect()\n",
"\n",
"industry_df = read_industry_data('/mnt/d/PyProject/NewStock/data/sw_daily.h5')\n",
"df = df.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c1dd3d6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 
'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_buy_consolidation_20', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'intraday_lg_flow_corr_20', 'cap_neutral_cost_metric', 'in_date', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
]
}
],
"source": [
"# A column is a feature unless its name is one of the excluded names, contains one of the\n",
"# excluded fragments, or starts with an underscore.\n",
"excluded_names = ('trade_date', 'ts_code', 'label')\n",
"excluded_fragments = ('future', 'label', 'score', 'gen', 'is_st', 'pe_ttm', 'cat_l2_code')\n",
"\n",
"feature_columns = [\n",
"    col for col in df.columns\n",
"    if col not in excluded_names\n",
"    and not any(fragment in col for fragment in excluded_fragments)\n",
"    and not col.startswith('_')\n",
"]\n",
"\n",
"print(feature_columns)\n",
"\n",
"# Feature columns whose dtype is float64 or int64, in the frame's column order.\n",
"numeric_columns = [\n",
"    col for col in df.select_dtypes(include=['float64', 'int64']).columns\n",
"    if col in feature_columns\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c60c1ea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"每个特征列中的 NaN 值数量(字典形式):\n",
"ts_code: 0\n",
"trade_date: 0\n",
"open: 0\n",
"close: 0\n",
"high: 0\n",
"low: 0\n",
"vol: 0\n",
"pct_chg: 0\n",
"turnover_rate: 0\n",
"pe_ttm: 499616\n",
"circ_mv: 0\n",
"volume_ratio: 791\n",
"is_st: 0\n",
"up_limit: 0\n",
"down_limit: 0\n",
"buy_sm_vol: 7\n",
"sell_sm_vol: 7\n",
"buy_lg_vol: 7\n",
"sell_lg_vol: 7\n",
"buy_elg_vol: 7\n",
"sell_elg_vol: 7\n",
"net_mf_vol: 7\n",
"his_low: 24695\n",
"his_high: 24695\n",
"cost_5pct: 24695\n",
"cost_15pct: 24695\n",
"cost_50pct: 24695\n",
"cost_85pct: 24695\n",
"cost_95pct: 24695\n",
"weight_avg: 24695\n",
"winner_rate: 24695\n",
"lg_elg_net_buy_vol: 7\n",
"flow_lg_elg_intensity: 7\n",
"sm_net_buy_vol: 7\n",
"flow_divergence_diff: 7\n",
"flow_divergence_ratio: 7\n",
"total_buy_vol: 7\n",
"lg_elg_buy_prop: 7\n",
"flow_struct_buy_change: 3287\n",
"lg_elg_net_buy_vol_change: 3287\n",
"flow_lg_elg_accel: 6567\n",
"chip_concentration_range: 24695\n",
"chip_skewness: 24695\n",
"floating_chip_proxy: 24695\n",
"cost_support_15pct_change: 27855\n",
"cat_winner_price_zone: 0\n",
"flow_chip_consistency: 7\n",
"profit_taking_vs_absorb: 7\n",
"cat_is_positive: 0\n",
"upside_vol: 29581\n",
"downside_vol: 29655\n",
"vol_ratio: 0\n",
"return_skew: 13096\n",
"return_kurtosis: 13096\n",
"volume_change_rate: 29466\n",
"cat_volume_breakout: 0\n",
"turnover_deviation: 6548\n",
"cat_turnover_spike: 0\n",
"avg_volume_ratio: 7341\n",
"cat_volume_ratio_breakout: 0\n",
"vol_spike: 62074\n",
"vol_std_5: 16370\n",
"atr_14: 45836\n",
"atr_6: 19644\n",
"obv: 0\n",
"maobv_6: 16370\n",
"rsi_3: 9822\n",
"return_5: 16370\n",
"return_20: 65315\n",
"std_return_5: 16370\n",
"std_return_90: 291770\n",
"std_return_90_2: 323906\n",
"act_factor1: 16370\n",
"act_factor2: 42562\n",
"act_factor3: 65315\n",
"act_factor4: 194886\n",
"rank_act_factor1: 16370\n",
"rank_act_factor2: 42562\n",
"rank_act_factor3: 65315\n",
"log_circ_mv: 0\n",
"cov: 13096\n",
"delta_cov: 29466\n",
"alpha_22_improved: 62074\n",
"alpha_003: 0\n",
"alpha_007: 13120\n",
"alpha_013: 62074\n",
"cat_up_limit: 0\n",
"cat_down_limit: 0\n",
"up_limit_count_10d: 0\n",
"down_limit_count_10d: 0\n",
"consecutive_up_limit: 0\n",
"vol_break: 0\n",
"weight_roc5: 40531\n",
"price_cost_divergence: 93280\n",
"smallcap_concentration: 24695\n",
"cost_stability: 85077\n",
"high_cost_break_days: 13096\n",
"liquidity_risk: 53215\n",
"turnover_std: 62074\n",
"mv_volatility: 62074\n",
"volume_growth: 65315\n",
"mv_growth: 65315\n",
"arbr: 9822\n",
"momentum_factor: 29466\n",
"resonance_factor: 791\n",
"log_close: 0\n",
"cat_vol_spike: 0\n",
"up: 0\n",
"down: 0\n",
"obv_maobv_6: 16370\n",
"std_return_5_over_std_return_90: 291770\n",
"std_return_90_minus_std_return_90_2: 323906\n",
"cat_af2: 0\n",
"cat_af3: 0\n",
"cat_af4: 0\n",
"act_factor5: 194886\n",
"act_factor6: 42562\n",
"active_buy_volume_large: 13\n",
"active_buy_volume_big: 79\n",
"active_buy_volume_small: 7\n",
"buy_lg_vol_minus_sell_lg_vol: 8\n",
"buy_elg_vol_minus_sell_elg_vol: 69\n",
"ctrl_strength: 24695\n",
"low_cost_dev: 24695\n",
"asymmetry: 24701\n",
"lock_factor: 24695\n",
"cat_vol_break: 0\n",
"cost_atr_adj: 69060\n",
"cat_golden_resonance: 0\n",
"mv_turnover_ratio: 0\n",
"mv_adjusted_volume: 0\n",
"mv_weighted_turnover: 0\n",
"nonlinear_mv_volume: 0\n",
"mv_volume_ratio: 791\n",
"mv_momentum: 791\n",
"lg_flow_mom_corr_20_60: 1186\n",
"lg_buy_consolidation_20: 1950902\n",
"lg_flow_accel: 6567\n",
"profit_pressure: 24695\n",
"underwater_resistance: 24695\n",
"cost_conc_std_20: 29466\n",
"profit_decay_20: 0\n",
"vol_amp_loss_20: 53215\n",
"vol_drop_profit_cnt_5: 0\n",
"lg_flow_vol_interact_20: 29466\n",
"cost_break_confirm_cnt_5: 0\n",
"atr_norm_channel_pos_14: 0\n",
"turnover_diff_skew_20: 32740\n",
"lg_sm_flow_diverge_20: 29466\n",
"pullback_strong_20_20: 0\n",
"vol_wgt_hist_pos_20: 0\n",
"vol_adj_roc_20: 0\n",
"intraday_lg_flow_corr_20: 2431461\n",
"cap_neutral_cost_metric: 2431461\n",
"cat_l2_code: 290\n",
"in_date: 65486\n",
"future_return: 6548\n",
"future_return2: 3274\n",
"future_volatility: 6548\n",
"score: 6548\n",
"future_score: 6548\n",
"industry_obv: 11272\n",
"industry_return_5: 11272\n",
"industry_return_20: 11272\n",
"industry__ema_5: 11272\n",
"industry__ema_13: 11272\n",
"industry__ema_20: 11272\n",
"industry__ema_60: 11272\n",
"industry_act_factor1: 11272\n",
"industry_act_factor2: 11272\n",
"industry_act_factor3: 11272\n",
"industry_act_factor4: 11272\n",
"industry_act_factor5: 11272\n",
"industry_act_factor6: 11272\n",
"industry_rank_act_factor1: 11272\n",
"industry_rank_act_factor2: 11272\n",
"industry_rank_act_factor3: 11272\n",
"industry_return_5_percentile: 11272\n",
"industry_return_20_percentile: 11272\n",
"['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_std_5', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'std_return_5', 'act_factor1', 'rank_act_factor1', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_003', 'alpha_007', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'smallcap_concentration', 'high_cost_break_days', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'cat_af2', 'cat_af3', 'cat_af4', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 
'atr_norm_channel_pos_14', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
]
}
],
"source": [
"def count_nan_and_inf_per_feature(df: pd.DataFrame):\n",
" \"\"\"\n",
" 计算 DataFrame 中每个特征列的 NaN 和 Inf 值数量。\n",
"\n",
" Args:\n",
" df: 要分析的 pandas DataFrame。\n",
"\n",
" Returns:\n",
" 一个字典,包含两个 pandas Series\n",
" - 'NaN_Count': 索引是列名,值是该列中 NaN 的数量。\n",
" - 'Inf_Count': 索引是列名,值是该列中 Inf 的数量。\n",
" \"\"\"\n",
" nan_counts = df.isna().sum()\n",
" # inf_counts = np.isinf(df).sum()\n",
" return nan_counts\n",
"\n",
"\n",
"nan_counts_series = count_nan_and_inf_per_feature(df)\n",
"\n",
"# 或者,如果您想以字典的形式获取结果:\n",
"nan_counts_dict = nan_counts_series.to_dict()\n",
"print(\"\\n每个特征列中的 NaN 值数量(字典形式):\")\n",
"for k, v in nan_counts_dict.items():\n",
" print(f'{k}: {v}')\n",
" if v > 30000 and k in feature_columns:\n",
" feature_columns.remove(k)\n",
"print(feature_columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e088bd8a357e815a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T15:39:47.461434Z",
"start_time": "2025-04-13T15:39:44.369664Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gen\tnevals\tavg \tstd \tmin\tmax \n",
"0 \t64 \t-0.387605\t0.492269\t-1 \t0.84339\n",
"1 \t52 \t-0.0280574\t0.328787\t-1 \t0.84339\n",
"2 \t56 \t-0.0442643\t0.498012\t-1 \t0.84339\n",
"3 \t50 \t0.0843881 \t0.506873\t-1 \t0.84339\n",
"4 \t56 \t0.128797 \t0.586781\t-1 \t0.84339\n",
"5 \t52 \t0.107366 \t0.586957\t-1 \t0.918103\n",
"6 \t52 \t0.0602483 \t0.666345\t-1 \t0.918103\n",
"7 \t54 \t0.177717 \t0.561644\t-1 \t0.918103\n",
"8 \t57 \t0.206183 \t0.620791\t-1 \t0.957887\n",
"9 \t51 \t0.253306 \t0.667259\t-1 \t1.07875 \n",
"10 \t53 \t0.19914 \t0.681541\t-1 \t1.14356 \n",
"11 \t54 \t0.173093 \t0.752007\t-1 \t1.24408 \n",
"12 \t53 \t0.429303 \t0.636249\t-1.07606\t1.24408 \n",
"13 \t46 \t0.443469 \t0.754764\t-1.17052\t1.24408 \n",
"14 \t57 \t0.412168 \t0.719715\t-1.2066 \t1.24408 \n",
"15 \t47 \t0.420095 \t0.833547\t-1.20899\t1.25608 \n",
"16 \t46 \t0.516075 \t0.916347\t-1.16556\t1.25765 \n",
"17 \t48 \t0.52129 \t0.872883\t-1 \t1.30663 \n",
"18 \t53 \t0.530992 \t0.923366\t-1 \t1.3677 \n",
"19 \t54 \t0.569299 \t0.861833\t-1.39138\t1.3677 \n",
"20 \t51 \t0.538589 \t0.883032\t-1.12472\t1.3677 \n",
"21 \t49 \t0.684813 \t0.874059\t-1 \t1.3677 \n",
"22 \t46 \t0.659823 \t0.86879 \t-1.17051\t1.3677 \n",
"23 \t42 \t0.678971 \t0.886044\t-1.39138\t1.3677 \n",
"24 \t55 \t0.639381 \t0.905808\t-1.39138\t1.37645 \n",
"25 \t42 \t0.721136 \t0.915513\t-1.30205\t1.39372 \n",
"26 \t56 \t0.695918 \t0.849837\t-1.0437 \t1.39372 \n",
"27 \t56 \t0.465007 \t0.934313\t-1 \t1.39372 \n",
"28 \t51 \t0.714563 \t0.88635 \t-1.13547\t1.43745 \n",
"29 \t49 \t0.687478 \t0.84568 \t-1 \t1.43745 \n",
"30 \t50 \t0.646657 \t0.835957\t-1 \t1.43745 \n",
"31 \t49 \t0.615978 \t0.939622\t-1.04846\t1.43745 \n",
"32 \t49 \t0.654171 \t0.973861\t-1.12771\t1.43745 \n",
"\n",
"Best Factors Found:\n",
"Fitness: 1.4375, Factor 1: protected_div_torch(mul(protected_div_torch(add(return_kurtosis, profit_pressure), mul(cost_85pct, buy_elg_vol_minus_sell_elg_vol)), protected_div_torch(cost_break_confirm_cnt_5, pow(cos(lg_flow_vol_interact_20), cos(chip_concentration_range)))), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
"Fitness: 1.3937, Factor 2: protected_div_torch(mul(protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
"Fitness: 1.3843, Factor 3: protected_div_torch(mul(protected_div_torch(protected_div_torch(profit_pressure, pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n"
]
}
],
"source": [
"from deap import creator, gp, tools, base, algorithms\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from scipy.stats import spearmanr\n",
"import operator\n",
"\n",
"# 保护性除法函数 (PyTorch 版本)\n",
"def protected_div_torch(left, right):\n",
" return torch.where(right != 0, left / right, torch.ones_like(left))\n",
"\n",
"def generate_deap_factors_pytorch_v2(df: pd.DataFrame, numeric_columns: list, target_column: str = 'future_return', date_column: str = 'trade_date', params: dict = None, random_state: int = 42):\n",
" \"\"\"\n",
" 使用 deap 库通过遗传编程生成新的因子,并使用 PyTorch 算子和计算,过滤 NaN 值。\n",
"\n",
" Args:\n",
" df (pd.DataFrame): 包含因子和目标变量的数据框。\n",
" numeric_columns (list): 数值型因子列名的列表。\n",
" target_column (str): 目标变量的列名,默认为 'future_return'。\n",
" params (dict): deap 进化算法的参数字典。\n",
" random_state (int): 随机种子,用于保证结果的可重复性。\n",
"\n",
" Returns:\n",
" list: 包含最佳因子表达式的列表。\n",
" \"\"\"\n",
" if params is None:\n",
" params = {}\n",
"\n",
" # 设置随机种子\n",
" np.random.seed(random_state)\n",
" torch.manual_seed(random_state)\n",
"\n",
" # 1. 定义原始集 (Primitive Set) - 使用 PyTorch 算子\n",
" pset_torch = gp.PrimitiveSet(\"PYTORCH\", arity=len(numeric_columns))\n",
" pset_torch.addPrimitive(torch.add, 2)\n",
" pset_torch.addPrimitive(torch.sub, 2)\n",
" pset_torch.addPrimitive(torch.mul, 2)\n",
" pset_torch.addPrimitive(protected_div_torch, 2) # 使用 PyTorch 保护性除法\n",
" # 新增的复杂算子\n",
" pset_torch.addPrimitive(torch.sin, 1) # 正弦函数 (一元算子)\n",
" pset_torch.addPrimitive(torch.cos, 1) # 余弦函数 (一元算子)\n",
" # pset_torch.addPrimitive(torch.abs, 1) # 绝对值 (一元算子)\n",
" # pset_torch.addPrimitive(torch.sqrt, 1) # 平方根 (一元算子)\n",
" pset_torch.addPrimitive(torch.pow, 2) # 指数运算 (二元算子,例如 x 的 y 次方)\n",
" # pset_torch.addPrimitive(torch.tanh, 1) # 双曲正切函数 (一元算子)\n",
"\n",
" # def rate_of_change_torch(x, y): # 计算 y 相对于 x 的变化率\n",
" # return (y - x) / (x + 1e-8)\n",
" # pset_torch.addPrimitive(rate_of_change_torch, 2)\n",
"\n",
" # def covariance_like_torch(x, y):\n",
" # mean_x = torch.mean(x, dim=0, keepdim=True) # 保持维度以便广播\n",
" # mean_y = torch.mean(y, dim=0, keepdim=True)\n",
" # return (x - mean_x) * (y - mean_y)\n",
"\n",
" # pset_torch.addPrimitive(covariance_like_torch, 2)\n",
"\n",
" # 将 numeric_columns 作为终端添加到原始集\n",
" pset_torch.renameArguments(**{f\"ARG{i}\": col for i, col in enumerate(numeric_columns)})\n",
"\n",
" # 2. 定义适应度和个体\n",
" # 目标是最大化 IC 夏普比率\n",
" creator.create(\"FitnessMax\", base.Fitness, weights=(1.0,))\n",
" creator.create(\"Individual\", gp.PrimitiveTree, fitness=creator.FitnessMax)\n",
"\n",
" # 3. 创建工具箱 (Toolbox)\n",
" toolbox = base.Toolbox()\n",
" toolbox.register(\"expr_torch\", gp.genHalfAndHalf, pset=pset_torch, min_=1, max_=3) # 调整 min_/max_ 以控制表达式复杂性\n",
" toolbox.register(\"individual\", tools.initIterate, creator.Individual, toolbox.expr_torch)\n",
" toolbox.register(\"population\", tools.initRepeat, list, toolbox.individual)\n",
" toolbox.register(\"compile_torch\", gp.compile, pset=pset_torch) # 编译为 PyTorch 函数\n",
"\n",
" # 准备 PyTorch 张量数据 (所有日期所有股票)\n",
" device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
" data_tensor_all = torch.from_numpy(df[numeric_columns].values).float().to(device)\n",
" target_tensor_all = torch.from_numpy(df[target_column].values).float().to(device)\n",
" dates_all = df[date_column].values # 获取日期 numpy 数组\n",
"\n",
" # 4. 定义基于 PyTorch + IC 夏普比率的适应度函数\n",
" def evaluate_torch_cuda_ic_sharpe(individual, data_tensor_all, target_tensor_all, dates_all):\n",
" # 将个体(表达式树)编译成可执行的 PyTorch 函数\n",
" func_torch = toolbox.compile_torch(expr=individual)\n",
"\n",
" try:\n",
" # 应用该函数到 PyTorch 张量 (一次性计算所有日期所有股票的因子值)\n",
" # 处理可能的维度不一致,确保输出是一维或二维 (N, 1) 的张量\n",
" factor_values_tensor = func_torch(*torch.split(data_tensor_all, 1, dim=1))\n",
" if factor_values_tensor.ndim > 1 and factor_values_tensor.shape[1] != 1:\n",
" # 如果输出是 (N, M) 其中 M > 1可能需要一个聚合操作这里暂时返回负适应度\n",
" print(f\"警告: 因子表达式输出张量维度为 {factor_values_tensor.shape},期望 (N, 1)。\")\n",
" return (-1.0,)\n",
" factor_values_tensor = factor_values_tensor.flatten() # 确保是展平的一维张量\n",
"\n",
" # 将 PyTorch 张量移回 CPU 并转换为 NumPy 数组\n",
" factor_values_np = factor_values_tensor.cpu().numpy()\n",
" target_np = target_tensor_all.cpu().numpy().flatten() # 目标也展平\n",
" dates_np = dates_all # 日期已经是 numpy 数组\n",
"\n",
" # 创建一个临时 Pandas DataFrame 以便按日期分组计算每日 IC\n",
" temp_df = pd.DataFrame({\n",
" 'date': dates_np,\n",
" 'factor_value': factor_values_np,\n",
" 'target_value': target_np\n",
" })\n",
"\n",
" # 计算每日 Rank IC\n",
" # 在分组应用 spearmanr 时处理 NaN 和数据点不足的问题\n",
" daily_ics = temp_df.groupby('date').apply(\n",
" lambda x: spearmanr(x['factor_value'], x['target_value'])[0]\n",
" if len(x) >= 2 and x['factor_value'].notna().sum() >= 2 and x['target_value'].notna().sum() >= 2 # 确保分组内有效数据点 >= 2\n",
" else np.nan # 数据点不足或计算失败时返回 NaN\n",
" ).dropna() # 移除 NaN 的每日 IC\n",
"\n",
" # 计算 IC 夏普比率\n",
" if len(daily_ics) < 5: # 需要至少几个有效日 IC 才能计算夏普比率\n",
" # print(f\"警告: 有效日 IC 数量不足 ({len(daily_ics)}),无法计算夏普比率。\")\n",
" return (-1.0,) # 有效日 IC 太少,返回负适应度\n",
"\n",
" ic_mean = daily_ics.mean()\n",
" ic_std = daily_ics.std()\n",
"\n",
" # 处理标准差为零的情况 (非常罕见,可能意味着每日 IC 是常数)\n",
" if ic_std == 0:\n",
" ic_sharpe = ic_mean * 1e6 if ic_mean > 0 else -1.0 # 如果均值>0且标差为0给一个很大的正值\n",
" else:\n",
" ic_sharpe = ic_mean / ic_std\n",
"\n",
" # 返回 IC 夏普比率作为适应度 (需要最大化)\n",
" # 如果计算结果是 NaN (例如mean/std 导致 NaN),返回负值\n",
" return (ic_sharpe if not np.isnan(ic_sharpe) else -1.0,)\n",
"\n",
" except (ValueError, TypeError, ZeroDivisionError, RuntimeError) as e:\n",
" # 打印错误信息和导致错误的个体,以便调试\n",
" print(f\"Error during evaluation for individual {individual}: {e}\")\n",
" return (-1.0,) # 如果计算过程中出现错误,返回一个很小的负值\n",
"\n",
" # 修改 toolbox.register 调用,将 target_tensor 传递给 evaluate_torch_cuda\n",
" toolbox.register(\"evaluate\", evaluate_torch_cuda_ic_sharpe, data_tensor_all=data_tensor_all, target_tensor_all=target_tensor_all, dates_all=dates_all)\n",
" toolbox.register(\"select\", tools.selTournament, tournsize=params.get('tournament_size', 3))\n",
" toolbox.register(\"mate\", gp.cxOnePointLeafBiased, termpb=0.2) # 移除 pset=pset\n",
" toolbox.register(\"mutate\", gp.mutUniform, expr=toolbox.expr_torch, pset=pset_torch) # 使用 PyTorch 原始集\n",
"\n",
" MAX_TREE_DEPTH = 5\n",
"\n",
" toolbox.decorate(\"mate\", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))\n",
" toolbox.decorate(\"mutate\", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))\n",
"\n",
" # 5. 设置进化参数\n",
" population_size = params.get('population_size', 100)\n",
" generations = params.get('generations', 10)\n",
" crossover_probability = params.get('crossover_probability', 0.7) # 调整参数以增加探索\n",
" mutation_probability = params.get('mutation_probability', 0.3) # 调整参数以增加探索\n",
"\n",
" # 6. 初始化种群\n",
" pop = toolbox.population(n=population_size)\n",
" hof = tools.HallOfFame(params.get('hall_of_fame_size', 5)) # 保留最佳的几个个体\n",
" stats = tools.Statistics(lambda ind: ind.fitness.values)\n",
" stats.register(\"avg\", np.mean)\n",
" stats.register(\"std\", np.std)\n",
" stats.register(\"min\", np.min)\n",
" stats.register(\"max\", np.max)\n",
"\n",
" # 7. 运行进化算法\n",
" algorithms.eaSimple(pop, toolbox, cxpb=crossover_probability, mutpb=mutation_probability, ngen=generations,\n",
" stats=stats, halloffame=hof, verbose=True)\n",
"\n",
" # 8. 返回最佳因子表达式\n",
" return hof, stats\n",
"\n",
"params = {\n",
" 'population_size': 64,\n",
" 'generations': 32,\n",
" 'crossover_probability': 0.7,\n",
" 'mutation_probability': 0.3,\n",
" 'tournament_size': 4,\n",
" 'hall_of_fame_size': 3\n",
"}\n",
"\n",
"best_factors_hof, stats = generate_deap_factors_pytorch_v2(df.copy(), numeric_columns, params=params)\n",
"\n",
"print(\"\\nBest Factors Found:\")\n",
"for i, ind in enumerate(best_factors_hof):\n",
" fitness_value = ind.fitness.values[0] # 获取适应度值\n",
" print(f\"Fitness: {fitness_value:.4f}, Factor {i+1}: {ind}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0b3d7551ef0c81f",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T15:39:47.502867Z",
"start_time": "2025-04-13T15:39:47.461434Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"全面因子分析报告 - 特征因子: 'generated_factor'\n",
"------------------------------------------------------------\n",
"整体 Rank IC: 0.0817\n",
"整体 P-value: 0.0000\n",
"------------------------------------------------------------\n",
"计算滚动 Rank IC (按 'D' 窗口)...\n",
"滚动 Rank IC 统计量 (D):\n",
" 均值: 0.0124\n",
" 标准差: 0.2330\n",
" 夏普比率 (IC Mean / IC Std): 0.0531\n",
" T-statistic: 1.4577\n",
" T-statistic P-value: 0.1453\n",
"------------------------------------------------------------\n",
"Hit Ratio (正向 Rank IC 比例): 0.5060\n",
"------------------------------------------------------------\n",
"因子 10 分位数分析 (按因子值从小到大排序):\n",
" 第 1 分位数: 平均 'future_return' = -0.0004\n",
" 第 2 分位数: 平均 'future_return' = -0.0008\n",
" 第 3 分位数: 平均 'future_return' = -0.0004\n",
" 第 4 分位数: 平均 'future_return' = 0.0005\n",
" 第 5 分位数: 平均 'future_return' = 0.0007\n",
" 第 6 分位数: 平均 'future_return' = 0.0015\n",
" 第 7 分位数: 平均 'future_return' = 0.0021\n",
" 第 8 分位数: 平均 'future_return' = 0.0033\n",
" 第 9 分位数: 平均 'future_return' = 0.0054\n",
" 第 10 分位数: 平均 'future_return' = 0.0135\n",
"\n",
"因子值的分位数范围:\n",
" 第 1 分位数: [-1.0490, 0.0581]\n",
" 第 2 分位数: [0.0581, 0.1051]\n",
" 第 3 分位数: [0.1051, 0.1458]\n",
" 第 4 分位数: [0.1458, 0.1881]\n",
" 第 5 分位数: [0.1881, 0.2354]\n",
" 第 6 分位数: [0.2354, 0.2909]\n",
" 第 7 分位数: [0.2909, 0.3594]\n",
" 第 8 分位数: [0.3594, 0.4505]\n",
" 第 9 分位数: [0.4505, 0.5880]\n",
" 第 10 分位数: [0.5880, 1.9782]\n",
"------------------------------------------------------------\n",
"分析完成。\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import torch\n",
"\n",
"target_column = 'future_return'\n",
"# 假设您已经定义了 protected_div_torch 函数\n",
"def protected_div_torch(left, right):\n",
" return torch.where(right != 0, left / right, torch.ones_like(left))\n",
"\n",
"def protected_div_np(left, right):\n",
" \"\"\"安全除法,避免除以零的错误\"\"\"\n",
" return np.where(right != 0, left / right, np.ones_like(left) * np.nan) # 除以零时返回 NaN\n",
"\n",
"def calculate_factor_4(df: pd.DataFrame) -> pd.Series:\n",
" \"\"\"\n",
" 计算因子: sub(add(add(protected_div_torch(pow(pct_chg, std_return_90), cost_95pct), protected_div_torch(industry_act_factor6, cost_95pct)), pow(protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct), protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct))), cos(industry_act_factor1)).\n",
"\n",
" Args:\n",
" df (pd.DataFrame): 包含必要列的数据框。\n",
"\n",
" Returns:\n",
" pd.Series: 计算得到的因子值。\n",
" \"\"\"\n",
" pct_chg = df['pct_chg']\n",
" std_return_90 = df['std_return_90']\n",
" cost_95pct = df['cost_95pct']\n",
" industry_act_factor6 = df['industry_act_factor6']\n",
" act_factor6 = df['act_factor6']\n",
" industry_act_factor1 = df['industry_act_factor1']\n",
"\n",
" # Term 1: protected_div_torch(pow(pct_chg, std_return_90), cost_95pct)\n",
" term1_num = np.power(pct_chg, std_return_90)\n",
" term1 = protected_div_np(term1_num, cost_95pct)\n",
"\n",
" # Term 2: protected_div_torch(industry_act_factor6, cost_95pct)\n",
" term2 = protected_div_np(industry_act_factor6, cost_95pct)\n",
"\n",
" # Term 3: pow(protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct), protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct))\n",
" term3_base_inner = protected_div_np(act_factor6, cost_95pct)\n",
" term3_base = protected_div_np(term3_base_inner, cost_95pct)\n",
" term3_exponent_inner = protected_div_np(act_factor6, cost_95pct)\n",
" term3_exponent = protected_div_np(term3_exponent_inner, cost_95pct)\n",
" term3 = np.power(term3_base, term3_exponent)\n",
"\n",
"\n",
" # Sum of the first three terms\n",
" add_terms = term1 + term2 + term3\n",
"\n",
" # Term 4: cos(industry_act_factor1)\n",
" term4 = np.cos(industry_act_factor1)\n",
"\n",
" # Final factor\n",
" factor4 = add_terms - term4\n",
"\n",
" return factor4\n",
"\n",
"df['generated_factor'] = calculate_factor_4(df)\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.stats import spearmanr, ttest_1samp\n",
"\n",
"def comprehensive_factor_analysis(df: pd.DataFrame, factor_column: str, target_column: str = 'future_return', date_column: str = 'trade_date', rolling_window: str = 'D', n_deciles: int = 10):\n",
" \"\"\"\n",
" 对 DataFrame 中的一个特征因子进行全面分析。\n",
"\n",
" Args:\n",
" df (pd.DataFrame): 包含因子和目标变量的数据框。\n",
" factor_column (str): 要分析的特征因子的列名。\n",
" target_column (str): 目标变量的列名,默认为 'future_return'。\n",
" date_column (str): 包含日期信息的列名,默认为 'trade_date'。\n",
" rolling_window (str): 滚动 Rank IC 的时间窗口(例如 'D' 表示按天,'W' 表示按周)。\n",
" n_deciles (int): 进行分位数分析时使用的分位数数量,默认为 10。\n",
" \"\"\"\n",
" if factor_column not in df.columns:\n",
" print(f\"错误: 特征因子列 '{factor_column}' 不存在于 DataFrame 中。\")\n",
" return\n",
" if target_column not in df.columns:\n",
" print(f\"错误: 目标列 '{target_column}' 不存在于 DataFrame 中。\")\n",
" return\n",
" if date_column not in df.columns:\n",
" print(f\"错误: 日期列 '{date_column}' 不存在于 DataFrame 中。\")\n",
" return\n",
"\n",
" # 确保日期列是 datetime 类型并设置为索引\n",
" df_analy = df.copy()\n",
" df_analy[date_column] = pd.to_datetime(df_analy[date_column])\n",
" df_analy = df_analy.set_index(date_column)\n",
"\n",
" # 移除因子或目标变量为 NaN 的行\n",
" df_analy = df_analy.dropna(subset=[factor_column, target_column])\n",
"\n",
" if len(df_analy) < 2:\n",
" print(\"警告: 有效数据点太少,无法进行分析。\")\n",
" return\n",
"\n",
" print(f\"全面因子分析报告 - 特征因子: '{factor_column}'\")\n",
" print(\"-\" * 60)\n",
"\n",
" # 1. 计算整体 Rank IC\n",
" overall_rank_ic, overall_p_value = spearmanr(df_analy[factor_column], df_analy[target_column])\n",
" print(f\"整体 Rank IC: {overall_rank_ic:.4f}\")\n",
" print(f\"整体 P-value: {overall_p_value:.4f}\")\n",
" print(\"-\" * 60)\n",
"\n",
" # 2. 计算滚动 Rank IC (按指定时间窗口)\n",
" print(f\"计算滚动 Rank IC (按 '{rolling_window}' 窗口)...\")\n",
" rolling_ics = df_analy.groupby(df_analy.index.to_period(rolling_window)).apply(\n",
" lambda x: spearmanr(x[factor_column], x[target_column])[0] if len(x) >= 2 else np.nan\n",
" ).dropna()\n",
"\n",
" if len(rolling_ics) < 2:\n",
" print(\"警告: 滚动 Rank IC 有效周期太少,无法计算统计量。\")\n",
" else:\n",
" # 3. 滚动 IC 统计量\n",
" ic_mean = rolling_ics.mean()\n",
" ic_std = rolling_ics.std()\n",
" ic_sharpe = ic_mean / ic_std if ic_std != 0 else np.nan\n",
" t_statistic, p_value_t = ttest_1samp(rolling_ics, 0) # 检验均值是否显著不为零\n",
"\n",
" print(f\"滚动 Rank IC 统计量 ({rolling_window}):\")\n",
" print(f\" 均值: {ic_mean:.4f}\")\n",
" print(f\" 标准差: {ic_std:.4f}\")\n",
" print(f\" 夏普比率 (IC Mean / IC Std): {ic_sharpe:.4f}\")\n",
" print(f\" T-statistic: {t_statistic:.4f}\")\n",
" print(f\" T-statistic P-value: {p_value_t:.4f}\")\n",
" print(\"-\" * 60)\n",
"\n",
" # 4. Hit Ratio (正向 Rank IC 的比例)\n",
" hit_ratio = (rolling_ics > 0).sum() / len(rolling_ics)\n",
" print(f\"Hit Ratio (正向 Rank IC 比例): {hit_ratio:.4f}\")\n",
" print(\"-\" * 60)\n",
"\n",
" # 5. 分位数分析 (在整个数据集上进行)\n",
" print(f\"因子 {n_deciles} 分位数分析 (按因子值从小到大排序):\")\n",
" df_analy['decile'] = pd.qcut(df_analy[factor_column], q=n_deciles, labels=False, duplicates='drop')\n",
" decile_analysis = df_analy.groupby('decile')[target_column].mean().sort_index()\n",
"\n",
" if len(decile_analysis) > 0:\n",
" for decile, avg_return in decile_analysis.items():\n",
" print(f\" 第 {decile + 1} 分位数: 平均 '{target_column}' = {avg_return:.4f}\")\n",
"\n",
" # 打印每个分位数的因子值范围\n",
" percentiles = np.linspace(0, 100, n_deciles + 1)\n",
" factor_percentiles = df_analy[factor_column].quantile(percentiles / 100)\n",
" print(\"\\n因子值的分位数范围:\")\n",
" # 修复分位数范围打印的 KeyError\n",
" for i in range(len(decile_analysis)): # 确保只打印实际存在的分位数\n",
" lower_bound = factor_percentiles[percentiles[i] / 100]\n",
" upper_bound = factor_percentiles[percentiles[i+1] / 100]\n",
" print(f\" 第 {i + 1} 分位数: [{lower_bound:.4f}, {upper_bound:.4f}]\")\n",
" else:\n",
" print(\"警告: 分位数分析无法执行,可能是因为数据点不足或因子值分布问题。\")\n",
"\n",
"\n",
" print(\"-\" * 60)\n",
" print(\"分析完成。\")\n",
"\n",
"comprehensive_factor_analysis(df, factor_column='generated_factor', target_column='future_return', date_column='trade_date', rolling_window='D', n_deciles=10)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "stock",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}