Files
NewStock/main/factor/generate_factor.ipynb
2025-04-28 11:02:52 +08:00

1029 lines
53 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T07:26:19.000054Z",
"start_time": "2025-04-13T07:26:18.895713Z"
},
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"e:\\PyProject\\NewStock\\main\\factor\n"
]
}
],
"source": [
"# Standard-library imports\n",
"import gc\n",
"import os\n",
"import sys\n",
"# Make the project root importable (this notebook lives under main/factor/)\n",
"sys.path.append('../../')\n",
"print(os.getcwd())\n",
"import pandas as pd\n",
"# Project-local helpers (require the sys.path tweak above)\n",
"from main.factor.factor import get_rolling_factor, get_simple_factor\n",
"from main.utils.factor import read_industry_data\n",
"from main.utils.factor_processor import calculate_score\n",
"from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data\n",
"\n",
"import warnings\n",
"\n",
"# Silence pandas deprecation / chained-assignment noise for the whole run\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f1623b04c7a366af",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T07:30:48.534271Z",
"start_time": "2025-04-13T07:26:19.005576Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"daily data\n",
"daily basic\n",
"inner merge on ['ts_code', 'trade_date']\n",
"stk limit\n",
"left merge on ['ts_code', 'trade_date']\n",
"money flow\n",
"left merge on ['ts_code', 'trade_date']\n",
"cyq perf\n",
"left merge on ['ts_code', 'trade_date']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 5123740 entries, 0 to 5123739\n",
"Data columns (total 31 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object \n",
" 1 trade_date datetime64[ns]\n",
" 2 open float64 \n",
" 3 close float64 \n",
" 4 high float64 \n",
" 5 low float64 \n",
" 6 vol float64 \n",
" 7 pct_chg float64 \n",
" 8 turnover_rate float64 \n",
" 9 pe_ttm float64 \n",
" 10 circ_mv float64 \n",
" 11 volume_ratio float64 \n",
" 12 is_st bool \n",
" 13 up_limit float64 \n",
" 14 down_limit float64 \n",
" 15 buy_sm_vol float64 \n",
" 16 sell_sm_vol float64 \n",
" 17 buy_lg_vol float64 \n",
" 18 sell_lg_vol float64 \n",
" 19 buy_elg_vol float64 \n",
" 20 sell_elg_vol float64 \n",
" 21 net_mf_vol float64 \n",
" 22 his_low float64 \n",
" 23 his_high float64 \n",
" 24 cost_5pct float64 \n",
" 25 cost_15pct float64 \n",
" 26 cost_50pct float64 \n",
" 27 cost_85pct float64 \n",
" 28 cost_95pct float64 \n",
" 29 weight_avg float64 \n",
" 30 winner_rate float64 \n",
"dtypes: bool(1), datetime64[ns](1), float64(28), object(1)\n",
"memory usage: 1.2+ GB\n",
"None\n",
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate']\n",
"Index(['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol',\n",
" 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',\n",
" 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol',\n",
" 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol',\n",
" 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
" 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate',\n",
" 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol',\n",
" 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol',\n",
" 'lg_elg_buy_prop', 'flow_struct_buy_change',\n",
" 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel',\n",
" 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy',\n",
" 'cost_support_15pct_change', 'cat_winner_price_zone',\n",
" 'flow_chip_consistency', 'profit_taking_vs_absorb', '_is_positive',\n",
" '_is_negative', 'cat_is_positive', '_pos_returns', '_neg_returns',\n",
" '_pos_returns_sq', '_neg_returns_sq', 'upside_vol', 'downside_vol',\n",
" 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate',\n",
" 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike',\n",
" 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike',\n",
" 'vol_std_5', 'atr_14', 'atr_6', 'obv'],\n",
" dtype='object')\n",
"Calculating lg_flow_mom_corr_20_60...\n",
"Finished lg_flow_mom_corr_20_60.\n",
"Calculating lg_buy_consolidation_20...\n",
"Finished lg_buy_consolidation_20.\n",
"Calculating lg_flow_accel...\n",
"Finished lg_flow_accel.\n",
"Calculating profit_pressure...\n",
"Finished profit_pressure.\n",
"Calculating underwater_resistance...\n",
"Finished underwater_resistance.\n",
"Calculating cost_conc_std_20...\n",
"Finished cost_conc_std_20.\n",
"Calculating profit_decay_20...\n",
"Finished profit_decay_20.\n",
"Calculating vol_amp_loss_20...\n",
"Finished vol_amp_loss_20.\n",
"Calculating vol_drop_profit_cnt_5...\n",
"Finished vol_drop_profit_cnt_5.\n",
"Calculating lg_flow_vol_interact_20...\n",
"Finished lg_flow_vol_interact_20.\n",
"Calculating cost_break_confirm_cnt_5...\n",
"Finished cost_break_confirm_cnt_5.\n",
"Calculating atr_norm_channel_pos_14...\n",
"Finished atr_norm_channel_pos_14.\n",
"Calculating turnover_diff_skew_20...\n",
"Finished turnover_diff_skew_20.\n",
"Calculating lg_sm_flow_diverge_20...\n",
"Finished lg_sm_flow_diverge_20.\n",
"Calculating pullback_strong_20_20...\n",
"Finished pullback_strong_20_20.\n",
"Calculating vol_wgt_hist_pos_20...\n",
"Finished vol_wgt_hist_pos_20.\n",
"Calculating vol_adj_roc_20...\n",
"Finished vol_adj_roc_20.\n",
"Calculating intraday_lg_flow_corr_20 (Placeholder - complex implementation)...\n",
"Finished intraday_lg_flow_corr_20 (Placeholder).\n",
"Calculating cap_neutral_cost_metric (Placeholder - requires statsmodels)...\n",
"Finished cap_neutral_cost_metric (Placeholder).\n"
]
}
],
"source": [
"# --- Load and merge the raw datasets (keyed on ts_code / trade_date) ---\n",
"print('daily data')\n",
"df = read_and_merge_h5_data('../../data/daily_data.h5', key='daily_data',\n",
"                            columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg'],\n",
"                            df=None)\n",
"\n",
"print('daily basic')\n",
"# Inner join: keep only rows present in both daily_data and daily_basic\n",
"df = read_and_merge_h5_data('../../data/daily_basic.h5', key='daily_basic',\n",
"                            columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',\n",
"                                     'is_st'], df=df, join='inner')\n",
"# Restrict history to 2021 onward (filter_data later re-cuts to 2022-01-01)\n",
"df = df[df['trade_date'] >= '2021-01-01']\n",
"\n",
"print('stk limit')\n",
"df = read_and_merge_h5_data('../../data/stk_limit.h5', key='stk_limit',\n",
"                            columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],\n",
"                            df=df)\n",
"print('money flow')\n",
"df = read_and_merge_h5_data('../../data/money_flow.h5', key='money_flow',\n",
"                            columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol',\n",
"                                     'sell_lg_vol',\n",
"                                     'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],\n",
"                            df=df)\n",
"print('cyq perf')\n",
"df = read_and_merge_h5_data('../../data/cyq_perf.h5', key='cyq_perf',\n",
"                            columns=['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
"                                     'cost_50pct',\n",
"                                     'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],\n",
"                            df=df)\n",
"print(df.info())\n",
"\n",
"# Snapshot of the pre-factor column names, excluding any 'cyq'-named columns\n",
"origin_columns = df.columns.tolist()\n",
"origin_columns = [col for col in origin_columns if 'cyq' not in col]\n",
"print(origin_columns)\n",
"\n",
"\n",
"def filter_data(df):\n",
"    \"\"\"Restrict the stock universe.\n",
"\n",
"    Drops ST names, Beijing-exchange codes ('BJ' suffix), ChiNext ('30'),\n",
"    STAR ('68') and NEEQ-style ('8') codes, keeps trades from 2022-01-01\n",
"    onward, removes the 'in_date' column if present, and resets the index.\n",
"    \"\"\"\n",
"    keep = (\n",
"        ~df['is_st']\n",
"        & ~df['ts_code'].str.endswith('BJ')\n",
"        & ~df['ts_code'].str.startswith(('30', '68', '8'))\n",
"        & (df['trade_date'] >= '2022-01-01')\n",
"    )\n",
"    df = df[keep]\n",
"    if 'in_date' in df.columns:\n",
"        df = df.drop(columns=['in_date'])\n",
"    return df.reset_index(drop=True)\n",
"\n",
"\n",
"gc.collect()\n",
"\n",
"df = filter_data(df)\n",
"# Project-defined rolling and simple factor generators (return new df + extras)\n",
"df, _ = get_rolling_factor(df)\n",
"df, _ = get_simple_factor(df)\n",
"# NOTE(review): wildcard import brings the factor functions called below into scope\n",
"from main.factor.factor import *\n",
"lg_flow_mom_corr(df, N=20, M=60)\n",
"lg_buy_consolidation(df, N=20)\n",
"lg_flow_accel(df)\n",
"profit_pressure(df)\n",
"underwater_resistance(df)\n",
"cost_conc_std(df, N=20)\n",
"profit_decay(df, N=20)\n",
"vol_amp_loss(df, N=20)\n",
"vol_drop_profit_cnt(df, N=20, M=5)\n",
"lg_flow_vol_interact(df, N=20)\n",
"cost_break_confirm_cnt(df, M=5)\n",
"atr_norm_channel_pos(df, N=14)\n",
"turnover_diff_skew(df, N=20)\n",
"lg_sm_flow_diverge(df, N=20)\n",
"pullback_strong(df, N=20, M=20)\n",
"vol_wgt_hist_pos(df, N=20)\n",
"vol_adj_roc(df, N=20)\n",
"intraday_lg_flow_corr(df, N=20) # Placeholder\n",
"cap_neutral_cost_metric(df) # Placeholder\n",
"# hurst_exponent_flow(df, N=60) # Placeholder\n",
"# df['test'] = 1\n",
"# df['test2'] = 2\n",
"# df = df.merge(industry_df, on=['l2_code', 'trade_date'], how='left')\n",
"# Attach the level-2 industry classification per stock\n",
"l2_df = read_and_merge_h5_data('../../data/industry_data.h5', key='industry_data',\n",
"                               columns=['ts_code', 'l2_code', 'in_date'],\n",
"                               df=None, on=['ts_code'], join='left')\n",
"df = merge_with_industry_data(df, l2_df)\n",
"df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
"# df = df.merge(index_data, on='trade_date', how='left')\n",
"\n",
"# Label horizon in trading days\n",
"days = 2\n",
"df = df.sort_values(by=['ts_code', 'trade_date'])\n",
"# df['future_return'] = df.groupby('ts_code', group_keys=False)['close'].apply(lambda x: x.shift(-days) / x - 1)\n",
"# future_return: buy at next day's open, exit at the close `days` ahead\n",
"df['future_return'] = (df.groupby('ts_code')['close'].shift(-days) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
"                      df.groupby('ts_code')['open'].shift(-1)\n",
"# df['future_return'] = df.groupby('ts_code')['pct_chg'].shift(-1)\n",
"# future_return2: next-day open-to-close return\n",
"df['future_return2'] = (df.groupby('ts_code')['close'].shift(-1) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
"                       df.groupby('ts_code')['open'].shift(-1)\n",
"\n",
"# Forward-looking volatility of daily pct changes over the label horizon\n",
"df['future_volatility'] = (\n",
"    df.groupby('ts_code')['pct_chg']\n",
"    .transform(lambda x: x.rolling(days).std().shift(-days))\n",
")\n",
"df['future_score'] = calculate_score(df, days=days, lambda_param=0.3)\n",
"\n",
"\n",
"def select_pre_zt_stocks_dynamic(stock_df):\n",
"    # Per trade date, keep (up to) the top-1000 stocks ranked by return_5.\n",
"    def select_stocks(group):\n",
"        return group.nlargest(1000, 'return_5')  # if not enough labels are found, fall back to the largest available selection\n",
"\n",
"    stock_df = stock_df.groupby('trade_date', group_keys=False).apply(select_stocks)\n",
"    return stock_df\n",
"\n",
"\n",
"gc.collect()\n",
"\n",
"# df = select_pre_zt_stocks_dynamic(df[(df['trade_date'] >= '2022-01-01') & (df['trade_date'] <= '2029-04-07')])\n",
"\n",
"# Join industry-level (SW) daily features onto each stock row\n",
"industry_df = read_industry_data('../../data/sw_daily.h5')\n",
"df = df.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1c1dd3d6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 
'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_buy_consolidation_20', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'intraday_lg_flow_corr_20', 'cap_neutral_cost_metric', 'in_date', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
]
}
],
"source": [
"# Build the feature list: start from every column, then drop identifiers,\n",
"# labels, and leakage-prone / unusable columns.\n",
"# (Fixes the original tautology `[col for col in df.columns if col in df.columns]`\n",
"# and collapses seven sequential re-scans of the list into one pass.)\n",
"feature_columns = list(df.columns)\n",
"\n",
"# Exact-name exclusions: row identifiers and the label itself.\n",
"exclude_exact = {'trade_date', 'ts_code', 'label'}\n",
"# Substring exclusions: forward-looking targets, scores, flags, and the\n",
"# industry code (categorical, handled separately).\n",
"exclude_substrings = ('future', 'label', 'score', 'gen', 'is_st', 'pe_ttm', 'cat_l2_code')\n",
"feature_columns = [\n",
"    col for col in feature_columns\n",
"    if col not in exclude_exact\n",
"    and not any(token in col for token in exclude_substrings)\n",
"    and not col.startswith('_')  # private/intermediate helper columns\n",
"]\n",
"\n",
"print(feature_columns)\n",
"# Keep only numeric features for downstream model input.\n",
"numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
"numeric_columns = [col for col in numeric_columns if col in feature_columns]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2c60c1ea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"每个特征列中的 NaN 值数量(字典形式):\n",
"ts_code: 0\n",
"trade_date: 0\n",
"open: 0\n",
"close: 0\n",
"high: 0\n",
"low: 0\n",
"vol: 0\n",
"pct_chg: 0\n",
"turnover_rate: 0\n",
"pe_ttm: 499616\n",
"circ_mv: 0\n",
"volume_ratio: 791\n",
"is_st: 0\n",
"up_limit: 0\n",
"down_limit: 0\n",
"buy_sm_vol: 7\n",
"sell_sm_vol: 7\n",
"buy_lg_vol: 7\n",
"sell_lg_vol: 7\n",
"buy_elg_vol: 7\n",
"sell_elg_vol: 7\n",
"net_mf_vol: 7\n",
"his_low: 24695\n",
"his_high: 24695\n",
"cost_5pct: 24695\n",
"cost_15pct: 24695\n",
"cost_50pct: 24695\n",
"cost_85pct: 24695\n",
"cost_95pct: 24695\n",
"weight_avg: 24695\n",
"winner_rate: 24695\n",
"lg_elg_net_buy_vol: 7\n",
"flow_lg_elg_intensity: 7\n",
"sm_net_buy_vol: 7\n",
"flow_divergence_diff: 7\n",
"flow_divergence_ratio: 7\n",
"total_buy_vol: 7\n",
"lg_elg_buy_prop: 7\n",
"flow_struct_buy_change: 3287\n",
"lg_elg_net_buy_vol_change: 3287\n",
"flow_lg_elg_accel: 6567\n",
"chip_concentration_range: 24695\n",
"chip_skewness: 24695\n",
"floating_chip_proxy: 24695\n",
"cost_support_15pct_change: 27855\n",
"cat_winner_price_zone: 0\n",
"flow_chip_consistency: 7\n",
"profit_taking_vs_absorb: 7\n",
"cat_is_positive: 0\n",
"upside_vol: 29581\n",
"downside_vol: 29655\n",
"vol_ratio: 0\n",
"return_skew: 13096\n",
"return_kurtosis: 13096\n",
"volume_change_rate: 29466\n",
"cat_volume_breakout: 0\n",
"turnover_deviation: 6548\n",
"cat_turnover_spike: 0\n",
"avg_volume_ratio: 7341\n",
"cat_volume_ratio_breakout: 0\n",
"vol_spike: 62074\n",
"vol_std_5: 16370\n",
"atr_14: 45836\n",
"atr_6: 19644\n",
"obv: 0\n",
"maobv_6: 16370\n",
"rsi_3: 9822\n",
"return_5: 16370\n",
"return_20: 65315\n",
"std_return_5: 16370\n",
"std_return_90: 291770\n",
"std_return_90_2: 323906\n",
"act_factor1: 16370\n",
"act_factor2: 42562\n",
"act_factor3: 65315\n",
"act_factor4: 194886\n",
"rank_act_factor1: 16370\n",
"rank_act_factor2: 42562\n",
"rank_act_factor3: 65315\n",
"log_circ_mv: 0\n",
"cov: 13096\n",
"delta_cov: 29466\n",
"alpha_22_improved: 62074\n",
"alpha_003: 0\n",
"alpha_007: 13120\n",
"alpha_013: 62074\n",
"cat_up_limit: 0\n",
"cat_down_limit: 0\n",
"up_limit_count_10d: 0\n",
"down_limit_count_10d: 0\n",
"consecutive_up_limit: 0\n",
"vol_break: 0\n",
"weight_roc5: 40531\n",
"price_cost_divergence: 93280\n",
"smallcap_concentration: 24695\n",
"cost_stability: 85077\n",
"high_cost_break_days: 13096\n",
"liquidity_risk: 53215\n",
"turnover_std: 62074\n",
"mv_volatility: 62074\n",
"volume_growth: 65315\n",
"mv_growth: 65315\n",
"arbr: 9822\n",
"momentum_factor: 29466\n",
"resonance_factor: 791\n",
"log_close: 0\n",
"cat_vol_spike: 0\n",
"up: 0\n",
"down: 0\n",
"obv_maobv_6: 16370\n",
"std_return_5_over_std_return_90: 291770\n",
"std_return_90_minus_std_return_90_2: 323906\n",
"cat_af2: 0\n",
"cat_af3: 0\n",
"cat_af4: 0\n",
"act_factor5: 194886\n",
"act_factor6: 42562\n",
"active_buy_volume_large: 13\n",
"active_buy_volume_big: 79\n",
"active_buy_volume_small: 7\n",
"buy_lg_vol_minus_sell_lg_vol: 8\n",
"buy_elg_vol_minus_sell_elg_vol: 69\n",
"ctrl_strength: 24695\n",
"low_cost_dev: 24695\n",
"asymmetry: 24701\n",
"lock_factor: 24695\n",
"cat_vol_break: 0\n",
"cost_atr_adj: 69060\n",
"cat_golden_resonance: 0\n",
"mv_turnover_ratio: 0\n",
"mv_adjusted_volume: 0\n",
"mv_weighted_turnover: 0\n",
"nonlinear_mv_volume: 0\n",
"mv_volume_ratio: 791\n",
"mv_momentum: 791\n",
"lg_flow_mom_corr_20_60: 1186\n",
"lg_buy_consolidation_20: 1950902\n",
"lg_flow_accel: 6567\n",
"profit_pressure: 24695\n",
"underwater_resistance: 24695\n",
"cost_conc_std_20: 29466\n",
"profit_decay_20: 0\n",
"vol_amp_loss_20: 53215\n",
"vol_drop_profit_cnt_5: 0\n",
"lg_flow_vol_interact_20: 29466\n",
"cost_break_confirm_cnt_5: 0\n",
"atr_norm_channel_pos_14: 0\n",
"turnover_diff_skew_20: 32740\n",
"lg_sm_flow_diverge_20: 29466\n",
"pullback_strong_20_20: 0\n",
"vol_wgt_hist_pos_20: 0\n",
"vol_adj_roc_20: 0\n",
"intraday_lg_flow_corr_20: 2431461\n",
"cap_neutral_cost_metric: 2431461\n",
"cat_l2_code: 290\n",
"in_date: 65486\n",
"future_return: 6548\n",
"future_return2: 3274\n",
"future_volatility: 6548\n",
"score: 6548\n",
"future_score: 6548\n",
"industry_obv: 11272\n",
"industry_return_5: 11272\n",
"industry_return_20: 11272\n",
"industry__ema_5: 11272\n",
"industry__ema_13: 11272\n",
"industry__ema_20: 11272\n",
"industry__ema_60: 11272\n",
"industry_act_factor1: 11272\n",
"industry_act_factor2: 11272\n",
"industry_act_factor3: 11272\n",
"industry_act_factor4: 11272\n",
"industry_act_factor5: 11272\n",
"industry_act_factor6: 11272\n",
"industry_rank_act_factor1: 11272\n",
"industry_rank_act_factor2: 11272\n",
"industry_rank_act_factor3: 11272\n",
"industry_return_5_percentile: 11272\n",
"industry_return_20_percentile: 11272\n",
"['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_std_5', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'std_return_5', 'act_factor1', 'rank_act_factor1', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_003', 'alpha_007', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'smallcap_concentration', 'high_cost_break_days', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'cat_af2', 'cat_af3', 'cat_af4', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 
'atr_norm_channel_pos_14', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
]
}
],
"source": [
"def count_nan_and_inf_per_feature(df: pd.DataFrame):\n",
"    \"\"\"Count NaN values per column of ``df``.\n",
"\n",
"    Note: despite the name, Inf counting is currently disabled (see the\n",
"    commented-out line below); only NaN counts are returned.\n",
"\n",
"    Args:\n",
"        df: the pandas DataFrame to analyse.\n",
"\n",
"    Returns:\n",
"        pd.Series: index is the column name, value is that column's NaN count.\n",
"    \"\"\"\n",
"    nan_counts = df.isna().sum()\n",
"    # inf_counts = np.isinf(df).sum()\n",
"    return nan_counts\n",
"\n",
"\n",
"nan_counts_series = count_nan_and_inf_per_feature(df)\n",
"\n",
"# Alternatively, obtain the result as a plain dict:\n",
"nan_counts_dict = nan_counts_series.to_dict()\n",
"print(\"\\n每个特征列中的 NaN 值数量(字典形式):\")\n",
"for k, v in nan_counts_dict.items():\n",
"    print(f'{k}: {v}')\n",
"    # Drop features with too many missing values (threshold: 30,000 rows)\n",
"    if v > 30000 and k in feature_columns:\n",
"        feature_columns.remove(k)\n",
"print(feature_columns)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e088bd8a357e815a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T15:39:47.461434Z",
"start_time": "2025-04-13T15:39:44.369664Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gen\tnevals\tavg \tstd \tmin\tmax \n",
"0 \t64 \t-0.387605\t0.492269\t-1 \t0.84339\n",
"1 \t52 \t-0.0280574\t0.328787\t-1 \t0.84339\n",
"2 \t56 \t-0.0442643\t0.498012\t-1 \t0.84339\n",
"3 \t50 \t0.0843881 \t0.506873\t-1 \t0.84339\n",
"4 \t56 \t0.128797 \t0.586781\t-1 \t0.84339\n",
"5 \t52 \t0.107366 \t0.586957\t-1 \t0.918103\n",
"6 \t52 \t0.0602483 \t0.666345\t-1 \t0.918103\n",
"7 \t54 \t0.177717 \t0.561644\t-1 \t0.918103\n",
"8 \t57 \t0.206183 \t0.620791\t-1 \t0.957887\n",
"9 \t51 \t0.253306 \t0.667259\t-1 \t1.07875 \n",
"10 \t53 \t0.19914 \t0.681541\t-1 \t1.14356 \n",
"11 \t54 \t0.173093 \t0.752007\t-1 \t1.24408 \n",
"12 \t53 \t0.429303 \t0.636249\t-1.07606\t1.24408 \n",
"13 \t46 \t0.443469 \t0.754764\t-1.17052\t1.24408 \n",
"14 \t57 \t0.412168 \t0.719715\t-1.2066 \t1.24408 \n",
"15 \t47 \t0.420095 \t0.833547\t-1.20899\t1.25608 \n",
"16 \t46 \t0.516075 \t0.916347\t-1.16556\t1.25765 \n",
"17 \t48 \t0.52129 \t0.872883\t-1 \t1.30663 \n",
"18 \t53 \t0.530992 \t0.923366\t-1 \t1.3677 \n",
"19 \t54 \t0.569299 \t0.861833\t-1.39138\t1.3677 \n",
"20 \t51 \t0.538589 \t0.883032\t-1.12472\t1.3677 \n",
"21 \t49 \t0.684813 \t0.874059\t-1 \t1.3677 \n",
"22 \t46 \t0.659823 \t0.86879 \t-1.17051\t1.3677 \n",
"23 \t42 \t0.678971 \t0.886044\t-1.39138\t1.3677 \n",
"24 \t55 \t0.639381 \t0.905808\t-1.39138\t1.37645 \n",
"25 \t42 \t0.721136 \t0.915513\t-1.30205\t1.39372 \n",
"26 \t56 \t0.695918 \t0.849837\t-1.0437 \t1.39372 \n",
"27 \t56 \t0.465007 \t0.934313\t-1 \t1.39372 \n",
"28 \t51 \t0.714563 \t0.88635 \t-1.13547\t1.43745 \n",
"29 \t49 \t0.687478 \t0.84568 \t-1 \t1.43745 \n",
"30 \t50 \t0.646657 \t0.835957\t-1 \t1.43745 \n",
"31 \t49 \t0.615978 \t0.939622\t-1.04846\t1.43745 \n",
"32 \t49 \t0.654171 \t0.973861\t-1.12771\t1.43745 \n",
"\n",
"Best Factors Found:\n",
"Fitness: 1.4375, Factor 1: protected_div_torch(mul(protected_div_torch(add(return_kurtosis, profit_pressure), mul(cost_85pct, buy_elg_vol_minus_sell_elg_vol)), protected_div_torch(cost_break_confirm_cnt_5, pow(cos(lg_flow_vol_interact_20), cos(chip_concentration_range)))), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
"Fitness: 1.3937, Factor 2: protected_div_torch(mul(protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
"Fitness: 1.3843, Factor 3: protected_div_torch(mul(protected_div_torch(protected_div_torch(profit_pressure, pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n"
]
}
],
"source": [
"from deap import creator, gp, tools, base, algorithms\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from scipy.stats import spearmanr\n",
"import operator\n",
"\n",
"# Protected division (PyTorch version): yields 1 where the divisor is 0.\n",
"def protected_div_torch(left, right):\n",
"    \"\"\"Element-wise left / right, substituting 1.0 wherever right == 0.\n",
"\n",
"    Note: torch.where evaluates ``left / right`` for every element first,\n",
"    so zero divisors still produce inf/nan internally before being masked.\n",
"    \"\"\"\n",
"    return torch.where(right != 0, left / right, torch.ones_like(left))\n",
"\n",
"def generate_deap_factors_pytorch_v2(df: pd.DataFrame, numeric_columns: list, target_column: str = 'future_return', date_column: str = 'trade_date', params: dict = None, random_state: int = 42):\n",
" \"\"\"\n",
" 使用 deap 库通过遗传编程生成新的因子,并使用 PyTorch 算子和计算,过滤 NaN 值。\n",
"\n",
" Args:\n",
" df (pd.DataFrame): 包含因子和目标变量的数据框。\n",
" numeric_columns (list): 数值型因子列名的列表。\n",
" target_column (str): 目标变量的列名,默认为 'future_return'。\n",
" params (dict): deap 进化算法的参数字典。\n",
" random_state (int): 随机种子,用于保证结果的可重复性。\n",
"\n",
" Returns:\n",
" list: 包含最佳因子表达式的列表。\n",
" \"\"\"\n",
" if params is None:\n",
" params = {}\n",
"\n",
" # 设置随机种子\n",
" np.random.seed(random_state)\n",
" torch.manual_seed(random_state)\n",
"\n",
" # 1. 定义原始集 (Primitive Set) - 使用 PyTorch 算子\n",
" pset_torch = gp.PrimitiveSet(\"PYTORCH\", arity=len(numeric_columns))\n",
" pset_torch.addPrimitive(torch.add, 2)\n",
" pset_torch.addPrimitive(torch.sub, 2)\n",
" pset_torch.addPrimitive(torch.mul, 2)\n",
" pset_torch.addPrimitive(protected_div_torch, 2) # 使用 PyTorch 保护性除法\n",
" # 新增的复杂算子\n",
" pset_torch.addPrimitive(torch.sin, 1) # 正弦函数 (一元算子)\n",
" pset_torch.addPrimitive(torch.cos, 1) # 余弦函数 (一元算子)\n",
" # pset_torch.addPrimitive(torch.abs, 1) # 绝对值 (一元算子)\n",
" # pset_torch.addPrimitive(torch.sqrt, 1) # 平方根 (一元算子)\n",
" pset_torch.addPrimitive(torch.pow, 2) # 指数运算 (二元算子,例如 x 的 y 次方)\n",
" # pset_torch.addPrimitive(torch.tanh, 1) # 双曲正切函数 (一元算子)\n",
"\n",
" # def rate_of_change_torch(x, y): # 计算 y 相对于 x 的变化率\n",
" # return (y - x) / (x + 1e-8)\n",
" # pset_torch.addPrimitive(rate_of_change_torch, 2)\n",
"\n",
" # def covariance_like_torch(x, y):\n",
" # mean_x = torch.mean(x, dim=0, keepdim=True) # 保持维度以便广播\n",
" # mean_y = torch.mean(y, dim=0, keepdim=True)\n",
" # return (x - mean_x) * (y - mean_y)\n",
"\n",
" # pset_torch.addPrimitive(covariance_like_torch, 2)\n",
"\n",
" # 将 numeric_columns 作为终端添加到原始集\n",
" pset_torch.renameArguments(**{f\"ARG{i}\": col for i, col in enumerate(numeric_columns)})\n",
"\n",
" # 2. 定义适应度和个体\n",
" # 目标是最大化 IC 夏普比率\n",
" creator.create(\"FitnessMax\", base.Fitness, weights=(1.0,))\n",
" creator.create(\"Individual\", gp.PrimitiveTree, fitness=creator.FitnessMax)\n",
"\n",
" # 3. 创建工具箱 (Toolbox)\n",
" toolbox = base.Toolbox()\n",
" toolbox.register(\"expr_torch\", gp.genHalfAndHalf, pset=pset_torch, min_=1, max_=3) # 调整 min_/max_ 以控制表达式复杂性\n",
" toolbox.register(\"individual\", tools.initIterate, creator.Individual, toolbox.expr_torch)\n",
" toolbox.register(\"population\", tools.initRepeat, list, toolbox.individual)\n",
" toolbox.register(\"compile_torch\", gp.compile, pset=pset_torch) # 编译为 PyTorch 函数\n",
"\n",
" # 准备 PyTorch 张量数据 (所有日期所有股票)\n",
" device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
" data_tensor_all = torch.from_numpy(df[numeric_columns].values).float().to(device)\n",
" target_tensor_all = torch.from_numpy(df[target_column].values).float().to(device)\n",
" dates_all = df[date_column].values # 获取日期 numpy 数组\n",
"\n",
" # 4. 定义基于 PyTorch + IC 夏普比率的适应度函数\n",
"    def evaluate_torch_cuda_ic_sharpe(individual, data_tensor_all, target_tensor_all, dates_all):\n",
"        \"\"\"Fitness function: compile the GP tree to a PyTorch function, evaluate\n",
"        the factor over all rows at once, then score the individual by the\n",
"        Sharpe ratio of its daily Rank ICs (mean / std of per-date Spearman IC).\n",
"\n",
"        Returns a 1-tuple (DEAP fitness convention); (-1.0,) on any failure.\n",
"        \"\"\"\n",
"        # Compile the individual (expression tree) into an executable PyTorch function\n",
"        func_torch = toolbox.compile_torch(expr=individual)\n",
"\n",
"        try:\n",
"            # Apply the compiled function to the full tensor in one shot:\n",
"            # torch.split(..., 1, dim=1) passes each input column as a separate (N, 1) tensor.\n",
"            factor_values_tensor = func_torch(*torch.split(data_tensor_all, 1, dim=1))\n",
"            if factor_values_tensor.ndim > 1 and factor_values_tensor.shape[1] != 1:\n",
"                # An (N, M) output with M > 1 has no per-stock interpretation here;\n",
"                # penalise the individual instead of guessing an aggregation.\n",
"                print(f\"警告: 因子表达式输出张量维度为 {factor_values_tensor.shape},期望 (N, 1)。\")\n",
"                return (-1.0,)\n",
"            factor_values_tensor = factor_values_tensor.flatten()  # ensure a flat 1-D tensor\n",
"\n",
"            # Move results back to CPU / NumPy for the pandas grouping below\n",
"            factor_values_np = factor_values_tensor.cpu().numpy()\n",
"            target_np = target_tensor_all.cpu().numpy().flatten()  # flatten target as well\n",
"            dates_np = dates_all  # already a NumPy array\n",
"\n",
"            # Temporary frame so daily ICs can be computed with a single groupby\n",
"            temp_df = pd.DataFrame({\n",
"                'date': dates_np,\n",
"                'factor_value': factor_values_np,\n",
"                'target_value': target_np\n",
"            })\n",
"\n",
"            # Daily Rank IC (Spearman); NaN when a date has fewer than 2 valid points\n",
"            daily_ics = temp_df.groupby('date').apply(\n",
"                lambda x: spearmanr(x['factor_value'], x['target_value'])[0]\n",
"                if len(x) >= 2 and x['factor_value'].notna().sum() >= 2 and x['target_value'].notna().sum() >= 2\n",
"                else np.nan\n",
"            ).dropna()  # discard dates whose IC could not be computed\n",
"\n",
"            # Need a handful of valid daily ICs before a Sharpe ratio is meaningful\n",
"            if len(daily_ics) < 5:\n",
"                return (-1.0,)\n",
"\n",
"            ic_mean = daily_ics.mean()\n",
"            ic_std = daily_ics.std()\n",
"\n",
"            # Degenerate case: constant daily IC series (std == 0)\n",
"            if ic_std == 0:\n",
"                ic_sharpe = ic_mean * 1e6 if ic_mean > 0 else -1.0  # reward constant positive IC strongly\n",
"            else:\n",
"                ic_sharpe = ic_mean / ic_std\n",
"\n",
"            # Maximise the IC Sharpe ratio; guard against NaN from mean/std\n",
"            return (ic_sharpe if not np.isnan(ic_sharpe) else -1.0,)\n",
"\n",
"        except (ValueError, TypeError, ZeroDivisionError, RuntimeError) as e:\n",
"            # Log the failing individual for debugging, then penalise it\n",
"            print(f\"Error during evaluation for individual {individual}: {e}\")\n",
"            return (-1.0,)\n",
"\n",
" # 修改 toolbox.register 调用,将 target_tensor 传递给 evaluate_torch_cuda\n",
" toolbox.register(\"evaluate\", evaluate_torch_cuda_ic_sharpe, data_tensor_all=data_tensor_all, target_tensor_all=target_tensor_all, dates_all=dates_all)\n",
" toolbox.register(\"select\", tools.selTournament, tournsize=params.get('tournament_size', 3))\n",
" toolbox.register(\"mate\", gp.cxOnePointLeafBiased, termpb=0.2) # 移除 pset=pset\n",
" toolbox.register(\"mutate\", gp.mutUniform, expr=toolbox.expr_torch, pset=pset_torch) # 使用 PyTorch 原始集\n",
"\n",
" MAX_TREE_DEPTH = 5\n",
"\n",
" toolbox.decorate(\"mate\", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))\n",
" toolbox.decorate(\"mutate\", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))\n",
"\n",
" # 5. 设置进化参数\n",
" population_size = params.get('population_size', 100)\n",
" generations = params.get('generations', 10)\n",
" crossover_probability = params.get('crossover_probability', 0.7) # 调整参数以增加探索\n",
" mutation_probability = params.get('mutation_probability', 0.3) # 调整参数以增加探索\n",
"\n",
" # 6. 初始化种群\n",
" pop = toolbox.population(n=population_size)\n",
" hof = tools.HallOfFame(params.get('hall_of_fame_size', 5)) # 保留最佳的几个个体\n",
" stats = tools.Statistics(lambda ind: ind.fitness.values)\n",
" stats.register(\"avg\", np.mean)\n",
" stats.register(\"std\", np.std)\n",
" stats.register(\"min\", np.min)\n",
" stats.register(\"max\", np.max)\n",
"\n",
" # 7. 运行进化算法\n",
" algorithms.eaSimple(pop, toolbox, cxpb=crossover_probability, mutpb=mutation_probability, ngen=generations,\n",
" stats=stats, halloffame=hof, verbose=True)\n",
"\n",
" # 8. 返回最佳因子表达式\n",
" return hof, stats\n",
"\n",
"# Evolutionary-search hyperparameters (read via params.get(...) inside the generator)\n",
"params = {\n",
"    'population_size': 64,    # individuals per generation\n",
"    'generations': 32,        # number of evolution rounds\n",
"    'crossover_probability': 0.7,\n",
"    'mutation_probability': 0.3,\n",
"    'tournament_size': 4,     # selection pressure for tools.selTournament\n",
"    'hall_of_fame_size': 3    # number of best individuals retained\n",
"}\n",
"\n",
"best_factors_hof, stats = generate_deap_factors_pytorch_v2(df.copy(), numeric_columns, params=params)\n",
"\n",
"print(\"\\nBest Factors Found:\")\n",
"for i, ind in enumerate(best_factors_hof):\n",
"    fitness_value = ind.fitness.values[0]  # fitness = IC Sharpe ratio from the evaluator\n",
"    print(f\"Fitness: {fitness_value:.4f}, Factor {i+1}: {ind}\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a0b3d7551ef0c81f",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-13T15:39:47.502867Z",
"start_time": "2025-04-13T15:39:47.461434Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"全面因子分析报告 - 特征因子: 'generated_factor'\n",
"------------------------------------------------------------\n",
"整体 Rank IC: 0.0817\n",
"整体 P-value: 0.0000\n",
"------------------------------------------------------------\n",
"计算滚动 Rank IC (按 'D' 窗口)...\n",
"滚动 Rank IC 统计量 (D):\n",
" 均值: 0.0124\n",
" 标准差: 0.2330\n",
" 夏普比率 (IC Mean / IC Std): 0.0531\n",
" T-statistic: 1.4577\n",
" T-statistic P-value: 0.1453\n",
"------------------------------------------------------------\n",
"Hit Ratio (正向 Rank IC 比例): 0.5060\n",
"------------------------------------------------------------\n",
"因子 10 分位数分析 (按因子值从小到大排序):\n",
" 第 1 分位数: 平均 'future_return' = -0.0004\n",
" 第 2 分位数: 平均 'future_return' = -0.0008\n",
" 第 3 分位数: 平均 'future_return' = -0.0004\n",
" 第 4 分位数: 平均 'future_return' = 0.0005\n",
" 第 5 分位数: 平均 'future_return' = 0.0007\n",
" 第 6 分位数: 平均 'future_return' = 0.0015\n",
" 第 7 分位数: 平均 'future_return' = 0.0021\n",
" 第 8 分位数: 平均 'future_return' = 0.0033\n",
" 第 9 分位数: 平均 'future_return' = 0.0054\n",
" 第 10 分位数: 平均 'future_return' = 0.0135\n",
"\n",
"因子值的分位数范围:\n",
" 第 1 分位数: [-1.0490, 0.0581]\n",
" 第 2 分位数: [0.0581, 0.1051]\n",
" 第 3 分位数: [0.1051, 0.1458]\n",
" 第 4 分位数: [0.1458, 0.1881]\n",
" 第 5 分位数: [0.1881, 0.2354]\n",
" 第 6 分位数: [0.2354, 0.2909]\n",
" 第 7 分位数: [0.2909, 0.3594]\n",
" 第 8 分位数: [0.3594, 0.4505]\n",
" 第 9 分位数: [0.4505, 0.5880]\n",
" 第 10 分位数: [0.5880, 1.9782]\n",
"------------------------------------------------------------\n",
"分析完成。\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import torch\n",
"\n",
"target_column = 'future_return'\n",
"# 假设您已经定义了 protected_div_torch 函数\n",
"def protected_div_torch(left, right):\n",
"    \"\"\"Element-wise division that returns 1.0 wherever ``right`` is 0.\n",
"\n",
"    The denominator is replaced by 1 *before* dividing, so the masked lanes\n",
"    never actually evaluate x / 0 (which would produce inf/nan and, under\n",
"    autograd, a NaN gradient even though torch.where discards the value).\n",
"    Output is identical to torch.where(right != 0, left / right, 1).\n",
"    \"\"\"\n",
"    mask = right != 0\n",
"    safe_right = torch.where(mask, right, torch.ones_like(right))  # 1 where denominator is 0\n",
"    return torch.where(mask, left / safe_right, torch.ones_like(left))\n",
"\n",
"def protected_div_np(left, right):\n",
"    \"\"\"Element-wise division that yields NaN wherever ``right`` is 0.\n",
"\n",
"    np.where evaluates both branches eagerly, so left / right is still\n",
"    computed for zero denominators; the errstate context silences those\n",
"    spurious divide/invalid RuntimeWarnings locally instead of relying on\n",
"    the notebook-wide warnings filter. Returned values are unchanged.\n",
"    \"\"\"\n",
"    with np.errstate(divide='ignore', invalid='ignore'):\n",
"        return np.where(right != 0, left / right, np.nan)  # NaN on division by zero\n",
"\n",
"def calculate_factor_4(df: pd.DataFrame) -> pd.Series:\n",
"    \"\"\"\n",
"    Compute the GP-generated factor:\n",
"    sub(add(add(protected_div_torch(pow(pct_chg, std_return_90), cost_95pct), protected_div_torch(industry_act_factor6, cost_95pct)), pow(protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct), protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct))), cos(industry_act_factor1)).\n",
"\n",
"    The torch-named divisions in the expression are evaluated here with the\n",
"    NumPy equivalent protected_div_np (which yields NaN on division by zero).\n",
"\n",
"    Args:\n",
"        df (pd.DataFrame): frame containing the columns 'pct_chg',\n",
"            'std_return_90', 'cost_95pct', 'industry_act_factor6',\n",
"            'act_factor6' and 'industry_act_factor1'.\n",
"\n",
"    Returns:\n",
"        pd.Series: the computed factor values (NaN where undefined).\n",
"    \"\"\"\n",
"    pct_chg = df['pct_chg']\n",
"    std_return_90 = df['std_return_90']\n",
"    cost_95pct = df['cost_95pct']\n",
"    industry_act_factor6 = df['industry_act_factor6']\n",
"    act_factor6 = df['act_factor6']\n",
"    industry_act_factor1 = df['industry_act_factor1']\n",
"\n",
"    # Term 1: protected_div(pow(pct_chg, std_return_90), cost_95pct)\n",
"    # NOTE(review): np.power with a negative base and fractional exponent\n",
"    # returns NaN, so rows with pct_chg < 0 mostly drop out here — confirm intended.\n",
"    term1_num = np.power(pct_chg, std_return_90)\n",
"    term1 = protected_div_np(term1_num, cost_95pct)\n",
"\n",
"    # Term 2: protected_div(industry_act_factor6, cost_95pct)\n",
"    term2 = protected_div_np(industry_act_factor6, cost_95pct)\n",
"\n",
"    # Term 3: x ** x with x = act_factor6 / cost_95pct / cost_95pct\n",
"    # (base and exponent are the same sub-expression in the GP tree)\n",
"    term3_base_inner = protected_div_np(act_factor6, cost_95pct)\n",
"    term3_base = protected_div_np(term3_base_inner, cost_95pct)\n",
"    term3_exponent_inner = protected_div_np(act_factor6, cost_95pct)\n",
"    term3_exponent = protected_div_np(term3_exponent_inner, cost_95pct)\n",
"    term3 = np.power(term3_base, term3_exponent)\n",
"\n",
"\n",
"    # Sum of the first three terms\n",
"    add_terms = term1 + term2 + term3\n",
"\n",
"    # Term 4: cos(industry_act_factor1)\n",
"    term4 = np.cos(industry_act_factor1)\n",
"\n",
"    # Final factor: (term1 + term2 + term3) - cos(industry_act_factor1)\n",
"    factor4 = add_terms - term4\n",
"\n",
"    return factor4\n",
"\n",
"df['generated_factor'] = calculate_factor_4(df)\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.stats import spearmanr, ttest_1samp\n",
"\n",
"def comprehensive_factor_analysis(df: pd.DataFrame, factor_column: str, target_column: str = 'future_return', date_column: str = 'trade_date', rolling_window: str = 'D', n_deciles: int = 10):\n",
"    \"\"\"\n",
"    Run a full single-factor analysis on one column of ``df`` and print the results.\n",
"\n",
"    Sections: overall Rank IC, per-period Rank IC statistics (mean, std,\n",
"    Sharpe ratio, one-sample t-test), hit ratio, and an n-decile breakdown\n",
"    of the average target return plus each decile's factor-value range.\n",
"\n",
"    Args:\n",
"        df (pd.DataFrame): frame holding the factor, target and date columns.\n",
"        factor_column (str): name of the factor column to analyse.\n",
"        target_column (str): name of the target column, default 'future_return'.\n",
"        date_column (str): name of the date column, default 'trade_date'.\n",
"        rolling_window (str): period alias for grouping ICs ('D' daily, 'W' weekly, ...).\n",
"        n_deciles (int): number of quantile buckets, default 10.\n",
"    \"\"\"\n",
"    # Fail fast (with a message, not an exception) if a required column is missing\n",
"    if factor_column not in df.columns:\n",
"        print(f\"错误: 特征因子列 '{factor_column}' 不存在于 DataFrame 中。\")\n",
"        return\n",
"    if target_column not in df.columns:\n",
"        print(f\"错误: 目标列 '{target_column}' 不存在于 DataFrame 中。\")\n",
"        return\n",
"    if date_column not in df.columns:\n",
"        print(f\"错误: 日期列 '{date_column}' 不存在于 DataFrame 中。\")\n",
"        return\n",
"\n",
"    # Work on a copy with a DatetimeIndex so .to_period() grouping works\n",
"    df_analy = df.copy()\n",
"    df_analy[date_column] = pd.to_datetime(df_analy[date_column])\n",
"    df_analy = df_analy.set_index(date_column)\n",
"\n",
"    # Drop rows where either the factor or the target is NaN\n",
"    df_analy = df_analy.dropna(subset=[factor_column, target_column])\n",
"\n",
"    if len(df_analy) < 2:\n",
"        print(\"警告: 有效数据点太少,无法进行分析。\")\n",
"        return\n",
"\n",
"    print(f\"全面因子分析报告 - 特征因子: '{factor_column}'\")\n",
"    print(\"-\" * 60)\n",
"\n",
"    # 1. Overall Rank IC across the pooled sample\n",
"    overall_rank_ic, overall_p_value = spearmanr(df_analy[factor_column], df_analy[target_column])\n",
"    print(f\"整体 Rank IC: {overall_rank_ic:.4f}\")\n",
"    print(f\"整体 P-value: {overall_p_value:.4f}\")\n",
"    print(\"-\" * 60)\n",
"\n",
"    # 2. Per-period Rank IC (one IC per rolling_window period)\n",
"    print(f\"计算滚动 Rank IC (按 '{rolling_window}' 窗口)...\")\n",
"    rolling_ics = df_analy.groupby(df_analy.index.to_period(rolling_window)).apply(\n",
"        lambda x: spearmanr(x[factor_column], x[target_column])[0] if len(x) >= 2 else np.nan\n",
"    ).dropna()\n",
"\n",
"    if len(rolling_ics) < 2:\n",
"        print(\"警告: 滚动 Rank IC 有效周期太少,无法计算统计量。\")\n",
"    else:\n",
"        # 3. Summary statistics of the per-period ICs\n",
"        ic_mean = rolling_ics.mean()\n",
"        ic_std = rolling_ics.std()\n",
"        ic_sharpe = ic_mean / ic_std if ic_std != 0 else np.nan\n",
"        t_statistic, p_value_t = ttest_1samp(rolling_ics, 0)  # H0: mean IC == 0\n",
"\n",
"        print(f\"滚动 Rank IC 统计量 ({rolling_window}):\")\n",
"        print(f\"  均值: {ic_mean:.4f}\")\n",
"        print(f\"  标准差: {ic_std:.4f}\")\n",
"        print(f\"  夏普比率 (IC Mean / IC Std): {ic_sharpe:.4f}\")\n",
"        print(f\"  T-statistic: {t_statistic:.4f}\")\n",
"        print(f\"  T-statistic P-value: {p_value_t:.4f}\")\n",
"        print(\"-\" * 60)\n",
"\n",
"        # 4. Hit ratio: share of periods with a positive Rank IC\n",
"        hit_ratio = (rolling_ics > 0).sum() / len(rolling_ics)\n",
"        print(f\"Hit Ratio (正向 Rank IC 比例): {hit_ratio:.4f}\")\n",
"        print(\"-\" * 60)\n",
"\n",
"    # 5. Decile analysis over the pooled sample\n",
"    print(f\"因子 {n_deciles} 分位数分析 (按因子值从小到大排序):\")\n",
"    df_analy['decile'] = pd.qcut(df_analy[factor_column], q=n_deciles, labels=False, duplicates='drop')\n",
"    decile_analysis = df_analy.groupby('decile')[target_column].mean().sort_index()\n",
"\n",
"    if len(decile_analysis) > 0:\n",
"        for decile, avg_return in decile_analysis.items():\n",
"            print(f\"  第 {decile + 1} 分位数: 平均 '{target_column}' = {avg_return:.4f}\")\n",
"\n",
"        # Factor-value range of each decile. Positional .iloc lookup avoids the\n",
"        # float-key KeyError that indexing the quantile Series by recomputed\n",
"        # percentile values could raise; values are identical.\n",
"        percentiles = np.linspace(0, 100, n_deciles + 1)\n",
"        factor_percentiles = df_analy[factor_column].quantile(percentiles / 100)\n",
"        print(\"\\n因子值的分位数范围:\")\n",
"        # NOTE(review): if qcut dropped duplicate edges, fewer deciles exist and\n",
"        # these ranges may not line up one-to-one with the buckets above.\n",
"        for i in range(len(decile_analysis)):\n",
"            lower_bound = factor_percentiles.iloc[i]\n",
"            upper_bound = factor_percentiles.iloc[i + 1]\n",
"            print(f\"  第 {i + 1} 分位数: [{lower_bound:.4f}, {upper_bound:.4f}]\")\n",
"    else:\n",
"        print(\"警告: 分位数分析无法执行,可能是因为数据点不足或因子值分布问题。\")\n",
"\n",
"\n",
"    print(\"-\" * 60)\n",
"    print(\"分析完成。\")\n",
"\n",
"comprehensive_factor_analysis(df, factor_column='generated_factor', target_column='future_return', date_column='trade_date', rolling_window='D', n_deciles=10)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "new_trader",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}