1029 lines
53 KiB
Plaintext
1029 lines
53 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "initial_id",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-13T07:26:19.000054Z",
|
||
"start_time": "2025-04-13T07:26:18.895713Z"
|
||
},
|
||
"collapsed": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"e:\\PyProject\\NewStock\\main\\factor\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
# Standard-library imports.
import gc
import os
import sys
import warnings

# Make the project root importable before pulling in project modules.
sys.path.append('../../')
print(os.getcwd())

import pandas as pd

# Project-local helpers for factor construction and data plumbing.
from main.factor.factor import get_rolling_factor, get_simple_factor
from main.utils.factor import read_industry_data
from main.utils.factor_processor import calculate_score
from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data

warnings.filterwarnings("ignore")
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "f1623b04c7a366af",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-13T07:30:48.534271Z",
|
||
"start_time": "2025-04-13T07:26:19.005576Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"daily data\n",
|
||
"daily basic\n",
|
||
"inner merge on ['ts_code', 'trade_date']\n",
|
||
"stk limit\n",
|
||
"left merge on ['ts_code', 'trade_date']\n",
|
||
"money flow\n",
|
||
"left merge on ['ts_code', 'trade_date']\n",
|
||
"cyq perf\n",
|
||
"left merge on ['ts_code', 'trade_date']\n",
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"RangeIndex: 5123740 entries, 0 to 5123739\n",
|
||
"Data columns (total 31 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object \n",
|
||
" 1 trade_date datetime64[ns]\n",
|
||
" 2 open float64 \n",
|
||
" 3 close float64 \n",
|
||
" 4 high float64 \n",
|
||
" 5 low float64 \n",
|
||
" 6 vol float64 \n",
|
||
" 7 pct_chg float64 \n",
|
||
" 8 turnover_rate float64 \n",
|
||
" 9 pe_ttm float64 \n",
|
||
" 10 circ_mv float64 \n",
|
||
" 11 volume_ratio float64 \n",
|
||
" 12 is_st bool \n",
|
||
" 13 up_limit float64 \n",
|
||
" 14 down_limit float64 \n",
|
||
" 15 buy_sm_vol float64 \n",
|
||
" 16 sell_sm_vol float64 \n",
|
||
" 17 buy_lg_vol float64 \n",
|
||
" 18 sell_lg_vol float64 \n",
|
||
" 19 buy_elg_vol float64 \n",
|
||
" 20 sell_elg_vol float64 \n",
|
||
" 21 net_mf_vol float64 \n",
|
||
" 22 his_low float64 \n",
|
||
" 23 his_high float64 \n",
|
||
" 24 cost_5pct float64 \n",
|
||
" 25 cost_15pct float64 \n",
|
||
" 26 cost_50pct float64 \n",
|
||
" 27 cost_85pct float64 \n",
|
||
" 28 cost_95pct float64 \n",
|
||
" 29 weight_avg float64 \n",
|
||
" 30 winner_rate float64 \n",
|
||
"dtypes: bool(1), datetime64[ns](1), float64(28), object(1)\n",
|
||
"memory usage: 1.2+ GB\n",
|
||
"None\n",
|
||
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate']\n",
|
||
"Index(['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol',\n",
|
||
" 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',\n",
|
||
" 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol',\n",
|
||
" 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol',\n",
|
||
" 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
|
||
" 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate',\n",
|
||
" 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol',\n",
|
||
" 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol',\n",
|
||
" 'lg_elg_buy_prop', 'flow_struct_buy_change',\n",
|
||
" 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel',\n",
|
||
" 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy',\n",
|
||
" 'cost_support_15pct_change', 'cat_winner_price_zone',\n",
|
||
" 'flow_chip_consistency', 'profit_taking_vs_absorb', '_is_positive',\n",
|
||
" '_is_negative', 'cat_is_positive', '_pos_returns', '_neg_returns',\n",
|
||
" '_pos_returns_sq', '_neg_returns_sq', 'upside_vol', 'downside_vol',\n",
|
||
" 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate',\n",
|
||
" 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike',\n",
|
||
" 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike',\n",
|
||
" 'vol_std_5', 'atr_14', 'atr_6', 'obv'],\n",
|
||
" dtype='object')\n",
|
||
"Calculating lg_flow_mom_corr_20_60...\n",
|
||
"Finished lg_flow_mom_corr_20_60.\n",
|
||
"Calculating lg_buy_consolidation_20...\n",
|
||
"Finished lg_buy_consolidation_20.\n",
|
||
"Calculating lg_flow_accel...\n",
|
||
"Finished lg_flow_accel.\n",
|
||
"Calculating profit_pressure...\n",
|
||
"Finished profit_pressure.\n",
|
||
"Calculating underwater_resistance...\n",
|
||
"Finished underwater_resistance.\n",
|
||
"Calculating cost_conc_std_20...\n",
|
||
"Finished cost_conc_std_20.\n",
|
||
"Calculating profit_decay_20...\n",
|
||
"Finished profit_decay_20.\n",
|
||
"Calculating vol_amp_loss_20...\n",
|
||
"Finished vol_amp_loss_20.\n",
|
||
"Calculating vol_drop_profit_cnt_5...\n",
|
||
"Finished vol_drop_profit_cnt_5.\n",
|
||
"Calculating lg_flow_vol_interact_20...\n",
|
||
"Finished lg_flow_vol_interact_20.\n",
|
||
"Calculating cost_break_confirm_cnt_5...\n",
|
||
"Finished cost_break_confirm_cnt_5.\n",
|
||
"Calculating atr_norm_channel_pos_14...\n",
|
||
"Finished atr_norm_channel_pos_14.\n",
|
||
"Calculating turnover_diff_skew_20...\n",
|
||
"Finished turnover_diff_skew_20.\n",
|
||
"Calculating lg_sm_flow_diverge_20...\n",
|
||
"Finished lg_sm_flow_diverge_20.\n",
|
||
"Calculating pullback_strong_20_20...\n",
|
||
"Finished pullback_strong_20_20.\n",
|
||
"Calculating vol_wgt_hist_pos_20...\n",
|
||
"Finished vol_wgt_hist_pos_20.\n",
|
||
"Calculating vol_adj_roc_20...\n",
|
||
"Finished vol_adj_roc_20.\n",
|
||
"Calculating intraday_lg_flow_corr_20 (Placeholder - complex implementation)...\n",
|
||
"Finished intraday_lg_flow_corr_20 (Placeholder).\n",
|
||
"Calculating cap_neutral_cost_metric (Placeholder - requires statsmodels)...\n",
|
||
"Finished cap_neutral_cost_metric (Placeholder).\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
# --- Load and inner/left-merge the raw datasets into one daily panel ------
print('daily data')
df = read_and_merge_h5_data('../../data/daily_data.h5', key='daily_data',
                            columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg'],
                            df=None)

print('daily basic')
df = read_and_merge_h5_data('../../data/daily_basic.h5', key='daily_basic',
                            columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',
                                     'is_st'], df=df, join='inner')
df = df[df['trade_date'] >= '2021-01-01']

print('stk limit')
df = read_and_merge_h5_data('../../data/stk_limit.h5', key='stk_limit',
                            columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],
                            df=df)
print('money flow')
df = read_and_merge_h5_data('../../data/money_flow.h5', key='money_flow',
                            columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol',
                                     'sell_lg_vol',
                                     'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],
                            df=df)
print('cyq perf')
df = read_and_merge_h5_data('../../data/cyq_perf.h5', key='cyq_perf',
                            columns=['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',
                                     'cost_50pct',
                                     'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],
                            df=df)
print(df.info())

origin_columns = df.columns.tolist()
origin_columns = [col for col in origin_columns if 'cyq' not in col]
print(origin_columns)


def filter_data(df):
    """Drop ST stocks and non-main-board tickers, keep rows from 2022 on, reset index."""
    # df = df.groupby('trade_date').apply(lambda x: x.nlargest(1000, 'act_factor1'))
    df = df[~df['is_st']]
    df = df[~df['ts_code'].str.endswith('BJ')]    # Beijing Stock Exchange suffix
    df = df[~df['ts_code'].str.startswith('30')]  # ChiNext board
    df = df[~df['ts_code'].str.startswith('68')]  # STAR Market board
    df = df[~df['ts_code'].str.startswith('8')]   # NEEQ/BSE code range
    df = df[df['trade_date'] >= '2022-01-01']
    if 'in_date' in df.columns:
        df = df.drop(columns=['in_date'])
    df = df.reset_index(drop=True)
    return df


gc.collect()

df = filter_data(df)
df, _ = get_rolling_factor(df)
df, _ = get_simple_factor(df)
# Explicit imports instead of `from main.factor.factor import *`, so the cell
# is reproducible on a fresh kernel and the namespace stays predictable.
from main.factor.factor import (
    lg_flow_mom_corr, lg_buy_consolidation, lg_flow_accel, profit_pressure,
    underwater_resistance, cost_conc_std, profit_decay, vol_amp_loss,
    vol_drop_profit_cnt, lg_flow_vol_interact, cost_break_confirm_cnt,
    atr_norm_channel_pos, turnover_diff_skew, lg_sm_flow_diverge,
    pullback_strong, vol_wgt_hist_pos, vol_adj_roc, intraday_lg_flow_corr,
    cap_neutral_cost_metric,
)
lg_flow_mom_corr(df, N=20, M=60)
lg_buy_consolidation(df, N=20)
lg_flow_accel(df)
profit_pressure(df)
underwater_resistance(df)
cost_conc_std(df, N=20)
profit_decay(df, N=20)
vol_amp_loss(df, N=20)
vol_drop_profit_cnt(df, N=20, M=5)
lg_flow_vol_interact(df, N=20)
cost_break_confirm_cnt(df, M=5)
atr_norm_channel_pos(df, N=14)
turnover_diff_skew(df, N=20)
lg_sm_flow_diverge(df, N=20)
pullback_strong(df, N=20, M=20)
vol_wgt_hist_pos(df, N=20)
vol_adj_roc(df, N=20)
intraday_lg_flow_corr(df, N=20)  # Placeholder
cap_neutral_cost_metric(df)  # Placeholder
# hurst_exponent_flow(df, N=60) # Placeholder

# Attach the level-2 industry code per stock.
l2_df = read_and_merge_h5_data('../../data/industry_data.h5', key='industry_data',
                               columns=['ts_code', 'l2_code', 'in_date'],
                               df=None, on=['ts_code'], join='left')
df = merge_with_industry_data(df, l2_df)
df = df.rename(columns={'l2_code': 'cat_l2_code'})
# df = df.merge(index_data, on='trade_date', how='left')

days = 2
df = df.sort_values(by=['ts_code', 'trade_date'])
# Next-day open per stock — the entry price shared by both forward returns.
# Computed once instead of three separate groupby/shift passes.
open_next = df.groupby('ts_code')['open'].shift(-1)
# df['future_return'] = df.groupby('ts_code', group_keys=False)['close'].apply(lambda x: x.shift(-days) / x - 1)
df['future_return'] = (df.groupby('ts_code')['close'].shift(-days) - open_next) / open_next
# df['future_return'] = df.groupby('ts_code')['pct_chg'].shift(-1)
df['future_return2'] = (df.groupby('ts_code')['close'].shift(-1) - open_next) / open_next

# Forward-looking rolling volatility of daily returns over `days` sessions.
df['future_volatility'] = (
    df.groupby('ts_code')['pct_chg']
    .transform(lambda x: x.rolling(days).std().shift(-days))
)
df['future_score'] = calculate_score(df, days=days, lambda_param=0.3)


def select_pre_zt_stocks_dynamic(stock_df):
    """Per trade date, keep (up to) the 1000 stocks with the largest 5-day return."""
    def select_stocks(group):
        return group.nlargest(1000, 'return_5')  # fall back to the largest available set if fewer qualify

    stock_df = stock_df.groupby('trade_date', group_keys=False).apply(select_stocks)
    return stock_df


gc.collect()

# df = select_pre_zt_stocks_dynamic(df[(df['trade_date'] >= '2022-01-01') & (df['trade_date'] <= '2029-04-07')])

industry_df = read_industry_data('../../data/sw_daily.h5')
df = df.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "1c1dd3d6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 
'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_buy_consolidation_20', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'intraday_lg_flow_corr_20', 'cap_neutral_cost_metric', 'in_date', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
# Start from every column, then drop identifiers, labels/targets and
# excluded columns. The original first line was a tautology
# (`col in df.columns` for col taken from df.columns) — it is simply the
# full column list.
feature_columns = df.columns.tolist()
feature_columns = [col for col in feature_columns if col not in ['trade_date',
                                                                 'ts_code',
                                                                 'label']]
# Single pass replacing seven equivalent substring-filter passes:
# targets/leakage ('future', 'label', 'score', 'gen') and columns excluded
# from modelling ('is_st', 'pe_ttm', 'cat_l2_code').
excluded_substrings = ('future', 'label', 'score', 'gen',
                       'is_st', 'pe_ttm', 'cat_l2_code')
feature_columns = [col for col in feature_columns
                   if not any(sub in col for sub in excluded_substrings)]
# feature_columns = [col for col in feature_columns if col not in origin_columns]
# Underscore-prefixed columns are internal scratch columns.
feature_columns = [col for col in feature_columns if not col.startswith('_')]
# feature_columns = [col for col in feature_columns if col not in ['ts_code', 'trade_date', 'vol_std_5', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_007', 'consecutive_up_limit', 'mv_volatility', 'volume_growth', 'mv_growth', 'arbr']]

print(feature_columns)
# Numeric subset of the features, preserving DataFrame column order.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
numeric_columns = [col for col in numeric_columns if col in feature_columns]
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "2c60c1ea",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"每个特征列中的 NaN 值数量(字典形式):\n",
|
||
"ts_code: 0\n",
|
||
"trade_date: 0\n",
|
||
"open: 0\n",
|
||
"close: 0\n",
|
||
"high: 0\n",
|
||
"low: 0\n",
|
||
"vol: 0\n",
|
||
"pct_chg: 0\n",
|
||
"turnover_rate: 0\n",
|
||
"pe_ttm: 499616\n",
|
||
"circ_mv: 0\n",
|
||
"volume_ratio: 791\n",
|
||
"is_st: 0\n",
|
||
"up_limit: 0\n",
|
||
"down_limit: 0\n",
|
||
"buy_sm_vol: 7\n",
|
||
"sell_sm_vol: 7\n",
|
||
"buy_lg_vol: 7\n",
|
||
"sell_lg_vol: 7\n",
|
||
"buy_elg_vol: 7\n",
|
||
"sell_elg_vol: 7\n",
|
||
"net_mf_vol: 7\n",
|
||
"his_low: 24695\n",
|
||
"his_high: 24695\n",
|
||
"cost_5pct: 24695\n",
|
||
"cost_15pct: 24695\n",
|
||
"cost_50pct: 24695\n",
|
||
"cost_85pct: 24695\n",
|
||
"cost_95pct: 24695\n",
|
||
"weight_avg: 24695\n",
|
||
"winner_rate: 24695\n",
|
||
"lg_elg_net_buy_vol: 7\n",
|
||
"flow_lg_elg_intensity: 7\n",
|
||
"sm_net_buy_vol: 7\n",
|
||
"flow_divergence_diff: 7\n",
|
||
"flow_divergence_ratio: 7\n",
|
||
"total_buy_vol: 7\n",
|
||
"lg_elg_buy_prop: 7\n",
|
||
"flow_struct_buy_change: 3287\n",
|
||
"lg_elg_net_buy_vol_change: 3287\n",
|
||
"flow_lg_elg_accel: 6567\n",
|
||
"chip_concentration_range: 24695\n",
|
||
"chip_skewness: 24695\n",
|
||
"floating_chip_proxy: 24695\n",
|
||
"cost_support_15pct_change: 27855\n",
|
||
"cat_winner_price_zone: 0\n",
|
||
"flow_chip_consistency: 7\n",
|
||
"profit_taking_vs_absorb: 7\n",
|
||
"cat_is_positive: 0\n",
|
||
"upside_vol: 29581\n",
|
||
"downside_vol: 29655\n",
|
||
"vol_ratio: 0\n",
|
||
"return_skew: 13096\n",
|
||
"return_kurtosis: 13096\n",
|
||
"volume_change_rate: 29466\n",
|
||
"cat_volume_breakout: 0\n",
|
||
"turnover_deviation: 6548\n",
|
||
"cat_turnover_spike: 0\n",
|
||
"avg_volume_ratio: 7341\n",
|
||
"cat_volume_ratio_breakout: 0\n",
|
||
"vol_spike: 62074\n",
|
||
"vol_std_5: 16370\n",
|
||
"atr_14: 45836\n",
|
||
"atr_6: 19644\n",
|
||
"obv: 0\n",
|
||
"maobv_6: 16370\n",
|
||
"rsi_3: 9822\n",
|
||
"return_5: 16370\n",
|
||
"return_20: 65315\n",
|
||
"std_return_5: 16370\n",
|
||
"std_return_90: 291770\n",
|
||
"std_return_90_2: 323906\n",
|
||
"act_factor1: 16370\n",
|
||
"act_factor2: 42562\n",
|
||
"act_factor3: 65315\n",
|
||
"act_factor4: 194886\n",
|
||
"rank_act_factor1: 16370\n",
|
||
"rank_act_factor2: 42562\n",
|
||
"rank_act_factor3: 65315\n",
|
||
"log_circ_mv: 0\n",
|
||
"cov: 13096\n",
|
||
"delta_cov: 29466\n",
|
||
"alpha_22_improved: 62074\n",
|
||
"alpha_003: 0\n",
|
||
"alpha_007: 13120\n",
|
||
"alpha_013: 62074\n",
|
||
"cat_up_limit: 0\n",
|
||
"cat_down_limit: 0\n",
|
||
"up_limit_count_10d: 0\n",
|
||
"down_limit_count_10d: 0\n",
|
||
"consecutive_up_limit: 0\n",
|
||
"vol_break: 0\n",
|
||
"weight_roc5: 40531\n",
|
||
"price_cost_divergence: 93280\n",
|
||
"smallcap_concentration: 24695\n",
|
||
"cost_stability: 85077\n",
|
||
"high_cost_break_days: 13096\n",
|
||
"liquidity_risk: 53215\n",
|
||
"turnover_std: 62074\n",
|
||
"mv_volatility: 62074\n",
|
||
"volume_growth: 65315\n",
|
||
"mv_growth: 65315\n",
|
||
"arbr: 9822\n",
|
||
"momentum_factor: 29466\n",
|
||
"resonance_factor: 791\n",
|
||
"log_close: 0\n",
|
||
"cat_vol_spike: 0\n",
|
||
"up: 0\n",
|
||
"down: 0\n",
|
||
"obv_maobv_6: 16370\n",
|
||
"std_return_5_over_std_return_90: 291770\n",
|
||
"std_return_90_minus_std_return_90_2: 323906\n",
|
||
"cat_af2: 0\n",
|
||
"cat_af3: 0\n",
|
||
"cat_af4: 0\n",
|
||
"act_factor5: 194886\n",
|
||
"act_factor6: 42562\n",
|
||
"active_buy_volume_large: 13\n",
|
||
"active_buy_volume_big: 79\n",
|
||
"active_buy_volume_small: 7\n",
|
||
"buy_lg_vol_minus_sell_lg_vol: 8\n",
|
||
"buy_elg_vol_minus_sell_elg_vol: 69\n",
|
||
"ctrl_strength: 24695\n",
|
||
"low_cost_dev: 24695\n",
|
||
"asymmetry: 24701\n",
|
||
"lock_factor: 24695\n",
|
||
"cat_vol_break: 0\n",
|
||
"cost_atr_adj: 69060\n",
|
||
"cat_golden_resonance: 0\n",
|
||
"mv_turnover_ratio: 0\n",
|
||
"mv_adjusted_volume: 0\n",
|
||
"mv_weighted_turnover: 0\n",
|
||
"nonlinear_mv_volume: 0\n",
|
||
"mv_volume_ratio: 791\n",
|
||
"mv_momentum: 791\n",
|
||
"lg_flow_mom_corr_20_60: 1186\n",
|
||
"lg_buy_consolidation_20: 1950902\n",
|
||
"lg_flow_accel: 6567\n",
|
||
"profit_pressure: 24695\n",
|
||
"underwater_resistance: 24695\n",
|
||
"cost_conc_std_20: 29466\n",
|
||
"profit_decay_20: 0\n",
|
||
"vol_amp_loss_20: 53215\n",
|
||
"vol_drop_profit_cnt_5: 0\n",
|
||
"lg_flow_vol_interact_20: 29466\n",
|
||
"cost_break_confirm_cnt_5: 0\n",
|
||
"atr_norm_channel_pos_14: 0\n",
|
||
"turnover_diff_skew_20: 32740\n",
|
||
"lg_sm_flow_diverge_20: 29466\n",
|
||
"pullback_strong_20_20: 0\n",
|
||
"vol_wgt_hist_pos_20: 0\n",
|
||
"vol_adj_roc_20: 0\n",
|
||
"intraday_lg_flow_corr_20: 2431461\n",
|
||
"cap_neutral_cost_metric: 2431461\n",
|
||
"cat_l2_code: 290\n",
|
||
"in_date: 65486\n",
|
||
"future_return: 6548\n",
|
||
"future_return2: 3274\n",
|
||
"future_volatility: 6548\n",
|
||
"score: 6548\n",
|
||
"future_score: 6548\n",
|
||
"industry_obv: 11272\n",
|
||
"industry_return_5: 11272\n",
|
||
"industry_return_20: 11272\n",
|
||
"industry__ema_5: 11272\n",
|
||
"industry__ema_13: 11272\n",
|
||
"industry__ema_20: 11272\n",
|
||
"industry__ema_60: 11272\n",
|
||
"industry_act_factor1: 11272\n",
|
||
"industry_act_factor2: 11272\n",
|
||
"industry_act_factor3: 11272\n",
|
||
"industry_act_factor4: 11272\n",
|
||
"industry_act_factor5: 11272\n",
|
||
"industry_act_factor6: 11272\n",
|
||
"industry_rank_act_factor1: 11272\n",
|
||
"industry_rank_act_factor2: 11272\n",
|
||
"industry_rank_act_factor3: 11272\n",
|
||
"industry_return_5_percentile: 11272\n",
|
||
"industry_return_20_percentile: 11272\n",
|
||
"['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_std_5', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'std_return_5', 'act_factor1', 'rank_act_factor1', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_003', 'alpha_007', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'smallcap_concentration', 'high_cost_break_days', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'cat_af2', 'cat_af3', 'cat_af4', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 
'atr_norm_channel_pos_14', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
def count_nan_and_inf_per_feature(df: pd.DataFrame) -> pd.Series:
    """
    Count missing (NaN) values in every column of a DataFrame.

    Note: despite the name, Inf values are currently NOT counted — that
    branch is left commented out below. (The previous docstring claimed a
    dict of 'NaN_Count'/'Inf_Count' Series, which did not match the actual
    return value.)

    Args:
        df: the pandas DataFrame to analyse.

    Returns:
        A pandas Series indexed by column name whose values are the number
        of NaN entries in that column.
    """
    nan_counts = df.isna().sum()
    # inf_counts = np.isinf(df).sum()
    return nan_counts
|
||
"\n",
|
||
"\n",
|
||
# Report per-column NaN counts and prune overly sparse features in place.
nan_counts_series = count_nan_and_inf_per_feature(df)

# As a plain dictionary, if preferred:
nan_counts_dict = nan_counts_series.to_dict()
print("\n每个特征列中的 NaN 值数量(字典形式):")
for k, v in nan_counts_dict.items():
    print(f'{k}: {v}')
    # Features with more than 30k missing rows are removed from the inputs.
    too_sparse = v > 30000 and k in feature_columns
    if too_sparse:
        feature_columns.remove(k)
print(feature_columns)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "e088bd8a357e815a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-13T15:39:47.461434Z",
|
||
"start_time": "2025-04-13T15:39:44.369664Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"gen\tnevals\tavg \tstd \tmin\tmax \n",
|
||
"0 \t64 \t-0.387605\t0.492269\t-1 \t0.84339\n",
|
||
"1 \t52 \t-0.0280574\t0.328787\t-1 \t0.84339\n",
|
||
"2 \t56 \t-0.0442643\t0.498012\t-1 \t0.84339\n",
|
||
"3 \t50 \t0.0843881 \t0.506873\t-1 \t0.84339\n",
|
||
"4 \t56 \t0.128797 \t0.586781\t-1 \t0.84339\n",
|
||
"5 \t52 \t0.107366 \t0.586957\t-1 \t0.918103\n",
|
||
"6 \t52 \t0.0602483 \t0.666345\t-1 \t0.918103\n",
|
||
"7 \t54 \t0.177717 \t0.561644\t-1 \t0.918103\n",
|
||
"8 \t57 \t0.206183 \t0.620791\t-1 \t0.957887\n",
|
||
"9 \t51 \t0.253306 \t0.667259\t-1 \t1.07875 \n",
|
||
"10 \t53 \t0.19914 \t0.681541\t-1 \t1.14356 \n",
|
||
"11 \t54 \t0.173093 \t0.752007\t-1 \t1.24408 \n",
|
||
"12 \t53 \t0.429303 \t0.636249\t-1.07606\t1.24408 \n",
|
||
"13 \t46 \t0.443469 \t0.754764\t-1.17052\t1.24408 \n",
|
||
"14 \t57 \t0.412168 \t0.719715\t-1.2066 \t1.24408 \n",
|
||
"15 \t47 \t0.420095 \t0.833547\t-1.20899\t1.25608 \n",
|
||
"16 \t46 \t0.516075 \t0.916347\t-1.16556\t1.25765 \n",
|
||
"17 \t48 \t0.52129 \t0.872883\t-1 \t1.30663 \n",
|
||
"18 \t53 \t0.530992 \t0.923366\t-1 \t1.3677 \n",
|
||
"19 \t54 \t0.569299 \t0.861833\t-1.39138\t1.3677 \n",
|
||
"20 \t51 \t0.538589 \t0.883032\t-1.12472\t1.3677 \n",
|
||
"21 \t49 \t0.684813 \t0.874059\t-1 \t1.3677 \n",
|
||
"22 \t46 \t0.659823 \t0.86879 \t-1.17051\t1.3677 \n",
|
||
"23 \t42 \t0.678971 \t0.886044\t-1.39138\t1.3677 \n",
|
||
"24 \t55 \t0.639381 \t0.905808\t-1.39138\t1.37645 \n",
|
||
"25 \t42 \t0.721136 \t0.915513\t-1.30205\t1.39372 \n",
|
||
"26 \t56 \t0.695918 \t0.849837\t-1.0437 \t1.39372 \n",
|
||
"27 \t56 \t0.465007 \t0.934313\t-1 \t1.39372 \n",
|
||
"28 \t51 \t0.714563 \t0.88635 \t-1.13547\t1.43745 \n",
|
||
"29 \t49 \t0.687478 \t0.84568 \t-1 \t1.43745 \n",
|
||
"30 \t50 \t0.646657 \t0.835957\t-1 \t1.43745 \n",
|
||
"31 \t49 \t0.615978 \t0.939622\t-1.04846\t1.43745 \n",
|
||
"32 \t49 \t0.654171 \t0.973861\t-1.12771\t1.43745 \n",
|
||
"\n",
|
||
"Best Factors Found:\n",
|
||
"Fitness: 1.4375, Factor 1: protected_div_torch(mul(protected_div_torch(add(return_kurtosis, profit_pressure), mul(cost_85pct, buy_elg_vol_minus_sell_elg_vol)), protected_div_torch(cost_break_confirm_cnt_5, pow(cos(lg_flow_vol_interact_20), cos(chip_concentration_range)))), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
|
||
"Fitness: 1.3937, Factor 2: protected_div_torch(mul(protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
|
||
"Fitness: 1.3843, Factor 3: protected_div_torch(mul(protected_div_torch(protected_div_torch(profit_pressure, pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from deap import creator, gp, tools, base, algorithms\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"import torch\n",
|
||
"from scipy.stats import spearmanr\n",
|
||
"import operator\n",
|
||
"\n",
|
||
# Division guarded against zero denominators (PyTorch version).
def protected_div_torch(left, right):
    """Element-wise left / right; positions where right == 0 yield 1 instead."""
    quotient = left / right              # may hold inf/nan where right == 0; masked below
    fallback = torch.ones_like(left)
    return torch.where(right != 0, quotient, fallback)
|
||
"\n",
|
||
def generate_deap_factors_pytorch_v2(df: pd.DataFrame, numeric_columns: list, target_column: str = 'future_return', date_column: str = 'trade_date', params: dict = None, random_state: int = 42):
    """
    Generate new factors with genetic programming (DEAP), evaluating candidate
    expressions with PyTorch operators and scoring them by the Sharpe ratio of
    their daily Rank IC against the target column.

    Args:
        df (pd.DataFrame): frame holding the factor columns and the target column.
        numeric_columns (list): numeric factor column names used as GP terminals.
        target_column (str): target column name, default 'future_return'.
        date_column (str): trade-date column used to group daily ICs.
        params (dict): evolution-parameter overrides (population_size, generations,
            crossover_probability, mutation_probability, tournament_size,
            hall_of_fame_size).
        random_state (int): seed for numpy and torch, for reproducibility.

    Returns:
        tuple: (hall_of_fame, stats) — the best individuals and the DEAP
        statistics object.
    """
    if params is None:
        params = {}

    # Seed both RNG sources so repeated runs are comparable.
    np.random.seed(random_state)
    torch.manual_seed(random_state)

    # 1. Primitive set built from PyTorch operators.
    pset_torch = gp.PrimitiveSet("PYTORCH", arity=len(numeric_columns))
    pset_torch.addPrimitive(torch.add, 2)
    pset_torch.addPrimitive(torch.sub, 2)
    pset_torch.addPrimitive(torch.mul, 2)
    pset_torch.addPrimitive(protected_div_torch, 2)  # division guarded against /0
    pset_torch.addPrimitive(torch.sin, 1)
    pset_torch.addPrimitive(torch.cos, 1)
    pset_torch.addPrimitive(torch.pow, 2)  # x ** y; may yield NaN for negative bases

    # Expose the factor columns as named terminals (ARG0 -> first column, ...).
    pset_torch.renameArguments(**{f"ARG{i}": col for i, col in enumerate(numeric_columns)})

    # 2. Fitness and individual classes. creator.create registers classes in a
    #    process-global namespace: calling this function twice in one kernel
    #    would overwrite them and emit RuntimeWarnings, so guard the creation.
    if not hasattr(creator, "FitnessMax"):
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # maximize IC Sharpe
    if not hasattr(creator, "Individual"):
        creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)

    # 3. Toolbox.
    toolbox = base.Toolbox()
    toolbox.register("expr_torch", gp.genHalfAndHalf, pset=pset_torch, min_=1, max_=3)  # min_/max_ bound expression complexity
    toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr_torch)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("compile_torch", gp.compile, pset=pset_torch)  # compile tree -> callable

    # Tensor data for all dates/stocks, moved to GPU when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_tensor_all = torch.from_numpy(df[numeric_columns].values).float().to(device)
    dates_all = df[date_column].values

    # Hoisted out of the fitness function: the target never changes, so convert
    # it to a float32 numpy array once instead of on every evaluation.
    target_np_all = df[target_column].values.astype(np.float32)

    # 4. Fitness function: daily Rank IC Sharpe ratio.
    def evaluate_torch_cuda_ic_sharpe(individual, data_tensor_all, target_np_all, dates_all):
        # Compile the expression tree into an executable PyTorch function.
        func_torch = toolbox.compile_torch(expr=individual)

        try:
            # Evaluate the factor for all dates/stocks in one shot; each column
            # is passed as a separate (N, 1) argument tensor.
            factor_values_tensor = func_torch(*torch.split(data_tensor_all, 1, dim=1))
            if factor_values_tensor.ndim > 1 and factor_values_tensor.shape[1] != 1:
                # Output of shape (N, M) with M > 1 has no defined aggregation here.
                print(f"警告: 因子表达式输出张量维度为 {factor_values_tensor.shape},期望 (N, 1)。")
                return (-1.0,)
            factor_values_tensor = factor_values_tensor.flatten()

            factor_values_np = factor_values_tensor.cpu().numpy()

            # Temporary frame so daily ICs can be computed with a groupby.
            temp_df = pd.DataFrame({
                'date': dates_all,
                'factor_value': factor_values_np,
                'target_value': target_np_all
            })

            # Daily Rank IC; groups with fewer than 2 valid points yield NaN
            # and are dropped.
            daily_ics = temp_df.groupby('date').apply(
                lambda x: spearmanr(x['factor_value'], x['target_value'])[0]
                if len(x) >= 2 and x['factor_value'].notna().sum() >= 2 and x['target_value'].notna().sum() >= 2
                else np.nan
            ).dropna()

            # Need a minimal number of valid daily ICs for a meaningful Sharpe.
            if len(daily_ics) < 5:
                return (-1.0,)

            ic_mean = daily_ics.mean()
            ic_std = daily_ics.std()

            # Zero std (constant daily IC) is rewarded only when the mean is positive.
            if ic_std == 0:
                ic_sharpe = ic_mean * 1e6 if ic_mean > 0 else -1.0
            else:
                ic_sharpe = ic_mean / ic_std

            # NaN Sharpe (e.g. from NaN mean/std) maps to a penalty fitness.
            return (ic_sharpe if not np.isnan(ic_sharpe) else -1.0,)

        except (ValueError, TypeError, ZeroDivisionError, RuntimeError) as e:
            # Report the failing individual for debugging, then penalize it.
            print(f"Error during evaluation for individual {individual}: {e}")
            return (-1.0,)

    toolbox.register("evaluate", evaluate_torch_cuda_ic_sharpe, data_tensor_all=data_tensor_all, target_np_all=target_np_all, dates_all=dates_all)
    toolbox.register("select", tools.selTournament, tournsize=params.get('tournament_size', 3))
    toolbox.register("mate", gp.cxOnePointLeafBiased, termpb=0.2)
    toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_torch, pset=pset_torch)

    # Cap tree height on crossover and mutation to prevent bloat.
    MAX_TREE_DEPTH = 5
    toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))
    toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))

    # 5. Evolution parameters (with exploratory defaults).
    population_size = params.get('population_size', 100)
    generations = params.get('generations', 10)
    crossover_probability = params.get('crossover_probability', 0.7)
    mutation_probability = params.get('mutation_probability', 0.3)

    # 6. Initial population, hall of fame and per-generation statistics.
    pop = toolbox.population(n=population_size)
    hof = tools.HallOfFame(params.get('hall_of_fame_size', 5))
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # 7. Run the evolutionary loop.
    algorithms.eaSimple(pop, toolbox, cxpb=crossover_probability, mutpb=mutation_probability, ngen=generations,
                        stats=stats, halloffame=hof, verbose=True)

    # 8. Return the best expressions and the statistics object.
    return hof, stats
|
||
"\n",
|
||
# GP run configuration: small population, many generations, mild exploration.
params = {
    'population_size': 64,
    'generations': 32,
    'crossover_probability': 0.7,
    'mutation_probability': 0.3,
    'tournament_size': 4,
    'hall_of_fame_size': 3
}

# Evolve factor expressions on a copy so `df` itself is never mutated.
best_factors_hof, stats = generate_deap_factors_pytorch_v2(df.copy(), numeric_columns, params=params)

print("\nBest Factors Found:")
for i, ind in enumerate(best_factors_hof):
    # First (and only) fitness component is the daily-IC Sharpe ratio.
    fitness_value = ind.fitness.values[0]
    print(f"Fitness: {fitness_value:.4f}, Factor {i+1}: {ind}")
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "a0b3d7551ef0c81f",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-13T15:39:47.502867Z",
|
||
"start_time": "2025-04-13T15:39:47.461434Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"全面因子分析报告 - 特征因子: 'generated_factor'\n",
|
||
"------------------------------------------------------------\n",
|
||
"整体 Rank IC: 0.0817\n",
|
||
"整体 P-value: 0.0000\n",
|
||
"------------------------------------------------------------\n",
|
||
"计算滚动 Rank IC (按 'D' 窗口)...\n",
|
||
"滚动 Rank IC 统计量 (D):\n",
|
||
" 均值: 0.0124\n",
|
||
" 标准差: 0.2330\n",
|
||
" 夏普比率 (IC Mean / IC Std): 0.0531\n",
|
||
" T-statistic: 1.4577\n",
|
||
" T-statistic P-value: 0.1453\n",
|
||
"------------------------------------------------------------\n",
|
||
"Hit Ratio (正向 Rank IC 比例): 0.5060\n",
|
||
"------------------------------------------------------------\n",
|
||
"因子 10 分位数分析 (按因子值从小到大排序):\n",
|
||
" 第 1 分位数: 平均 'future_return' = -0.0004\n",
|
||
" 第 2 分位数: 平均 'future_return' = -0.0008\n",
|
||
" 第 3 分位数: 平均 'future_return' = -0.0004\n",
|
||
" 第 4 分位数: 平均 'future_return' = 0.0005\n",
|
||
" 第 5 分位数: 平均 'future_return' = 0.0007\n",
|
||
" 第 6 分位数: 平均 'future_return' = 0.0015\n",
|
||
" 第 7 分位数: 平均 'future_return' = 0.0021\n",
|
||
" 第 8 分位数: 平均 'future_return' = 0.0033\n",
|
||
" 第 9 分位数: 平均 'future_return' = 0.0054\n",
|
||
" 第 10 分位数: 平均 'future_return' = 0.0135\n",
|
||
"\n",
|
||
"因子值的分位数范围:\n",
|
||
" 第 1 分位数: [-1.0490, 0.0581]\n",
|
||
" 第 2 分位数: [0.0581, 0.1051]\n",
|
||
" 第 3 分位数: [0.1051, 0.1458]\n",
|
||
" 第 4 分位数: [0.1458, 0.1881]\n",
|
||
" 第 5 分位数: [0.1881, 0.2354]\n",
|
||
" 第 6 分位数: [0.2354, 0.2909]\n",
|
||
" 第 7 分位数: [0.2909, 0.3594]\n",
|
||
" 第 8 分位数: [0.3594, 0.4505]\n",
|
||
" 第 9 分位数: [0.4505, 0.5880]\n",
|
||
" 第 10 分位数: [0.5880, 1.9782]\n",
|
||
"------------------------------------------------------------\n",
|
||
"分析完成。\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import torch\n",
|
||
"\n",
|
||
# Column holding the forward return used as the prediction target.
target_column = 'future_return'

# Re-declared here so this cell runs standalone: the same protected division
# used when the factors were evolved (1.0 substituted at zero denominators).
def protected_div_torch(left, right):
    """Element-wise ``left / right``; positions where ``right`` is zero get 1.0."""
    denominator_nonzero = right != 0
    return torch.where(denominator_nonzero, left / right, torch.ones_like(left))
|
||
"\n",
|
||
def protected_div_np(left, right):
    """Safe element-wise division: positions with a zero denominator yield NaN.

    Note the deliberate asymmetry with ``protected_div_torch`` (which returns
    1.0): NaN lets the downstream analysis drop undefined ratios.

    Args:
        left: numerator (array-like).
        right: denominator (array-like, broadcastable against ``left``).

    Returns:
        np.ndarray: float array of quotients, NaN where ``right`` is zero.
    """
    left = np.asarray(left, dtype=float)
    right = np.asarray(right, dtype=float)
    # left / right is evaluated for every element before np.where selects, so
    # silence the spurious divide-by-zero / invalid RuntimeWarnings numpy
    # would otherwise emit for the discarded positions.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(right != 0, left / right, np.nan)
|
||
"\n",
|
||
def calculate_factor_4(df: pd.DataFrame) -> pd.Series:
    """
    Compute the GP-discovered factor:
    sub(add(add(protected_div_torch(pow(pct_chg, std_return_90), cost_95pct), protected_div_torch(industry_act_factor6, cost_95pct)), pow(protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct), protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct))), cos(industry_act_factor1)).

    Args:
        df (pd.DataFrame): frame containing pct_chg, std_return_90, cost_95pct,
            industry_act_factor6, act_factor6 and industry_act_factor1 columns.

    Returns:
        pd.Series: the computed factor values (NaN where undefined).
    """
    pct_chg = df['pct_chg']
    std_return_90 = df['std_return_90']
    cost_95pct = df['cost_95pct']
    industry_act_factor6 = df['industry_act_factor6']
    act_factor6 = df['act_factor6']
    industry_act_factor1 = df['industry_act_factor1']

    # Term 1: (pct_chg ** std_return_90) / cost_95pct
    # NOTE(review): negative pct_chg with a fractional exponent yields NaN,
    # which downstream dropna() discards.
    term1 = protected_div_np(np.power(pct_chg, std_return_90), cost_95pct)

    # Term 2: industry_act_factor6 / cost_95pct
    term2 = protected_div_np(industry_act_factor6, cost_95pct)

    # Term 3: x ** x, with x = act_factor6 / cost_95pct / cost_95pct.
    # The GP tree repeats the identical sub-expression for base and exponent;
    # compute it once instead of running the division chain twice.
    term3_operand = protected_div_np(protected_div_np(act_factor6, cost_95pct), cost_95pct)
    term3 = np.power(term3_operand, term3_operand)

    # Sum of the first three terms.
    add_terms = term1 + term2 + term3

    # Term 4: cos(industry_act_factor1)
    term4 = np.cos(industry_act_factor1)

    # Final factor: sum minus the cosine term.
    return add_terms - term4

df['generated_factor'] = calculate_factor_4(df)
|
||
"\n",
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"from scipy.stats import spearmanr, ttest_1samp\n",
|
||
"\n",
|
||
def comprehensive_factor_analysis(df: pd.DataFrame, factor_column: str, target_column: str = 'future_return', date_column: str = 'trade_date', rolling_window: str = 'D', n_deciles: int = 10):
    """
    Print a full diagnostic report for one factor column: overall Rank IC,
    rolling Rank IC statistics, hit ratio, and a decile analysis.

    Args:
        df (pd.DataFrame): frame with the factor and target columns.
        factor_column (str): column to analyse.
        target_column (str): target column, default 'future_return'.
        date_column (str): trade-date column, default 'trade_date'.
        rolling_window (str): period alias for the rolling Rank IC
            (e.g. 'D' daily, 'W' weekly).
        n_deciles (int): number of quantile buckets, default 10.

    Returns:
        None: all results are printed.
    """
    if factor_column not in df.columns:
        print(f"错误: 特征因子列 '{factor_column}' 不存在于 DataFrame 中。")
        return
    if target_column not in df.columns:
        print(f"错误: 目标列 '{target_column}' 不存在于 DataFrame 中。")
        return
    if date_column not in df.columns:
        print(f"错误: 日期列 '{date_column}' 不存在于 DataFrame 中。")
        return

    # Work on a copy with a datetime index; the caller's frame is not mutated.
    df_analy = df.copy()
    df_analy[date_column] = pd.to_datetime(df_analy[date_column])
    df_analy = df_analy.set_index(date_column)

    # Drop rows where either the factor or the target is NaN.
    df_analy = df_analy.dropna(subset=[factor_column, target_column])

    if len(df_analy) < 2:
        print("警告: 有效数据点太少,无法进行分析。")
        return

    print(f"全面因子分析报告 - 特征因子: '{factor_column}'")
    print("-" * 60)

    # 1. Overall Rank IC over the pooled sample.
    overall_rank_ic, overall_p_value = spearmanr(df_analy[factor_column], df_analy[target_column])
    print(f"整体 Rank IC: {overall_rank_ic:.4f}")
    print(f"整体 P-value: {overall_p_value:.4f}")
    print("-" * 60)

    # 2. Rolling Rank IC per period.
    print(f"计算滚动 Rank IC (按 '{rolling_window}' 窗口)...")
    rolling_ics = df_analy.groupby(df_analy.index.to_period(rolling_window)).apply(
        lambda x: spearmanr(x[factor_column], x[target_column])[0] if len(x) >= 2 else np.nan
    ).dropna()

    if len(rolling_ics) < 2:
        print("警告: 滚动 Rank IC 有效周期太少,无法计算统计量。")
    else:
        # 3. Rolling-IC statistics.
        ic_mean = rolling_ics.mean()
        ic_std = rolling_ics.std()
        ic_sharpe = ic_mean / ic_std if ic_std != 0 else np.nan
        t_statistic, p_value_t = ttest_1samp(rolling_ics, 0)  # is the mean IC significantly non-zero?

        print(f"滚动 Rank IC 统计量 ({rolling_window}):")
        print(f" 均值: {ic_mean:.4f}")
        print(f" 标准差: {ic_std:.4f}")
        print(f" 夏普比率 (IC Mean / IC Std): {ic_sharpe:.4f}")
        print(f" T-statistic: {t_statistic:.4f}")
        print(f" T-statistic P-value: {p_value_t:.4f}")
        print("-" * 60)

        # 4. Hit ratio: share of periods with a positive Rank IC.
        hit_ratio = (rolling_ics > 0).sum() / len(rolling_ics)
        print(f"Hit Ratio (正向 Rank IC 比例): {hit_ratio:.4f}")
        print("-" * 60)

    # 5. Decile analysis over the whole sample.
    print(f"因子 {n_deciles} 分位数分析 (按因子值从小到大排序):")
    df_analy['decile'] = pd.qcut(df_analy[factor_column], q=n_deciles, labels=False, duplicates='drop')
    decile_analysis = df_analy.groupby('decile')[target_column].mean().sort_index()

    if len(decile_analysis) > 0:
        for decile, avg_return in decile_analysis.items():
            print(f" 第 {decile + 1} 分位数: 平均 '{target_column}' = {avg_return:.4f}")

        # Factor-value range per decile. Index the quantile Series positionally
        # (.iloc): the previous label-based lookup factor_percentiles[p / 100]
        # relied on exact float label matches and could raise KeyError.
        percentiles = np.linspace(0, 100, n_deciles + 1)
        factor_percentiles = df_analy[factor_column].quantile(percentiles / 100)
        print("\n因子值的分位数范围:")
        for i in range(len(decile_analysis)):  # only deciles that actually exist
            lower_bound = factor_percentiles.iloc[i]
            upper_bound = factor_percentiles.iloc[i + 1]
            print(f" 第 {i + 1} 分位数: [{lower_bound:.4f}, {upper_bound:.4f}]")
    else:
        print("警告: 分位数分析无法执行,可能是因为数据点不足或因子值分布问题。")


    print("-" * 60)
    print("分析完成。")
|
||
"\n",
|
||
"comprehensive_factor_analysis(df, factor_column='generated_factor', target_column='future_return', date_column='trade_date', rolling_window='D', n_deciles=10)\n"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "new_trader",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|