Classify2

This commit is contained in:
liaozhaorun
2025-05-13 15:30:06 +08:00
parent 791c84aba6
commit a4b05bb62f
20 changed files with 10737 additions and 7456 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"id": "79a7758178bafdd3",
"metadata": {
"ExecuteTime": {
@@ -18,6 +18,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n",
"e:\\PyProject\\NewStock\\main\\train\n"
]
}
@@ -44,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 9,
"id": "a79cafb06a7e0e43",
"metadata": {
"ExecuteTime": {
@@ -68,7 +70,7 @@
"cyq perf\n",
"left merge on ['ts_code', 'trade_date']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 8601132 entries, 0 to 8601131\n",
"RangeIndex: 8611848 entries, 0 to 8611847\n",
"Data columns (total 32 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
@@ -143,7 +145,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 10,
"id": "cac01788dac10678",
"metadata": {
"ExecuteTime": {
@@ -211,7 +213,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 11,
"id": "c4e9e1d31da6dba6",
"metadata": {
"ExecuteTime": {
@@ -224,6 +226,8 @@
},
"outputs": [],
"source": [
"from main.factor.factor import *\n",
"\n",
"def calculate_indicators(df):\n",
" \"\"\"\n",
" 计算四个指标当日涨跌幅、5日移动平均、RSI、MACD。\n",
@@ -261,6 +265,10 @@
" df['amount_mean'] = df['amount'].rolling(window=20).mean() # 过去20天的平均成交额\n",
" df['amount_change_rate'] = (df['amount'] - df['amount_mean']) / df['amount_mean'] * 100 # 成交额变化率\n",
"\n",
" # df = sentiment_panic_greed_index(df)\n",
" # df = sentiment_market_breadth_proxy(df)\n",
" # df = sentiment_reversal_indicator(df)\n",
"\n",
" return df\n",
"\n",
"\n",
@@ -283,8 +291,10 @@
" df_final = df_all_indicators.pivot_table(\n",
" index='trade_date',\n",
" columns='ts_code',\n",
" values=['daily_return', 'RSI', 'MACD', 'Signal_line',\n",
" 'MACD_hist', 'up_ratio_20d', 'volume_change_rate', 'volatility',\n",
" values=['daily_return', \n",
" 'RSI', 'MACD', 'Signal_line', 'MACD_hist', \n",
" # 'sentiment_panic_greed_index',\n",
" 'up_ratio_20d', 'volume_change_rate', 'volatility',\n",
" 'amount_change_rate', 'amount_mean'],\n",
" aggfunc='last'\n",
" )\n",
@@ -303,7 +313,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 12,
"id": "a735bc02ceb4d872",
"metadata": {
"ExecuteTime": {
@@ -319,7 +329,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 30,
"id": "53f86ddc0677a6d7",
"metadata": {
"ExecuteTime": {
@@ -368,6 +378,10 @@
" lambda x: x.rank(pct=True))\n",
" industry_data['return_20_percentile'] = industry_data.groupby('trade_date')['return_20'].transform(\n",
" lambda x: x.rank(pct=True))\n",
"\n",
" # cs_rank_intraday_range(industry_data)\n",
" # cs_rank_close_pos_in_range(industry_data)\n",
"\n",
" industry_data = industry_data.drop(columns=['open', 'close', 'high', 'low', 'pe', 'pb', 'vol'])\n",
"\n",
" industry_data = industry_data.rename(\n",
@@ -382,7 +396,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 14,
"id": "dbe2fd8021b9417f",
"metadata": {
"ExecuteTime": {
@@ -410,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 15,
"id": "85c3e3d0235ffffa",
"metadata": {
"ExecuteTime": {
@@ -433,7 +447,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 16,
"id": "92d84ce15a562ec6",
"metadata": {
"ExecuteTime": {
@@ -459,6 +473,14 @@
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"计算 BBI...\n",
"--- 计算日级别偏离度 (使用 pct_chg) ---\n",
"--- 计算日级别动量基准 (使用 pct_chg) ---\n",
"日级别动量基准计算完成 (使用 pct_chg)。\n",
"日级别偏离度计算完成 (使用 pct_chg)。\n",
"--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---\n",
"--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---\n",
"错误: 计算日级别行业动量基准需要以下列: ['pct_chg', 'cat_l2_code', 'trade_date', 'ts_code']。\n",
"错误: 计算日级别行业偏离度需要以下列: ['pct_chg', 'daily_industry_positive_benchmark', 'daily_industry_negative_benchmark']。请先运行 daily_industry_momentum_benchmark(df)。\n",
"Index(['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol',\n",
" 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv',\n",
" 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol',\n",
@@ -468,9 +490,9 @@
" 'winner_rate', 'l2_code', 'undist_profit_ps', 'ocfps', 'AR', 'BR',\n",
" 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio',\n",
" 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor',\n",
" 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol',\n",
" 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol',\n",
" 'lg_elg_buy_prop', 'flow_struct_buy_change',\n",
" 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity',\n",
" 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio',\n",
" 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change',\n",
" 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel',\n",
" 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy',\n",
" 'cost_support_15pct_change', 'cat_winner_price_zone',\n",
@@ -519,7 +541,7 @@
"Calculating cs_rank_flow_divergence...\n",
"Finished cs_rank_flow_divergence.\n",
"Calculating cs_rank_ind_adj_lg_flow...\n",
"Error calculating cs_rank_ind_adj_lg_flow: Missing 'cat_l2_code' column. Assigning NaN.\n",
"Finished cs_rank_ind_adj_lg_flow.\n",
"Calculating cs_rank_elg_buy_ratio...\n",
"Finished cs_rank_elg_buy_ratio.\n",
"Calculating cs_rank_rel_profit_margin...\n",
@@ -555,12 +577,12 @@
"Calculating cs_rank_size...\n",
"Finished cs_rank_size.\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 4503567 entries, 0 to 4503566\n",
"Columns: 177 entries, ts_code to cs_rank_size\n",
"dtypes: bool(10), datetime64[ns](1), float64(161), int32(3), object(2)\n",
"RangeIndex: 4509585 entries, 0 to 4509584\n",
"Columns: 178 entries, ts_code to cs_rank_size\n",
"dtypes: bool(10), datetime64[ns](1), float64(162), int32(3), object(2)\n",
"memory usage: 5.6+ GB\n",
"None\n",
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'cat_l2_code', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'price_cost_divergence', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_flow_divergence', 'cs_rank_ind_adj_lg_flow', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_opening_gap', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_ind_cap_neutral_pe', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size']\n"
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'cat_l2_code', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'price_cost_divergence', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_flow_divergence', 'cs_rank_ind_adj_lg_flow', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_opening_gap', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_ind_cap_neutral_pe', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size']\n"
]
}
],
@@ -595,9 +617,14 @@
"df = turnover_rate_n(df, n=5)\n",
"df = variance_n(df, n=20)\n",
"df = bbi_ratio_factor(df)\n",
"df = daily_deviation(df)\n",
"df = daily_industry_deviation(df)\n",
"df, _ = get_rolling_factor(df)\n",
"df, _ = get_simple_factor(df)\n",
"\n",
"df = df.rename(columns={'l1_code': 'cat_l1_code'})\n",
"df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
"\n",
"lg_flow_mom_corr(df, N=20, M=60)\n",
"lg_flow_accel(df)\n",
"profit_pressure(df)\n",
@@ -636,8 +663,6 @@
"cs_rank_cost_dist_vol_ratio(df) # Needs volume_ratio\n",
"cs_rank_size(df) # Needs circ_mv\n",
"\n",
"df = df.rename(columns={'l1_code': 'cat_l1_code'})\n",
"df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
"\n",
"# df = df.merge(index_data, on='trade_date', how='left')\n",
"\n",
@@ -647,7 +672,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 17,
"id": "b87b938028afa206",
"metadata": {
"ExecuteTime": {
@@ -685,7 +710,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 18,
"id": "f4f16d63ad18d1bc",
"metadata": {
"ExecuteTime": {
@@ -931,7 +956,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 19,
"id": "40e6b68a91b30c79",
"metadata": {
"ExecuteTime": {
@@ -1217,12 +1242,48 @@
" n_actual = min(n, len(rankic_series))\n",
" top_features = rankic_series.sort_values(ascending=False).head(n_actual).index.tolist()\n",
" top_features = [col for col in feature_columns if col in top_features or col not in numeric_columns]\n",
" return top_features"
" return top_features\n",
"\n",
"def create_deviation_within_dates(df, feature_columns):\n",
" groupby_col = 'cat_l2_code' # 使用 trade_date 进行分组\n",
" new_columns = {}\n",
" ret_feature_columns = feature_columns[:]\n",
"\n",
" # 自动选择所有数值型特征\n",
" num_features = [col for col in feature_columns if 'cat' not in col and 'index' not in col]\n",
"\n",
" # num_features = ['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'cat_vol_spike', 'obv', 'maobv_6', 'return_5', 'return_10', 'return_20', 'std_return_5', 'std_return_15', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'act_factor5', 'act_factor6', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'alpha_022', 'alpha_003', 'alpha_007', 'alpha_013']\n",
" num_features = [col for col in num_features if 'cat' not in col and 'industry' not in col]\n",
" num_features = [col for col in num_features if 'limit' not in col]\n",
" num_features = [col for col in num_features if 'cyq' not in col]\n",
"\n",
" # 遍历所有数值型特征\n",
" for feature in num_features:\n",
" if feature == 'trade_date': # 不需要对 'trade_date' 计算偏差\n",
" continue\n",
"\n",
" # grouped_mean = df.groupby(['trade_date'])[feature].transform('mean')\n",
" # deviation_col_name = f'deviation_mean_{feature}'\n",
" # new_columns[deviation_col_name] = df[feature] - grouped_mean\n",
" # ret_feature_columns.append(deviation_col_name)\n",
"\n",
" grouped_mean = df.groupby(['trade_date', groupby_col])[feature].transform('mean')\n",
" deviation_col_name = f'deviation_mean_{feature}'\n",
" new_columns[deviation_col_name] = df[feature] - grouped_mean\n",
" ret_feature_columns.append(deviation_col_name)\n",
"\n",
" # 将新计算的偏差特征与原始 DataFrame 合并\n",
" df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)\n",
"\n",
" # for feature in ['obv', 'return_20', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4']:\n",
" # df[f'deviation_industry_{feature}'] = df[feature] - df[f'industry_{feature}']\n",
"\n",
" return df, ret_feature_columns\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 20,
"id": "47c12bb34062ae7a",
"metadata": {
"ExecuteTime": {
@@ -1256,7 +1317,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "29221dde",
"metadata": {},
"outputs": [],
@@ -1288,13 +1349,13 @@
"\n",
"# df = fill_nan_with_daily_median(df, feature_columns)\n",
"for feature_col in [col for col in feature_columns if col in df.columns]:\n",
" median_val = df[feature_col].median()\n",
" # median_val = df[feature_col].median()\n",
" df[feature_col].fillna(0, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 22,
"id": "b76ea08a",
"metadata": {},
"outputs": [
@@ -1302,11 +1363,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date log_circ_mv\n",
"0 000001.SZ 2019-01-02 16.574219\n",
"2738 000001.SZ 2019-01-03 16.583965\n",
"5477 000001.SZ 2019-01-04 16.633371\n",
"['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n",
" ts_code trade_date log_circ_mv\n",
"0 000001.SZ 2019-01-02 16.574219\n",
"1 000001.SZ 2019-01-03 16.583965\n",
"2 000001.SZ 2019-01-04 16.633371\n",
"['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', 'industry_cs_rank_intraday_range', 'industry_cs_rank_close_pos_in_range', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n",
"去除极值\n",
"开始截面 MAD 去极值处理 (k=3.0)...\n"
]
@@ -1315,7 +1376,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"MAD Filtering: 100%|██████████| 130/130 [00:28<00:00, 4.62it/s]\n"
"MAD Filtering: 100%|██████████| 131/131 [00:27<00:00, 4.69it/s]\n"
]
},
{
@@ -1330,7 +1391,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"MAD Filtering: 100%|██████████| 130/130 [00:23<00:00, 5.55it/s]\n"
"MAD Filtering: 100%|██████████| 131/131 [00:23<00:00, 5.52it/s]\n"
]
},
{
@@ -1368,25 +1429,26 @@
"output_type": "stream",
"text": [
"截面 MAD 去极值处理完成。\n",
"feature_columns: ['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n",
"feature_columns: ['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', 'industry_cs_rank_intraday_range', 'industry_cs_rank_close_pos_in_range', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n",
"df最小日期: 2019-01-02\n",
"df最大日期: 2025-05-07\n",
"2057678\n",
"df最大日期: 2025-05-09\n",
"2057671\n",
"train_data最小日期: 2020-01-02\n",
"train_data最大日期: 2022-12-30\n",
"1730630\n",
"1736644\n",
"test_data最小日期: 2023-01-03\n",
"test_data最大日期: 2025-05-07\n",
" ts_code trade_date log_circ_mv\n",
"0 000001.SZ 2019-01-02 16.574219\n",
"2738 000001.SZ 2019-01-03 16.583965\n",
"5477 000001.SZ 2019-01-04 16.633371\n"
"test_data最大日期: 2025-05-09\n",
" ts_code trade_date log_circ_mv\n",
"0 000001.SZ 2019-01-02 16.574219\n",
"1 000001.SZ 2019-01-03 16.583965\n",
"2 000001.SZ 2019-01-04 16.633371\n"
]
}
],
"source": [
"train_data = df[filter_index & (df['trade_date'] <= '2023-01-01') & (df['trade_date'] >= '2020-01-01')]\n",
"test_data = df[(df['trade_date'] >= '2023-01-01')]\n",
"split_date = '2023-01-01'\n",
"train_data = df[filter_index & (df['trade_date'] <= split_date) & (df['trade_date'] >= '2020-01-01')]\n",
"test_data = df[(df['trade_date'] >= split_date)]\n",
"\n",
"print(df[['ts_code', 'trade_date', 'log_circ_mv']].head(3))\n",
"\n",
@@ -1401,8 +1463,8 @@
"train_data, test_data = train_data.replace([np.inf, -np.inf], np.nan), test_data.replace([np.inf, -np.inf], np.nan)\n",
"\n",
"# feature_columns_new = feature_columns[:]\n",
"# train_data, _ = create_deviation_within_dates(train_data, feature_columns)\n",
"# test_data, _ = create_deviation_within_dates(test_data, feature_columns)\n",
"# train_data, _ = create_deviation_within_dates(train_data, [col for col in feature_columns if col in train_data.columns])\n",
"# test_data, _ = create_deviation_within_dates(test_data, [col for col in feature_columns if col in train_data.columns])\n",
"\n",
"# feature_columns = [\n",
"# 'undist_profit_ps', \n",
@@ -1511,75 +1573,10 @@
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e23d1759",
"execution_count": 34,
"id": "3ff2d1c5",
"metadata": {},
"outputs": [],
"source": [
"# feature_columns = [\n",
"# 'undist_profit_ps', \n",
"# 'AR_BR', \n",
"# # 'pe_ttm',\n",
"# # 'alpha_22_improved', \n",
"# # 'alpha_003', \n",
"# # 'alpha_007', \n",
"# # 'alpha_013', \n",
"# # 'cat_up_limit', \n",
"# # 'cat_down_limit', \n",
"# # 'up_limit_count_10d', \n",
"# # 'down_limit_count_10d', \n",
"# # 'consecutive_up_limit', \n",
"# # 'vol_break', \n",
"# # 'weight_roc5', \n",
"# # 'price_cost_divergence', \n",
"# # 'smallcap_concentration', \n",
"# # 'cost_stability', \n",
"# # 'high_cost_break_days', \n",
"# # 'liquidity_risk', \n",
"# # 'turnover_std', \n",
"# # 'mv_volatility', \n",
"# # 'volume_growth', \n",
"# # 'mv_growth', \n",
"# # 'lg_flow_mom_corr_20_60', \n",
"# # 'lg_flow_accel', \n",
"# # 'profit_pressure', \n",
"# # 'underwater_resistance', \n",
"# # 'cost_conc_std_20', \n",
"# # 'profit_decay_20', \n",
"# # 'vol_amp_loss_20', \n",
"# # 'vol_drop_profit_cnt_5', \n",
"# # 'lg_flow_vol_interact_20', \n",
"# # 'cost_break_confirm_cnt_5', \n",
"# # 'atr_norm_channel_pos_14', \n",
"# # 'turnover_diff_skew_20', \n",
"# # 'lg_sm_flow_diverge_20', \n",
"# # 'pullback_strong_20_20', \n",
"# # 'vol_wgt_hist_pos_20', \n",
"# # 'vol_adj_roc_20',\n",
"# 'cashflow_to_ev_factor',\n",
"# 'ocfps',\n",
"# 'book_to_price_ratio',\n",
"# 'turnover_rate_mean_5',\n",
"# 'variance_20',\n",
"# 'bbi_ratio_factor'\n",
"# ]\n",
"# feature_columns = [col for col in feature_columns if col in train_data.columns]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8f134d435f71e9e2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-03T14:57:51.050696Z",
"start_time": "2025-04-03T14:57:51.034030Z"
},
"jupyter": {
"source_hidden": true
}
},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LogisticRegression\n",
@@ -1590,12 +1587,12 @@
"import datetime # 用于日期计算\n",
"from catboost import CatBoostClassifier\n",
"from catboost import Pool\n",
"\n",
"import lightgbm as lgb\n",
"\n",
"def train_model(train_data_df, feature_columns,\n",
" print_info=True, # 调整参数名,更通用\n",
" validation_days=180, use_pca=False, split_date=None,\n",
" target_column='label'): # 增加目标列参数\n",
" target_column='label', type='light'): # 增加目标列参数\n",
"\n",
" print('train data size: ', len(train_data_df))\n",
" print(train_data_df[['ts_code', 'trade_date', 'log_circ_mv']])\n",
@@ -1636,38 +1633,80 @@
" \n",
" # # 使用处理后的特征和样本权重进行训练\n",
" # model.fit(X_train, y_train)\n",
" \n",
" \n",
" cat_features = [i for i, col in enumerate(feature_columns) if col.startswith('cat')]\n",
" print(f'cat_features: {cat_features}')\n",
" # cat_features = []\n",
"\n",
" params = {\n",
" 'loss_function': 'Logloss', # 适用于二分类\n",
" 'eval_metric': 'Logloss', # 评估指标\n",
" 'iterations': 1500,\n",
" 'learning_rate': 0.01,\n",
" 'depth': 8, # 控制模型复杂度\n",
" 'l2_leaf_reg': 5, # L2 正则化\n",
" 'verbose': 5000,\n",
" 'early_stopping_rounds': 3000,\n",
" 'one_hot_max_size': 50,\n",
" 'class_weights': [0.6, 1.2],\n",
" 'task_type': 'GPU',\n",
" 'has_time': True,\n",
" 'random_seed': 7\n",
" }\n",
"\n",
" train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)\n",
" val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)\n",
"\n",
"\n",
" model = CatBoostClassifier(**params)\n",
" model.fit(train_pool,\n",
" eval_set=val_pool, \n",
" plot=True, \n",
" use_best_model=True\n",
" )\n",
" if type == 'cat':\n",
" params = {\n",
" 'loss_function': 'Logloss', # 适用于二分类\n",
" 'eval_metric': 'Logloss', # 评估指标\n",
" 'iterations': 1500,\n",
" 'learning_rate': 0.01,\n",
" 'depth': 10, # 控制模型复杂度\n",
" 'l2_leaf_reg': 50, # L2 正则化\n",
" 'verbose': 5000,\n",
" 'early_stopping_rounds': 3000,\n",
" 'one_hot_max_size': 50,\n",
" 'class_weights': [0.6, 1.2],\n",
" 'task_type': 'GPU',\n",
" 'has_time': True,\n",
" 'random_seed': 7\n",
" }\n",
" cat_features = [i for i, col in enumerate(feature_columns) if col.startswith('cat')]\n",
" train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)\n",
" val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)\n",
"\n",
"\n",
" model = CatBoostClassifier(**params)\n",
" model.fit(train_pool,\n",
" eval_set=val_pool, \n",
" plot=True, \n",
" use_best_model=True\n",
" )\n",
" elif type == 'light':\n",
" params = {\n",
" 'objective': 'binary',\n",
" 'metric': 'average_precision',\n",
" 'learning_rate': 0.01,\n",
" 'is_unbalance': True,\n",
" 'num_leaves': 2048,\n",
" 'min_data_in_leaf': 1024,\n",
" 'max_depth': 32,\n",
" 'max_bin': 1024,\n",
" 'feature_fraction': 0.5,\n",
" 'bagging_fraction': 0.5,\n",
" 'bagging_freq': 1,\n",
" 'lambda_l1': 50,\n",
" 'lambda_l2': 50,\n",
" 'verbosity': -1,\n",
" 'num_threads' : 8\n",
" }\n",
" categorical_feature = [col for col in feature_columns if 'cat' in col]\n",
" train_dataset = lgb.Dataset(\n",
" X_train, label=y_train,\n",
" categorical_feature=categorical_feature\n",
" )\n",
" val_dataset = lgb.Dataset(\n",
" X_val, label=y_val,\n",
" categorical_feature=categorical_feature\n",
" )\n",
"\n",
" evals = {}\n",
" callbacks = [lgb.log_evaluation(period=1000),\n",
" lgb.callback.record_evaluation(evals),\n",
" lgb.early_stopping(100, first_metric_only=True)\n",
" ]\n",
" # 训练模型\n",
" model = lgb.train(\n",
" params, train_dataset, num_boost_round=1000,\n",
" valid_sets=[train_dataset, val_dataset], valid_names=['train', 'valid'],\n",
" callbacks=callbacks\n",
" )\n",
"\n",
" # 打印特征重要性(如果需要)\n",
" if True:\n",
" lgb.plot_metric(evals)\n",
" lgb.plot_importance(model, importance_type='split', max_num_features=20)\n",
" plt.show()\n",
"\n",
"\n",
" return model, scaler, None # 返回训练好的模型、scaler 和 pca 对象"
@@ -1675,7 +1714,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 35,
"id": "c6eb5cd4-e714-420a-ac48-39af3e11ee81",
"metadata": {
"ExecuteTime": {
@@ -1703,14 +1742,13 @@
"36399 600561.SH 2022-12-30 11.858571\n",
"\n",
"[36400 rows x 3 columns]\n",
"原始样本数: 36400, 去除标签为空后样本数: 36400\n",
"cat_features: [27, 30, 37, 39, 41, 80, 86, 87, 88, 100, 102, 141]\n"
"原始样本数: 36400, 去除标签为空后样本数: 36400\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "73e4fa876f004bafb847cea54620f732",
"model_id": "0acc9aa66b564c16ba0dfdaa7dab6a9e",
"version_major": 2,
"version_minor": 0
},
@@ -1725,11 +1763,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0:\tlearn: 0.6886803\ttest: 0.6892921\tbest: 0.6892921 (0)\ttotal: 141ms\tremaining: 3m 32s\n",
"1499:\tlearn: 0.3359066\ttest: 0.5174742\tbest: 0.5163721 (873)\ttotal: 1m 51s\tremaining: 0us\n",
"bestTest = 0.5163720682\n",
"bestIteration = 873\n",
"Shrink model to first 874 iterations.\n"
"0:\tlearn: 0.6886094\ttest: 0.6894541\tbest: 0.6894541 (0)\ttotal: 255ms\tremaining: 6m 22s\n",
"1499:\tlearn: 0.3197977\ttest: 0.5228570\tbest: 0.5197799 (414)\ttotal: 5m 23s\tremaining: 0us\n",
"bestTest = 0.5197798592\n",
"bestIteration = 414\n",
"Shrink model to first 415 iterations.\n"
]
}
],
@@ -1738,6 +1776,7 @@
"gc.collect()\n",
"\n",
"use_pca = False\n",
"type = 'cat'\n",
"# feature_contri = [2 if feat.startswith('act_factor') or 'buy' in feat or 'sell' in feat else 1 for feat in feature_columns]\n",
"# light_params['feature_contri'] = feature_contri\n",
"# print(f'feature_contri: {feature_contri}')\n",
@@ -1745,71 +1784,12 @@
" .dropna(subset=['label']).groupby('trade_date', group_keys=False)\n",
" .apply(lambda x: x.nsmallest(50, 'total_mv'))\n",
" .merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n",
" .merge(index_data, on='trade_date', how='left'), feature_columns)\n"
" .merge(index_data, on='trade_date', how='left'), feature_columns, type=type)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ec189398",
"metadata": {},
"outputs": [],
"source": [
"# if True:\n",
"# train_data_df = train_data.dropna(subset=['label']).groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(50, 'total_mv'))\n",
"# # 识别数值型特征列\n",
"\n",
"# # 去除标签为空的样本\n",
"# initial_len = len(train_data_df)\n",
"# train_data_df = train_data_df.dropna(subset=['label'])\n",
"\n",
"\n",
"# # 提取特征和标签,只取数值型特征用于线性回归\n",
" \n",
"# all_dates = train_data_df['trade_date'].unique() # 获取所有唯一的 trade_date\n",
"# split_date = all_dates[-validation_days] # 划分点为倒数第 validation_days 天\n",
"# val_data_split = train_data_df[train_data_df['trade_date'] >= split_date] # 验证集\n",
" \n",
"\n",
"# score_df = val_data_split\n",
"# score_df = fill_nan_with_daily_median(score_df, ['pe_ttm'])\n",
"# score_df = score_df[score_df['pe_ttm'] > 0]\n",
"# score_df = score_df.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n",
"# score_df = score_df.merge(index_data, on='trade_date', how='left')\n",
"# # score_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(50, 'total_mv')).reset_index()\n",
"# numeric_columns = score_df.select_dtypes(include=['float64', 'int64']).columns\n",
"# numeric_columns = [col for col in feature_columns if col in numeric_columns]\n",
"# # score_df.loc[:, numeric_columns] = scaler.transform(score_df[numeric_columns])\n",
"# # score_df = cross_sectional_standardization(score_df, numeric_columns)\n",
"# print(score_df.columns.tolist())\n",
"\n",
"# score_df['score'] = model.predict_proba(score_df[feature_columns])[:, 1]\n",
"# score_df['score_ranks'] = score_df.groupby('trade_date')['score'].rank(ascending=True)\n",
"\n",
"# score_df = score_df.groupby('trade_date', group_keys=False).apply(\n",
"# lambda x: x[x['score'] >= x['score'].quantile(0.90)] # 计算90%分位数作为阈值,筛选分数>=阈值的行\n",
"# ).reset_index(drop=True) # drop=True 避免添加旧索引列\n",
"# # save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nlargest(1, 'score')).reset_index()\n",
"# save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(1, 'total_mv')).reset_index()\n",
"# # save_df[['trade_date', 'score', 'ts_code']].to_csv('predictions_test.tsv', index=False)\n",
"# import pandas as pd\n",
"# from sklearn.metrics import accuracy_score\n",
"\n",
"# # 假设 df 是你的 DataFrame\n",
"# # df = pd.read_csv('your_data.csv')\n",
"\n",
"# # 将预测分数转换为类别预测例如0.5 为阈值)\n",
"# save_df['pred'] = (save_df['score'] >= 0.5).astype(int)\n",
"\n",
"# # 计算准确率\n",
"# acc = accuracy_score(save_df['label'], save_df['pred'])\n",
"\n",
"# print(f\"准确率为:{acc:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 36,
"id": "5d1522a7538db91b",
"metadata": {
"ExecuteTime": {
@@ -1819,13 +1799,6 @@
},
"outputs": [],
"source": [
"# train_data = train_data.sort_values(by='trade_date')\n",
"# all_dates = train_data['trade_date'].unique() # 获取所有唯一的 trade_date\n",
"# split_date = all_dates[-120] # 划分点为倒数第 validation_days 天\n",
"# print(split_date)\n",
"# print(all_dates)\n",
"# val_data_split = train_data[train_data['trade_date'] >= split_date] # 验证集\n",
"\n",
"score_df = test_data.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(500, 'total_mv'))\n",
"# score_df = fill_nan_with_daily_median(score_df, ['pe_ttm'])\n",
"# score_df = score_df[score_df['pe_ttm'] > 0]\n",
@@ -1837,21 +1810,24 @@
"# score_df.loc[:, numeric_columns] = scaler.transform(score_df[numeric_columns])\n",
"# score_df = cross_sectional_standardization(score_df, numeric_columns)\n",
"\n",
"score_df['score'] = model.predict_proba(score_df[feature_columns])[:, 1]\n",
"if type == 'cat':\n",
" score_df['score'] = model.predict_proba(score_df[feature_columns])[:, 1]\n",
"elif type == 'light':\n",
" score_df['score'] = model.predict(score_df[feature_columns])\n",
"score_df['score_ranks'] = score_df.groupby('trade_date')['score'].rank(ascending=True)\n",
"\n",
"score_df = score_df.groupby('trade_date', group_keys=False).apply(\n",
" lambda x: x[x['score'] >= x['score'].quantile(0.90)] # 计算90%分位数作为阈值,筛选分数>=阈值的行\n",
").reset_index(drop=True) # drop=True 避免添加旧索引列\n",
"# save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nlargest(1, 'score')).reset_index()\n",
"save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(1, 'total_mv')).reset_index()\n",
"save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(2, 'total_mv')).reset_index()\n",
"save_df = save_df.sort_values(['trade_date', 'score'])\n",
"save_df[['trade_date', 'score', 'ts_code']].to_csv('predictions_test.tsv', index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 37,
"id": "09b1799e",
"metadata": {},
"outputs": [
@@ -1859,8 +1835,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"190\n",
"['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n"
"191\n",
"['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n"
]
}
],
@@ -1871,7 +1847,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 38,
"id": "7e9023cc",
"metadata": {},
"outputs": [],
@@ -2071,7 +2047,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 39,
"id": "a0000d75",
"metadata": {},
"outputs": [
@@ -2081,7 +2057,7 @@
"text": [
"开始分析 'score' 在 'circ_mv' 和 'future_return' 下的表现...\n",
"准备数据,处理 NaN 值...\n",
"原始数据 28200 行,移除 NaN 后剩余 27807 行用于分析。\n",
"原始数据 28300 行,移除 NaN 后剩余 27850 行用于分析。\n",
"对 'circ_mv' 和 'future_return' 进行 100 分位数分箱...\n",
"按二维分箱分组计算 Spearman Rank IC...\n",
"整理结果用于绘图...\n",
@@ -2319,7 +2295,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 40,
"id": "a436dba4",
"metadata": {},
"outputs": [