NewStock/main/factor/generate_factor.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "initial_id",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-13T07:26:19.000054Z",
     "start_time": "2025-04-13T07:26:18.895713Z"
    },
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/mnt/d/PyProject/NewStock\n"
     ]
    }
   ],
   "source": [
    "import gc\n",
    "import os\n",
    "import sys\n",
    "sys.path.append('../../')\n",
    "print(os.getcwd())\n",
    "import pandas as pd\n",
    "from main.factor.factor import get_rolling_factor, get_simple_factor\n",
    "from main.utils.factor import read_industry_data\n",
    "from main.utils.factor_processor import calculate_score\n",
    "from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data\n",
    "\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f1623b04c7a366af",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-13T07:30:48.534271Z",
     "start_time": "2025-04-13T07:26:19.005576Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "daily data\n",
      "daily basic\n",
      "inner merge on ['ts_code', 'trade_date']\n",
      "stk limit\n",
      "left merge on ['ts_code', 'trade_date']\n",
      "money flow\n",
      "left merge on ['ts_code', 'trade_date']\n",
      "cyq perf\n",
      "left merge on ['ts_code', 'trade_date']\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 8713571 entries, 0 to 8713570\n",
      "Data columns (total 33 columns):\n",
      " #   Column         Dtype         \n",
      "---  ------         -----         \n",
      " 0   ts_code        object        \n",
      " 1   trade_date     datetime64[ns]\n",
      " 2   open           float64       \n",
      " 3   close          float64       \n",
      " 4   high           float64       \n",
      " 5   low            float64       \n",
      " 6   vol            float64       \n",
      " 7   pct_chg        float64       \n",
      " 8   amount         float64       \n",
      " 9   turnover_rate  float64       \n",
      " 10  pe_ttm         float64       \n",
      " 11  circ_mv        float64       \n",
      " 12  total_mv       float64       \n",
      " 13  volume_ratio   float64       \n",
      " 14  is_st          bool          \n",
      " 15  up_limit       float64       \n",
      " 16  down_limit     float64       \n",
      " 17  buy_sm_vol     float64       \n",
      " 18  sell_sm_vol    float64       \n",
      " 19  buy_lg_vol     float64       \n",
      " 20  sell_lg_vol    float64       \n",
      " 21  buy_elg_vol    float64       \n",
      " 22  sell_elg_vol   float64       \n",
      " 23  net_mf_vol     float64       \n",
      " 24  his_low        float64       \n",
      " 25  his_high       float64       \n",
      " 26  cost_5pct      float64       \n",
      " 27  cost_15pct     float64       \n",
      " 28  cost_50pct     float64       \n",
      " 29  cost_85pct     float64       \n",
      " 30  cost_95pct     float64       \n",
      " 31  weight_avg     float64       \n",
      " 32  winner_rate    float64       \n",
      "dtypes: bool(1), datetime64[ns](1), float64(30), object(1)\n",
      "memory usage: 2.1+ GB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "from main.utils.utils import read_and_merge_h5_data\n",
    "\n",
    "print('daily data')\n",
    "df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/daily_data.h5', key='daily_data',\n",
    "                            columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount'],\n",
    "                            df=None)\n",
    "\n",
    "print('daily basic')\n",
    "df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/daily_basic.h5', key='daily_basic',\n",
    "                            columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio',\n",
    "                                     'is_st'], df=df, join='inner')\n",
    "\n",
    "print('stk limit')\n",
    "df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/stk_limit.h5', key='stk_limit',\n",
    "                            columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],\n",
    "                            df=df)\n",
    "print('money flow')\n",
    "df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/money_flow.h5', key='money_flow',\n",
    "                            columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol',\n",
    "                                     'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],\n",
    "                            df=df)\n",
    "print('cyq perf')\n",
    "df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/cyq_perf.h5', key='cyq_perf',\n",
    "                            columns=['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',\n",
    "                                     'cost_50pct',\n",
    "                                     'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],\n",
    "                            df=df)\n",
    "print(df.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0acb6625",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate']\n"
     ]
    }
   ],
   "source": [
    "\n",
    "origin_columns = df.columns.tolist()\n",
    "origin_columns = [col for col in origin_columns if 'cyq' not in col]\n",
    "print(origin_columns)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "820a6b50",
   "metadata": {},
   "outputs": [],
   "source": [
    "fina_indicator_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/fina_indicator.h5', key='fina_indicator',\n",
    "                            columns=['ts_code', 'ann_date', 'undist_profit_ps', 'ocfps', 'bps'],\n",
    "                            df=None)\n",
    "cashflow_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/cashflow.h5', key='cashflow',\n",
    "                            columns=['ts_code', 'ann_date', 'n_cashflow_act'],\n",
    "                            df=None)\n",
    "balancesheet_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/balancesheet.h5', key='balancesheet',\n",
    "                            columns=['ts_code', 'ann_date', 'money_cap', 'total_liab'],\n",
    "                            df=None)\n",
    "top_list_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/top_list.h5', key='top_list',\n",
    "                            columns=['ts_code', 'trade_date', 'reason'],\n",
    "                            df=None)\n",
    "\n",
    "top_list_df = top_list_df.sort_values(by='trade_date', ascending=False).drop_duplicates(subset=['ts_code', 'trade_date'], keep='first').sort_values(by='trade_date')\n",
    "\n",
    "stk_holdertrade_df = read_and_merge_h5_data('/mnt/d/PyProject/NewStock/data/stk_holdertrade.h5', key='stk_holdertrade',\n",
    "                            columns=['ts_code', 'ann_date', 'in_de', 'change_ratio'],\n",
    "                            df=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "903469a7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ 成功从 Redis Hash 'concept_stocks_daily_lists_pickle' 读取 1794 条每日概念股票数据。\n"
     ]
    }
   ],
   "source": [
    "import redis\n",
    "import pickle\n",
    "from datetime import date, datetime\n",
    "\n",
    "# --- 配置 Redis 连接 ---\n",
    "REDIS_HOST = '140.143.91.66'\n",
    "REDIS_PORT = 6389\n",
    "REDIS_DB = 0\n",
    "\n",
    "# --- 定义 Redis 键名 ---\n",
    "HASH_KEY = \"concept_stocks_daily_lists_pickle\" # 区分之前的 JSON 版本\n",
    "MAX_DATE_KEY = \"concept_stocks_max_date_pickle\" # 区分之前的 JSON 版本\n",
    "\n",
    "concept_dict = {}\n",
    "\n",
    "# --- 连接 Redis ---\n",
    "try:\n",
    "    r = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password='Redis520102')\n",
    "\n",
    "    all_data_from_redis = r.hgetall(HASH_KEY) # 返回的是字典，键是字节，值是字节\n",
    "    \n",
    "    if all_data_from_redis:\n",
    "        for date_bytes, stocks_bytes in all_data_from_redis.items(): # 将变量名改为 date_bytes 更清晰\n",
    "            try:\n",
    "                # *** 修正点：将日期字节解码为字符串 ***\n",
    "                date_str = date_bytes.decode('utf-8') \n",
    "                date_obj = datetime.strptime(date_str, '%Y%m%d').date()\n",
    "                \n",
    "                stocks_list = pickle.loads(stocks_bytes)\n",
    "                concept_dict[date_obj] = stocks_list\n",
    "            except (ValueError, pickle.UnpicklingError) as e:\n",
    "                print(f\"⚠️ 警告: 解析 Redis 数据时出错 (日期键: '{date_bytes.decode('utf-8', errors='ignore')}')，跳过此条数据: {e}\") # 打印警告时也解码一下\n",
    "        print(f\"✅ 成功从 Redis Hash '{HASH_KEY}' 读取 {len(concept_dict)} 条每日概念股票数据。\")\n",
    "    else:\n",
    "        print(f\"ℹ️ Redis Hash '{HASH_KEY}' 中没有找到任何数据。\")\n",
    "\n",
    "except redis.exceptions.ConnectionError as e:\n",
    "    print(f\"❌ 错误: 无法连接到 Redis 服务器，请检查 Redis 是否正在运行或连接配置: {e}\")\n",
    "except Exception as e:\n",
    "    print(f\"❌ 从 Redis 读取数据时发生未知错误: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "afb8da3d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4566757\n",
      "开始生成概念相关因子...\n",
      "开始计算概念内截面排序因子，基于: ['pct_chg', 'turnover_rate', 'volume_ratio']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Ranking Features in Concepts: 100%|██████████| 3/3 [00:00<00:00, 15.82it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "概念相关因子生成完毕。\n",
      "4566757\n",
      "开始计算股东增减持因子...\n",
      "警告: 'in_de' 列中存在未映射的值，可能导致 _direction 列出现NaN。\n",
      "股东增减持因子计算完成。\n",
      "Calculating cat_senti_mom_vol_spike...\n",
      "Finished cat_senti_mom_vol_spike.\n",
      "Calculating cat_senti_pre_breakout...\n",
      "Calculating atr_10 as it's missing...\n",
      "Calculating atr_40 as it's missing...\n",
      "Finished cat_senti_pre_breakout.\n",
      "计算因子 ts_turnover_rate_acceleration_5_20\n",
      "计算因子 ts_vol_sustain_10_30\n",
      "计算因子 cs_amount_outlier_10\n",
      "计算因子 ts_ff_to_total_turnover_ratio\n",
      "计算因子 ts_price_volume_trend_coherence_5_20\n",
      "计算因子 ts_ff_turnover_rate_surge_10\n",
      "使用 'ann_date' 作为财务数据生效日期。\n",
      "警告: 从 financial_data_subset 中移除了 366 行，因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
      "使用 'ann_date' 作为财务数据生效日期。\n",
      "警告: 从 financial_data_subset 中移除了 366 行，因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
      "开始计算因子: AR, BR (原地修改)...\n",
      "因子 AR, BR 计算成功。\n",
      "因子 AR, BR 计算流程结束。\n",
      "使用 'ann_date' 作为财务数据生效日期。\n",
      "使用 'ann_date' 作为财务数据生效日期。\n",
      "使用 'ann_date' 作为财务数据生效日期。\n",
      "使用 'ann_date' 作为财务数据生效日期。\n",
      "警告: 从 financial_data_subset 中移除了 366 行，因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
      "计算 BBI...\n",
      "--- 计算日级别偏离度 (使用 pct_chg) ---\n",
      "--- 计算日级别动量基准 (使用 pct_chg) ---\n",
      "日级别动量基准计算完成 (使用 pct_chg)。\n",
      "日级别偏离度计算完成 (使用 pct_chg)。\n",
      "--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---\n",
      "--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---\n",
      "错误: 计算日级别行业动量基准需要以下列: ['pct_chg', 'cat_l2_code', 'trade_date', 'ts_code']。\n",
      "错误: 计算日级别行业偏离度需要以下列: ['pct_chg', 'daily_industry_positive_benchmark', 'daily_industry_negative_benchmark']。请先运行 daily_industry_momentum_benchmark(df)。\n",
      "Index(['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol',\n",
      "       'pct_chg', 'amount', 'turnover_rate',\n",
      "       ...\n",
      "       'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike',\n",
      "       'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike',\n",
      "       'vol_std_5', 'atr_14', 'atr_6', 'obv'],\n",
      "      dtype='object', length=103)\n",
      "Calculating senti_strong_inflow...\n",
      "Finished senti_strong_inflow.\n",
      "Calculating lg_flow_mom_corr_20_60...\n",
      "Finished lg_flow_mom_corr_20_60.\n",
      "Calculating lg_flow_accel...\n",
      "Finished lg_flow_accel.\n",
      "Calculating profit_pressure...\n",
      "Finished profit_pressure.\n",
      "Calculating underwater_resistance...\n",
      "Finished underwater_resistance.\n",
      "Calculating cost_conc_std_20...\n",
      "Finished cost_conc_std_20.\n",
      "Calculating profit_decay_20...\n",
      "Finished profit_decay_20.\n",
      "Calculating vol_amp_loss_20...\n",
      "Finished vol_amp_loss_20.\n",
      "Calculating vol_drop_profit_cnt_5...\n",
      "Finished vol_drop_profit_cnt_5.\n",
      "Calculating lg_flow_vol_interact_20...\n",
      "Finished lg_flow_vol_interact_20.\n",
      "Calculating cost_break_confirm_cnt_5...\n",
      "Finished cost_break_confirm_cnt_5.\n",
      "Calculating atr_norm_channel_pos_14...\n",
      "Finished atr_norm_channel_pos_14.\n",
      "Calculating turnover_diff_skew_20...\n",
      "Finished turnover_diff_skew_20.\n",
      "Calculating lg_sm_flow_diverge_20...\n",
      "Finished lg_sm_flow_diverge_20.\n",
      "Calculating pullback_strong_20_20...\n",
      "Finished pullback_strong_20_20.\n",
      "Calculating vol_wgt_hist_pos_20...\n",
      "Finished vol_wgt_hist_pos_20.\n",
      "Calculating vol_adj_roc_20...\n",
      "Finished vol_adj_roc_20.\n",
      "Calculating cs_rank_net_lg_flow_val...\n",
      "Finished cs_rank_net_lg_flow_val.\n",
      "Calculating cs_rank_flow_divergence...\n",
      "Finished cs_rank_flow_divergence.\n",
      "Calculating cs_rank_ind_adj_lg_flow...\n",
      "Error calculating cs_rank_ind_adj_lg_flow: Missing 'cat_l2_code' column. Assigning NaN.\n",
      "Calculating cs_rank_elg_buy_ratio...\n",
      "Finished cs_rank_elg_buy_ratio.\n",
      "Calculating cs_rank_rel_profit_margin...\n",
      "Finished cs_rank_rel_profit_margin.\n",
      "Calculating cs_rank_cost_breadth...\n",
      "Finished cs_rank_cost_breadth.\n",
      "Calculating cs_rank_dist_to_upper_cost...\n",
      "Finished cs_rank_dist_to_upper_cost.\n",
      "Calculating cs_rank_winner_rate...\n",
      "Finished cs_rank_winner_rate.\n",
      "Calculating cs_rank_intraday_range...\n",
      "Finished cs_rank_intraday_range.\n",
      "Calculating cs_rank_close_pos_in_range...\n",
      "Finished cs_rank_close_pos_in_range.\n",
      "Calculating cs_rank_opening_gap...\n",
      "Error calculating cs_rank_opening_gap: Missing 'pre_close' column. Assigning NaN.\n",
      "Calculating cs_rank_pos_in_hist_range...\n",
      "Finished cs_rank_pos_in_hist_range.\n",
      "Calculating cs_rank_vol_x_profit_margin...\n",
      "Finished cs_rank_vol_x_profit_margin.\n",
      "Calculating cs_rank_lg_flow_price_concordance...\n",
      "Finished cs_rank_lg_flow_price_concordance.\n",
      "Calculating cs_rank_turnover_per_winner...\n",
      "Finished cs_rank_turnover_per_winner.\n",
      "Calculating cs_rank_ind_cap_neutral_pe (Placeholder - requires statsmodels)...\n",
      "Finished cs_rank_ind_cap_neutral_pe (Placeholder).\n",
      "Calculating cs_rank_volume_ratio...\n",
      "Finished cs_rank_volume_ratio.\n",
      "Calculating cs_rank_elg_buy_sell_sm_ratio...\n",
      "Finished cs_rank_elg_buy_sell_sm_ratio.\n",
      "Calculating cs_rank_cost_dist_vol_ratio...\n",
      "Finished cs_rank_cost_dist_vol_ratio.\n",
      "Calculating cs_rank_size...\n",
      "Finished cs_rank_size.\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4566757 entries, 0 to 4566756\n",
      "Columns: 197 entries, ts_code to cs_rank_size\n",
      "dtypes: bool(10), datetime64[ns](1), float64(176), int64(6), int8(1), object(3)\n",
      "memory usage: 6.4+ GB\n",
      "None\n",
      "['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'amount', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'cat_hot_concept_stock', 'concept_rank_pct_chg', 'concept_rank_turnover_rate', 'concept_rank_volume_ratio', 'holder_net_change_sum_10d', 'holder_increase_days_10d', 'holder_decrease_days_10d', 'holder_any_increase_flag_10d', 'holder_any_decrease_flag_10d', 'holder_direction_score_10d', 'cat_senti_mom_vol_spike', 'cat_senti_pre_breakout', 'ts_turnover_rate_acceleration_5_20', 'ts_vol_sustain_10_30', 'cs_amount_outlier_10', 'ts_ff_to_total_turnover_ratio', 'ts_price_volume_trend_coherence_5_20', 'ts_ff_turnover_rate_surge_10', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 
'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'price_cost_divergence', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'senti_strong_inflow', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_flow_divergence', 'cs_rank_ind_adj_lg_flow', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_opening_gap', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_ind_cap_neutral_pe', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size']\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from main.factor.factor import *\n",
    "from main.factor.money_factor import * \n",
    "from main.factor.concept_factor import * \n",
    "\n",
    "\n",
    "def filter_data(df):\n",
    "    # df = df.groupby('trade_date').apply(lambda x: x.nlargest(1000, 'act_factor1'))\n",
    "    df = df[~df[\"is_st\"]]\n",
    "    df = df[~df[\"ts_code\"].str.endswith(\"BJ\")]\n",
    "    df = df[~df[\"ts_code\"].str.startswith(\"30\")]\n",
    "    df = df[~df[\"ts_code\"].str.startswith(\"68\")]\n",
    "    df = df[~df[\"ts_code\"].str.startswith(\"8\")]\n",
    "    df = df[df[\"trade_date\"] >= \"2019-01-01\"]\n",
    "    if \"in_date\" in df.columns:\n",
    "        df = df.drop(columns=[\"in_date\"])\n",
    "    df = df.reset_index(drop=True)\n",
    "    return df\n",
    "\n",
    "\n",
    "gc.collect()\n",
    "\n",
    "df = filter_data(df)\n",
    "df = df.sort_values(by=[\"ts_code\", \"trade_date\"])\n",
    "\n",
    "# df = price_minus_deduction_price(df, n=120)\n",
    "# df = price_deduction_price_diff_ratio_to_sma(df, n=120)\n",
    "# df = cat_price_vs_sma_vs_deduction_price(df, n=120)\n",
    "# df = cat_reason(df, top_list_df)\n",
    "# df = cat_is_on_top_list(df, top_list_df)\n",
    "print(len(df))\n",
    "df = generate_concept_factors(df, concept_dict)\n",
    "print(len(df))\n",
    "\n",
    "df = holder_trade_factors(df, stk_holdertrade_df)\n",
    "\n",
    "df = cat_senti_mom_vol_spike(\n",
    "    df,\n",
    "    return_period=3,\n",
    "    return_threshold=0.03,  # 近3日涨幅超3%\n",
    "    volume_ratio_threshold=1.3,\n",
    "    current_pct_chg_min=0.0,  # 当日必须收红\n",
    "    current_pct_chg_max=0.05,\n",
    ")  # 当日涨幅不宜过大\n",
    "\n",
    "df = cat_senti_pre_breakout(\n",
    "    df,\n",
    "    atr_short_N=10,\n",
    "    atr_long_M=40,\n",
    "    vol_atrophy_N=10,\n",
    "    vol_atrophy_M=40,\n",
    "    price_stab_N=5,\n",
    "    price_stab_threshold=0.06,\n",
    "    current_pct_chg_min_signal=0.002,\n",
    "    current_pct_chg_max_signal=0.05,\n",
    "    volume_ratio_signal_threshold=1.1,\n",
    ")\n",
    "\n",
    "df = ts_turnover_rate_acceleration_5_20(df)\n",
    "df = ts_vol_sustain_10_30(df)\n",
    "# df = cs_turnover_rate_relative_strength_20(df)\n",
    "df = cs_amount_outlier_10(df)\n",
    "df = ts_ff_to_total_turnover_ratio(df)\n",
    "df = ts_price_volume_trend_coherence_5_20(df)\n",
    "# df = ts_turnover_rate_trend_strength_5(df)\n",
    "df = ts_ff_turnover_rate_surge_10(df)\n",
    "\n",
    "df = add_financial_factor(df, fina_indicator_df, factor_value_col=\"undist_profit_ps\")\n",
    "df = add_financial_factor(df, fina_indicator_df, factor_value_col=\"ocfps\")\n",
    "calculate_arbr(df, N=26)\n",
    "df[\"log_circ_mv\"] = np.log(df[\"circ_mv\"])\n",
    "df = calculate_cashflow_to_ev_factor(df, cashflow_df, balancesheet_df)\n",
    "df = caculate_book_to_price_ratio(df, fina_indicator_df)\n",
    "df = turnover_rate_n(df, n=5)\n",
    "df = variance_n(df, n=20)\n",
    "df = bbi_ratio_factor(df)\n",
    "df = daily_deviation(df)\n",
    "df = daily_industry_deviation(df)\n",
    "df, _ = get_rolling_factor(df)\n",
    "df, _ = get_simple_factor(df)\n",
    "\n",
    "df = calculate_strong_inflow_signal(df)\n",
    "\n",
    "df = df.rename(columns={\"l1_code\": \"cat_l1_code\"})\n",
    "df = df.rename(columns={\"l2_code\": \"cat_l2_code\"})\n",
    "\n",
    "lg_flow_mom_corr(df, N=20, M=60)\n",
    "lg_flow_accel(df)\n",
    "profit_pressure(df)\n",
    "underwater_resistance(df)\n",
    "cost_conc_std(df, N=20)\n",
    "profit_decay(df, N=20)\n",
    "vol_amp_loss(df, N=20)\n",
    "vol_drop_profit_cnt(df, N=20, M=5)\n",
    "lg_flow_vol_interact(df, N=20)\n",
    "cost_break_confirm_cnt(df, M=5)\n",
    "atr_norm_channel_pos(df, N=14)\n",
    "turnover_diff_skew(df, N=20)\n",
    "lg_sm_flow_diverge(df, N=20)\n",
    "pullback_strong(df, N=20, M=20)\n",
    "vol_wgt_hist_pos(df, N=20)\n",
    "vol_adj_roc(df, N=20)\n",
    "\n",
    "cs_rank_net_lg_flow_val(df)\n",
    "cs_rank_flow_divergence(df)\n",
    "cs_rank_industry_adj_lg_flow(df)  # Needs cat_l2_code\n",
    "cs_rank_elg_buy_ratio(df)\n",
    "cs_rank_rel_profit_margin(df)\n",
    "cs_rank_cost_breadth(df)\n",
    "cs_rank_dist_to_upper_cost(df)\n",
    "cs_rank_winner_rate(df)\n",
    "cs_rank_intraday_range(df)\n",
    "cs_rank_close_pos_in_range(df)\n",
    "cs_rank_opening_gap(df)  # Needs pre_close\n",
    "cs_rank_pos_in_hist_range(df)  # Needs his_low, his_high\n",
    "cs_rank_vol_x_profit_margin(df)\n",
    "cs_rank_lg_flow_price_concordance(df)\n",
    "cs_rank_turnover_per_winner(df)\n",
    "cs_rank_ind_cap_neutral_pe(df)  # Placeholder - needs external libraries\n",
    "cs_rank_volume_ratio(df)  # Needs volume_ratio\n",
    "cs_rank_elg_buy_sell_sm_ratio(df)\n",
    "cs_rank_cost_dist_vol_ratio(df)  # Needs volume_ratio\n",
    "cs_rank_size(df)  # Needs circ_mv\n",
    "\n",
    "\n",
    "# df = df.merge(index_data, on='trade_date', how='left')\n",
    "\n",
    "print(df.info())\n",
    "print(df.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "48712034",
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "File ../../data/industry_data.h5 does not exist",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mFileNotFoundError\u001b[39m                         Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m l2_df = \u001b[43mread_and_merge_h5_data\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m../../data/industry_data.h5\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mindustry_data\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m      2\u001b[39m \u001b[43m                               \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mts_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43ml2_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43min_date\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m      3\u001b[39m \u001b[43m                               \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mts_code\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjoin\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mleft\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m      4\u001b[39m df = merge_with_industry_data(df, l2_df)\n\u001b[32m      5\u001b[39m df = df.rename(columns={\u001b[33m'\u001b[39m\u001b[33ml2_code\u001b[39m\u001b[33m'\u001b[39m: \u001b[33m'\u001b[39m\u001b[33mcat_l2_code\u001b[39m\u001b[33m'\u001b[39m})\n",
      "\u001b[36mFile \u001b[39m\u001b[32m/mnt/d/PyProject/NewStock/main/utils/utils.py:14\u001b[39m, in \u001b[36mread_and_merge_h5_data\u001b[39m\u001b[34m(h5_filename, key, columns, df, join, on, prefix)\u001b[39m\n\u001b[32m     11\u001b[39m         processed_columns.append(col)\n\u001b[32m     13\u001b[39m \u001b[38;5;66;03m# 从 HDF5 文件读取数据，选择需要的列\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m data = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_hdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mh5_filename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprocessed_columns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     16\u001b[39m \u001b[38;5;66;03m# 修改列名，如果列名以前有 _，加上 _\u001b[39;00m\n\u001b[32m     17\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m data.columns:\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/stock/lib/python3.13/site-packages/pandas/io/pytables.py:424\u001b[39m, in \u001b[36mread_hdf\u001b[39m\u001b[34m(path_or_buf, key, mode, errors, where, start, stop, columns, iterator, chunksize, **kwargs)\u001b[39m\n\u001b[32m    421\u001b[39m     exists = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m    423\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m exists:\n\u001b[32m--> \u001b[39m\u001b[32m424\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFile \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_buf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m does not exist\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m    426\u001b[39m store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)\n\u001b[32m    427\u001b[39m \u001b[38;5;66;03m# can't auto open/close if we are using an iterator\u001b[39;00m\n\u001b[32m    428\u001b[39m \u001b[38;5;66;03m# so delegate to the iterator\u001b[39;00m\n",
      "\u001b[31mFileNotFoundError\u001b[39m: File ../../data/industry_data.h5 does not exist"
     ]
    }
   ],
   "source": [
    "\n",
    "l2_df = read_and_merge_h5_data('../../data/industry_data.h5', key='industry_data',\n",
    "                               columns=['ts_code', 'l2_code', 'in_date'],\n",
    "                               df=None, on=['ts_code'], join='left')\n",
    "df = merge_with_industry_data(df, l2_df)\n",
    "df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
    "# df = df.merge(index_data, on='trade_date', how='left')\n",
    "\n",
    "days = 5\n",
    "df = df.sort_values(by=['ts_code', 'trade_date'])\n",
    "# df['future_return'] = df.groupby('ts_code', group_keys=False)['close'].apply(lambda x: x.shift(-days) / x - 1)\n",
    "df['future_return'] = (df.groupby('ts_code')['close'].shift(-days) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
    "                      df.groupby('ts_code')['open'].shift(-1)\n",
    "# df['future_return'] = df.groupby('ts_code')['pct_chg'].shift(-1)\n",
    "df['future_return2'] = (df.groupby('ts_code')['close'].shift(-1) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
    "                       df.groupby('ts_code')['open'].shift(-1)\n",
    "\n",
    "df['future_volatility'] = (\n",
    "    df.groupby('ts_code')['pct_chg']\n",
    "    .transform(lambda x: x.rolling(days).std().shift(-days))\n",
    ")\n",
    "df['future_score'] = calculate_score(df, days=days, lambda_param=0.3)\n",
    "\n",
    "\n",
    "def select_pre_zt_stocks_dynamic(stock_df):\n",
    "    \"\"\"Keep, for every trade_date, the (up to) 1000 rows with the smallest total_mv.\"\"\"\n",
    "    per_day = stock_df.groupby('trade_date', group_keys=False)\n",
    "    return per_day.apply(lambda day_rows: day_rows.nsmallest(1000, 'total_mv'))\n",
    "\n",
    "\n",
    "# Run a garbage-collection pass before loading more data.\n",
    "gc.collect()\n",
    "\n",
    "# Disabled: restrict df to a date range and to the smallest-cap rows per day.\n",
    "# df = select_pre_zt_stocks_dynamic(df[(df['trade_date'] >= '2022-01-01') & (df['trade_date'] <= '2029-04-07')])\n",
    "\n",
    "# Attach the industry-level columns by (cat_l2_code, trade_date); the left join keeps every row of df,\n",
    "# so rows without a matching industry record get NaN in the merged columns.\n",
    "industry_df = read_industry_data('../../data/sw_daily.h5')\n",
    "df = df.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c1dd3d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 
'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_buy_consolidation_20', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'intraday_lg_flow_corr_20', 'cap_neutral_cost_metric', 'in_date', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
     ]
    }
   ],
   "source": [
    "# Exact column names that are never features.\n",
    "excluded_names = ['trade_date', 'ts_code', 'label']\n",
    "# A column is also rejected when its name contains any of these fragments.\n",
    "# Note that these are substring matches: e.g. 'gen' rejects every name containing 'gen'.\n",
    "excluded_fragments = ('future', 'label', 'score', 'gen', 'is_st', 'pe_ttm', 'cat_l2_code')\n",
    "\n",
    "feature_columns = []\n",
    "for col in df.columns:\n",
    "    if col in excluded_names or col.startswith('_'):\n",
    "        continue\n",
    "    if any(fragment in col for fragment in excluded_fragments):\n",
    "        continue\n",
    "    feature_columns.append(col)\n",
    "\n",
    "print(feature_columns)\n",
    "# Numeric features: the float64 / int64 columns that survived the name filter, in frame order.\n",
    "float_or_int_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
    "numeric_columns = [col for col in float_or_int_columns if col in feature_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c60c1ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "每个特征列中的 NaN 值数量（字典形式）：\n",
      "ts_code: 0\n",
      "trade_date: 0\n",
      "open: 0\n",
      "close: 0\n",
      "high: 0\n",
      "low: 0\n",
      "vol: 0\n",
      "pct_chg: 0\n",
      "turnover_rate: 0\n",
      "pe_ttm: 499616\n",
      "circ_mv: 0\n",
      "volume_ratio: 791\n",
      "is_st: 0\n",
      "up_limit: 0\n",
      "down_limit: 0\n",
      "buy_sm_vol: 7\n",
      "sell_sm_vol: 7\n",
      "buy_lg_vol: 7\n",
      "sell_lg_vol: 7\n",
      "buy_elg_vol: 7\n",
      "sell_elg_vol: 7\n",
      "net_mf_vol: 7\n",
      "his_low: 24695\n",
      "his_high: 24695\n",
      "cost_5pct: 24695\n",
      "cost_15pct: 24695\n",
      "cost_50pct: 24695\n",
      "cost_85pct: 24695\n",
      "cost_95pct: 24695\n",
      "weight_avg: 24695\n",
      "winner_rate: 24695\n",
      "lg_elg_net_buy_vol: 7\n",
      "flow_lg_elg_intensity: 7\n",
      "sm_net_buy_vol: 7\n",
      "flow_divergence_diff: 7\n",
      "flow_divergence_ratio: 7\n",
      "total_buy_vol: 7\n",
      "lg_elg_buy_prop: 7\n",
      "flow_struct_buy_change: 3287\n",
      "lg_elg_net_buy_vol_change: 3287\n",
      "flow_lg_elg_accel: 6567\n",
      "chip_concentration_range: 24695\n",
      "chip_skewness: 24695\n",
      "floating_chip_proxy: 24695\n",
      "cost_support_15pct_change: 27855\n",
      "cat_winner_price_zone: 0\n",
      "flow_chip_consistency: 7\n",
      "profit_taking_vs_absorb: 7\n",
      "cat_is_positive: 0\n",
      "upside_vol: 29581\n",
      "downside_vol: 29655\n",
      "vol_ratio: 0\n",
      "return_skew: 13096\n",
      "return_kurtosis: 13096\n",
      "volume_change_rate: 29466\n",
      "cat_volume_breakout: 0\n",
      "turnover_deviation: 6548\n",
      "cat_turnover_spike: 0\n",
      "avg_volume_ratio: 7341\n",
      "cat_volume_ratio_breakout: 0\n",
      "vol_spike: 62074\n",
      "vol_std_5: 16370\n",
      "atr_14: 45836\n",
      "atr_6: 19644\n",
      "obv: 0\n",
      "maobv_6: 16370\n",
      "rsi_3: 9822\n",
      "return_5: 16370\n",
      "return_20: 65315\n",
      "std_return_5: 16370\n",
      "std_return_90: 291770\n",
      "std_return_90_2: 323906\n",
      "act_factor1: 16370\n",
      "act_factor2: 42562\n",
      "act_factor3: 65315\n",
      "act_factor4: 194886\n",
      "rank_act_factor1: 16370\n",
      "rank_act_factor2: 42562\n",
      "rank_act_factor3: 65315\n",
      "log_circ_mv: 0\n",
      "cov: 13096\n",
      "delta_cov: 29466\n",
      "alpha_22_improved: 62074\n",
      "alpha_003: 0\n",
      "alpha_007: 13120\n",
      "alpha_013: 62074\n",
      "cat_up_limit: 0\n",
      "cat_down_limit: 0\n",
      "up_limit_count_10d: 0\n",
      "down_limit_count_10d: 0\n",
      "consecutive_up_limit: 0\n",
      "vol_break: 0\n",
      "weight_roc5: 40531\n",
      "price_cost_divergence: 93280\n",
      "smallcap_concentration: 24695\n",
      "cost_stability: 85077\n",
      "high_cost_break_days: 13096\n",
      "liquidity_risk: 53215\n",
      "turnover_std: 62074\n",
      "mv_volatility: 62074\n",
      "volume_growth: 65315\n",
      "mv_growth: 65315\n",
      "arbr: 9822\n",
      "momentum_factor: 29466\n",
      "resonance_factor: 791\n",
      "log_close: 0\n",
      "cat_vol_spike: 0\n",
      "up: 0\n",
      "down: 0\n",
      "obv_maobv_6: 16370\n",
      "std_return_5_over_std_return_90: 291770\n",
      "std_return_90_minus_std_return_90_2: 323906\n",
      "cat_af2: 0\n",
      "cat_af3: 0\n",
      "cat_af4: 0\n",
      "act_factor5: 194886\n",
      "act_factor6: 42562\n",
      "active_buy_volume_large: 13\n",
      "active_buy_volume_big: 79\n",
      "active_buy_volume_small: 7\n",
      "buy_lg_vol_minus_sell_lg_vol: 8\n",
      "buy_elg_vol_minus_sell_elg_vol: 69\n",
      "ctrl_strength: 24695\n",
      "low_cost_dev: 24695\n",
      "asymmetry: 24701\n",
      "lock_factor: 24695\n",
      "cat_vol_break: 0\n",
      "cost_atr_adj: 69060\n",
      "cat_golden_resonance: 0\n",
      "mv_turnover_ratio: 0\n",
      "mv_adjusted_volume: 0\n",
      "mv_weighted_turnover: 0\n",
      "nonlinear_mv_volume: 0\n",
      "mv_volume_ratio: 791\n",
      "mv_momentum: 791\n",
      "lg_flow_mom_corr_20_60: 1186\n",
      "lg_buy_consolidation_20: 1950902\n",
      "lg_flow_accel: 6567\n",
      "profit_pressure: 24695\n",
      "underwater_resistance: 24695\n",
      "cost_conc_std_20: 29466\n",
      "profit_decay_20: 0\n",
      "vol_amp_loss_20: 53215\n",
      "vol_drop_profit_cnt_5: 0\n",
      "lg_flow_vol_interact_20: 29466\n",
      "cost_break_confirm_cnt_5: 0\n",
      "atr_norm_channel_pos_14: 0\n",
      "turnover_diff_skew_20: 32740\n",
      "lg_sm_flow_diverge_20: 29466\n",
      "pullback_strong_20_20: 0\n",
      "vol_wgt_hist_pos_20: 0\n",
      "vol_adj_roc_20: 0\n",
      "intraday_lg_flow_corr_20: 2431461\n",
      "cap_neutral_cost_metric: 2431461\n",
      "cat_l2_code: 290\n",
      "in_date: 65486\n",
      "future_return: 6548\n",
      "future_return2: 3274\n",
      "future_volatility: 6548\n",
      "score: 6548\n",
      "future_score: 6548\n",
      "industry_obv: 11272\n",
      "industry_return_5: 11272\n",
      "industry_return_20: 11272\n",
      "industry__ema_5: 11272\n",
      "industry__ema_13: 11272\n",
      "industry__ema_20: 11272\n",
      "industry__ema_60: 11272\n",
      "industry_act_factor1: 11272\n",
      "industry_act_factor2: 11272\n",
      "industry_act_factor3: 11272\n",
      "industry_act_factor4: 11272\n",
      "industry_act_factor5: 11272\n",
      "industry_act_factor6: 11272\n",
      "industry_rank_act_factor1: 11272\n",
      "industry_rank_act_factor2: 11272\n",
      "industry_rank_act_factor3: 11272\n",
      "industry_return_5_percentile: 11272\n",
      "industry_return_20_percentile: 11272\n",
      "['open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'circ_mv', 'volume_ratio', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_std_5', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'std_return_5', 'act_factor1', 'rank_act_factor1', 'log_circ_mv', 'cov', 'delta_cov', 'alpha_003', 'alpha_007', 'cat_up_limit', 'cat_down_limit', 'up_limit_count_10d', 'down_limit_count_10d', 'consecutive_up_limit', 'vol_break', 'smallcap_concentration', 'high_cost_break_days', 'arbr', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'cat_af2', 'cat_af3', 'cat_af4', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 
'atr_norm_channel_pos_14', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile']\n"
     ]
    }
   ],
   "source": [
    "def count_nan_and_inf_per_feature(df: pd.DataFrame):\n",
    "  \"\"\"\n",
    "  Count the NaN values in every column of a DataFrame.\n",
    "\n",
    "  Despite the name, Inf values are not counted: the Inf computation below is commented out.\n",
    "\n",
    "  Args:\n",
    "    df: pandas DataFrame to inspect.\n",
    "\n",
    "  Returns:\n",
    "    The result of ``df.isna().sum()``: the NaN count of each column, indexed by column name.\n",
    "  \"\"\"\n",
    "  nan_counts = df.isna().sum()\n",
    "  # Inf counting is disabled:\n",
    "  # inf_counts = np.isinf(df).sum()\n",
    "  return nan_counts\n",
    "\n",
    "\n",
    "# Columns with more NaN values than this are removed from the feature lists.\n",
    "MAX_NAN_COUNT = 30000\n",
    "\n",
    "nan_counts_series = count_nan_and_inf_per_feature(df)\n",
    "nan_counts_dict = nan_counts_series.to_dict()\n",
    "print(\"\\n每个特征列中的 NaN 值数量（字典形式）：\")\n",
    "for k, v in nan_counts_dict.items():\n",
    "  print(f'{k}: {v}')\n",
    "\n",
    "too_sparse = {k for k, v in nan_counts_dict.items() if v > MAX_NAN_COUNT}\n",
    "feature_columns = [col for col in feature_columns if col not in too_sparse]\n",
    "# numeric_columns was derived from feature_columns before this pruning. Re-filter it here,\n",
    "# otherwise the cells that consume numeric_columns still see the sparse columns.\n",
    "numeric_columns = [col for col in numeric_columns if col in feature_columns]\n",
    "print(feature_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e088bd8a357e815a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-13T15:39:47.461434Z",
     "start_time": "2025-04-13T15:39:44.369664Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gen\tnevals\tavg      \tstd     \tmin\tmax    \n",
      "0  \t64    \t-0.387605\t0.492269\t-1 \t0.84339\n",
      "1  \t52    \t-0.0280574\t0.328787\t-1 \t0.84339\n",
      "2  \t56    \t-0.0442643\t0.498012\t-1 \t0.84339\n",
      "3  \t50    \t0.0843881 \t0.506873\t-1 \t0.84339\n",
      "4  \t56    \t0.128797  \t0.586781\t-1 \t0.84339\n",
      "5  \t52    \t0.107366  \t0.586957\t-1 \t0.918103\n",
      "6  \t52    \t0.0602483 \t0.666345\t-1 \t0.918103\n",
      "7  \t54    \t0.177717  \t0.561644\t-1 \t0.918103\n",
      "8  \t57    \t0.206183  \t0.620791\t-1 \t0.957887\n",
      "9  \t51    \t0.253306  \t0.667259\t-1 \t1.07875 \n",
      "10 \t53    \t0.19914   \t0.681541\t-1 \t1.14356 \n",
      "11 \t54    \t0.173093  \t0.752007\t-1 \t1.24408 \n",
      "12 \t53    \t0.429303  \t0.636249\t-1.07606\t1.24408 \n",
      "13 \t46    \t0.443469  \t0.754764\t-1.17052\t1.24408 \n",
      "14 \t57    \t0.412168  \t0.719715\t-1.2066 \t1.24408 \n",
      "15 \t47    \t0.420095  \t0.833547\t-1.20899\t1.25608 \n",
      "16 \t46    \t0.516075  \t0.916347\t-1.16556\t1.25765 \n",
      "17 \t48    \t0.52129   \t0.872883\t-1      \t1.30663 \n",
      "18 \t53    \t0.530992  \t0.923366\t-1      \t1.3677  \n",
      "19 \t54    \t0.569299  \t0.861833\t-1.39138\t1.3677  \n",
      "20 \t51    \t0.538589  \t0.883032\t-1.12472\t1.3677  \n",
      "21 \t49    \t0.684813  \t0.874059\t-1      \t1.3677  \n",
      "22 \t46    \t0.659823  \t0.86879 \t-1.17051\t1.3677  \n",
      "23 \t42    \t0.678971  \t0.886044\t-1.39138\t1.3677  \n",
      "24 \t55    \t0.639381  \t0.905808\t-1.39138\t1.37645 \n",
      "25 \t42    \t0.721136  \t0.915513\t-1.30205\t1.39372 \n",
      "26 \t56    \t0.695918  \t0.849837\t-1.0437 \t1.39372 \n",
      "27 \t56    \t0.465007  \t0.934313\t-1      \t1.39372 \n",
      "28 \t51    \t0.714563  \t0.88635 \t-1.13547\t1.43745 \n",
      "29 \t49    \t0.687478  \t0.84568 \t-1      \t1.43745 \n",
      "30 \t50    \t0.646657  \t0.835957\t-1      \t1.43745 \n",
      "31 \t49    \t0.615978  \t0.939622\t-1.04846\t1.43745 \n",
      "32 \t49    \t0.654171  \t0.973861\t-1.12771\t1.43745 \n",
      "\n",
      "Best Factors Found:\n",
      "Fitness: 1.4375, Factor 1: protected_div_torch(mul(protected_div_torch(add(return_kurtosis, profit_pressure), mul(cost_85pct, buy_elg_vol_minus_sell_elg_vol)), protected_div_torch(cost_break_confirm_cnt_5, pow(cos(lg_flow_vol_interact_20), cos(chip_concentration_range)))), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
      "Fitness: 1.3937, Factor 2: protected_div_torch(mul(protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n",
      "Fitness: 1.3843, Factor 3: protected_div_torch(mul(protected_div_torch(protected_div_torch(profit_pressure, pow(alpha_007, active_buy_volume_big)), delta_cov), protected_div_torch(protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)), rank_act_factor2)), sub(add(obv, protected_div_torch(cost_break_confirm_cnt_5, cos(chip_skewness))), add(obv, protected_div_torch(add(return_kurtosis, profit_pressure), pow(alpha_007, active_buy_volume_big)))))\n"
     ]
    }
   ],
   "source": [
    "from deap import creator, gp, tools, base, algorithms\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from scipy.stats import spearmanr\n",
    "import operator\n",
    "\n",
    "# Protected division (PyTorch version).\n",
    "def protected_div_torch(left, right):\n",
    "    \"\"\"Element-wise left / right; positions where right == 0 yield 1 instead.\"\"\"\n",
    "    denominator_ok = right != 0\n",
    "    quotient = left / right\n",
    "    fallback = torch.ones_like(left)\n",
    "    return torch.where(denominator_ok, quotient, fallback)\n",
    "\n",
    "def generate_deap_factors_pytorch_v2(df: pd.DataFrame, numeric_columns: list, target_column: str = 'future_return', date_column: str = 'trade_date', params: dict = None, random_state: int = 42):\n",
    "    \"\"\"\n",
    "    Evolve new factor expressions with DEAP genetic programming, evaluated through PyTorch operators.\n",
    "\n",
    "    Each individual is an expression tree over ``numeric_columns``. Its fitness is the IC Sharpe\n",
    "    ratio: mean / std of the per-date Spearman rank correlation between the factor values and\n",
    "    ``target_column``. Rows whose factor value or target is NaN are dropped before the per-date\n",
    "    correlation is computed.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): Frame holding the input columns, the target and the date column. It is only read.\n",
    "        numeric_columns (list): Names of the numeric columns used as terminals of the expressions.\n",
    "        target_column (str): Name of the target column, 'future_return' by default.\n",
    "        date_column (str): Name of the column that groups rows into cross-sections, 'trade_date' by default.\n",
    "        params (dict): Optional overrides for 'population_size', 'generations', 'crossover_probability',\n",
    "            'mutation_probability', 'tournament_size' and 'hall_of_fame_size'.\n",
    "        random_state (int): Seed applied to Python's ``random``, NumPy and PyTorch.\n",
    "\n",
    "    Returns:\n",
    "        tuple: ``(hof, stats)`` - the ``tools.HallOfFame`` holding the best individuals and the\n",
    "        ``tools.Statistics`` object that was passed to the evolution loop.\n",
    "    \"\"\"\n",
    "    import random  # local import: the cell-level imports do not include it\n",
    "\n",
    "    if params is None:\n",
    "        params = {}\n",
    "\n",
    "    # Seed every generator involved. DEAP's generators, selection and variation operators draw\n",
    "    # from the stdlib `random` module, so seeding NumPy and PyTorch alone is not enough.\n",
    "    random.seed(random_state)\n",
    "    np.random.seed(random_state)\n",
    "    torch.manual_seed(random_state)\n",
    "\n",
    "    # 1. Primitive set built from PyTorch operators.\n",
    "    pset_torch = gp.PrimitiveSet(\"PYTORCH\", arity=len(numeric_columns))\n",
    "    pset_torch.addPrimitive(torch.add, 2)\n",
    "    pset_torch.addPrimitive(torch.sub, 2)\n",
    "    pset_torch.addPrimitive(torch.mul, 2)\n",
    "    pset_torch.addPrimitive(protected_div_torch, 2)  # division that yields 1 where the denominator is 0\n",
    "    pset_torch.addPrimitive(torch.sin, 1)\n",
    "    pset_torch.addPrimitive(torch.cos, 1)\n",
    "    pset_torch.addPrimitive(torch.pow, 2)  # x raised to the power y\n",
    "\n",
    "    # Expose every input column as a terminal named after the column.\n",
    "    pset_torch.renameArguments(**{f\"ARG{i}\": col for i, col in enumerate(numeric_columns)})\n",
    "\n",
    "    # 2. Fitness (maximised) and individual types. creator.create registers the classes on the\n",
    "    #    creator module, so create them only once to keep repeated calls free of overwrite warnings.\n",
    "    if not hasattr(creator, \"FitnessMax\"):\n",
    "        creator.create(\"FitnessMax\", base.Fitness, weights=(1.0,))\n",
    "    if not hasattr(creator, \"Individual\"):\n",
    "        creator.create(\"Individual\", gp.PrimitiveTree, fitness=creator.FitnessMax)\n",
    "\n",
    "    # 3. Toolbox. min_/max_ bound the depth of freshly generated expressions.\n",
    "    toolbox = base.Toolbox()\n",
    "    toolbox.register(\"expr_torch\", gp.genHalfAndHalf, pset=pset_torch, min_=1, max_=3)\n",
    "    toolbox.register(\"individual\", tools.initIterate, creator.Individual, toolbox.expr_torch)\n",
    "    toolbox.register(\"population\", tools.initRepeat, list, toolbox.individual)\n",
    "    toolbox.register(\"compile_torch\", gp.compile, pset=pset_torch)\n",
    "\n",
    "    # Input data as tensors covering every row of df (all dates, all stocks).\n",
    "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "    data_tensor_all = torch.from_numpy(df[numeric_columns].values).float().to(device)\n",
    "    target_tensor_all = torch.from_numpy(df[target_column].values).float().to(device)\n",
    "    dates_all = df[date_column].values\n",
    "\n",
    "    # 4. Fitness function: IC Sharpe ratio of the compiled expression. Failures return -1.0.\n",
    "    def evaluate_torch_cuda_ic_sharpe(individual, data_tensor_all, target_tensor_all, dates_all):\n",
    "        # Compile the expression tree into a callable taking one tensor per input column.\n",
    "        func_torch = toolbox.compile_torch(expr=individual)\n",
    "\n",
    "        try:\n",
    "            # Evaluate the factor for all rows at once; every argument is an (N, 1) column slice.\n",
    "            factor_values_tensor = func_torch(*torch.split(data_tensor_all, 1, dim=1))\n",
    "            if factor_values_tensor.ndim > 1 and factor_values_tensor.shape[1] != 1:\n",
    "                print(f\"警告: 因子表达式输出张量维度为 {factor_values_tensor.shape}，期望 (N, 1)。\")\n",
    "                return (-1.0,)\n",
    "            factor_values_tensor = factor_values_tensor.flatten()\n",
    "\n",
    "            # Move the results to NumPy and line them up with the dates for per-date grouping.\n",
    "            temp_df = pd.DataFrame({\n",
    "                'date': dates_all,\n",
    "                'factor_value': factor_values_tensor.cpu().numpy(),\n",
    "                'target_value': target_tensor_all.cpu().numpy().flatten()\n",
    "            })\n",
    "\n",
    "            # spearmanr propagates NaN by default, so a single missing value would turn the\n",
    "            # whole date's IC into NaN. Drop incomplete rows before grouping.\n",
    "            temp_df = temp_df.dropna(subset=['factor_value', 'target_value'])\n",
    "            if temp_df.empty:\n",
    "                return (-1.0,)\n",
    "\n",
    "            # Daily rank IC. Dates with fewer than two valid rows, or with an undefined\n",
    "            # correlation, yield NaN and are removed.\n",
    "            daily_ics = temp_df.groupby('date').apply(\n",
    "                lambda x: spearmanr(x['factor_value'], x['target_value'])[0]\n",
    "                if len(x) >= 2\n",
    "                else np.nan\n",
    "            ).dropna()\n",
    "\n",
    "            # Too few daily ICs to form a meaningful ratio.\n",
    "            if len(daily_ics) < 5:\n",
    "                return (-1.0,)\n",
    "\n",
    "            ic_mean = daily_ics.mean()\n",
    "            ic_std = daily_ics.std()\n",
    "\n",
    "            if ic_std == 0:\n",
    "                # Constant daily IC: a positive mean is rewarded with a very large fitness.\n",
    "                ic_sharpe = ic_mean * 1e6 if ic_mean > 0 else -1.0\n",
    "            else:\n",
    "                ic_sharpe = ic_mean / ic_std\n",
    "\n",
    "            # A NaN ratio is mapped to the failure fitness.\n",
    "            return (ic_sharpe if not np.isnan(ic_sharpe) else -1.0,)\n",
    "\n",
    "        except (ValueError, TypeError, ZeroDivisionError, RuntimeError) as e:\n",
    "            # Report the failing individual and give it the failure fitness.\n",
    "            print(f\"Error during evaluation for individual {individual}: {e}\")\n",
    "            return (-1.0,)\n",
    "\n",
    "    # Bind the data to the evaluation function and register the genetic operators.\n",
    "    toolbox.register(\"evaluate\", evaluate_torch_cuda_ic_sharpe, data_tensor_all=data_tensor_all, target_tensor_all=target_tensor_all, dates_all=dates_all)\n",
    "    toolbox.register(\"select\", tools.selTournament, tournsize=params.get('tournament_size', 3))\n",
    "    toolbox.register(\"mate\", gp.cxOnePointLeafBiased, termpb=0.2)\n",
    "    toolbox.register(\"mutate\", gp.mutUniform, expr=toolbox.expr_torch, pset=pset_torch)\n",
    "\n",
    "    # Offspring whose tree height exceeds this limit are rejected by gp.staticLimit.\n",
    "    MAX_TREE_DEPTH = 5\n",
    "\n",
    "    toolbox.decorate(\"mate\", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))\n",
    "    toolbox.decorate(\"mutate\", gp.staticLimit(key=operator.attrgetter('height'), max_value=MAX_TREE_DEPTH))\n",
    "\n",
    "    # 5. Evolution parameters, with defaults for anything missing from params.\n",
    "    population_size = params.get('population_size', 100)\n",
    "    generations = params.get('generations', 10)\n",
    "    crossover_probability = params.get('crossover_probability', 0.7)\n",
    "    mutation_probability = params.get('mutation_probability', 0.3)\n",
    "\n",
    "    # 6. Initial population, hall of fame and per-generation statistics.\n",
    "    pop = toolbox.population(n=population_size)\n",
    "    hof = tools.HallOfFame(params.get('hall_of_fame_size', 5))\n",
    "    stats = tools.Statistics(lambda ind: ind.fitness.values)\n",
    "    stats.register(\"avg\", np.mean)\n",
    "    stats.register(\"std\", np.std)\n",
    "    stats.register(\"min\", np.min)\n",
    "    stats.register(\"max\", np.max)\n",
    "\n",
    "    # 7. Run the evolution loop.\n",
    "    algorithms.eaSimple(pop, toolbox, cxpb=crossover_probability, mutpb=mutation_probability, ngen=generations,\n",
    "                        stats=stats, halloffame=hof, verbose=True)\n",
    "\n",
    "    # 8. Hand back the best individuals together with the statistics object.\n",
    "    return hof, stats\n",
    "\n",
    "# Evolution settings passed to generate_deap_factors_pytorch_v2.\n",
    "params = {\n",
    "    'population_size': 64,\n",
    "    'generations': 32,\n",
    "    'crossover_probability': 0.7,\n",
    "    'mutation_probability': 0.3,\n",
    "    'tournament_size': 4,\n",
    "    'hall_of_fame_size': 3\n",
    "}\n",
    "\n",
    "# generate_deap_factors_pytorch_v2 only reads df, so pass it directly instead of\n",
    "# duplicating the whole frame in memory with df.copy().\n",
    "best_factors_hof, stats = generate_deap_factors_pytorch_v2(df, numeric_columns, params=params)\n",
    "\n",
    "print(\"\\nBest Factors Found:\")\n",
    "for i, ind in enumerate(best_factors_hof):\n",
    "    fitness_value = ind.fitness.values[0]  # fitness values form a 1-tuple\n",
    "    print(f\"Fitness: {fitness_value:.4f}, Factor {i+1}: {ind}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0b3d7551ef0c81f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-04-13T15:39:47.502867Z",
     "start_time": "2025-04-13T15:39:47.461434Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "全面因子分析报告 - 特征因子: 'generated_factor'\n",
      "------------------------------------------------------------\n",
      "整体 Rank IC: 0.0817\n",
      "整体 P-value: 0.0000\n",
      "------------------------------------------------------------\n",
      "计算滚动 Rank IC (按 'D' 窗口)...\n",
      "滚动 Rank IC 统计量 (D):\n",
      "  均值: 0.0124\n",
      "  标准差: 0.2330\n",
      "  夏普比率 (IC Mean / IC Std): 0.0531\n",
      "  T-statistic: 1.4577\n",
      "  T-statistic P-value: 0.1453\n",
      "------------------------------------------------------------\n",
      "Hit Ratio (正向 Rank IC 比例): 0.5060\n",
      "------------------------------------------------------------\n",
      "因子 10 分位数分析 (按因子值从小到大排序):\n",
      "  第 1 分位数: 平均 'future_return' = -0.0004\n",
      "  第 2 分位数: 平均 'future_return' = -0.0008\n",
      "  第 3 分位数: 平均 'future_return' = -0.0004\n",
      "  第 4 分位数: 平均 'future_return' = 0.0005\n",
      "  第 5 分位数: 平均 'future_return' = 0.0007\n",
      "  第 6 分位数: 平均 'future_return' = 0.0015\n",
      "  第 7 分位数: 平均 'future_return' = 0.0021\n",
      "  第 8 分位数: 平均 'future_return' = 0.0033\n",
      "  第 9 分位数: 平均 'future_return' = 0.0054\n",
      "  第 10 分位数: 平均 'future_return' = 0.0135\n",
      "\n",
      "因子值的分位数范围:\n",
      "  第 1 分位数: [-1.0490, 0.0581]\n",
      "  第 2 分位数: [0.0581, 0.1051]\n",
      "  第 3 分位数: [0.1051, 0.1458]\n",
      "  第 4 分位数: [0.1458, 0.1881]\n",
      "  第 5 分位数: [0.1881, 0.2354]\n",
      "  第 6 分位数: [0.2354, 0.2909]\n",
      "  第 7 分位数: [0.2909, 0.3594]\n",
      "  第 8 分位数: [0.3594, 0.4505]\n",
      "  第 9 分位数: [0.4505, 0.5880]\n",
      "  第 10 分位数: [0.5880, 1.9782]\n",
      "------------------------------------------------------------\n",
      "分析完成。\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "\n",
    "# Name of the label column.\n",
    "# NOTE(review): not referenced in the part of this cell shown here - presumably used by the analysis call further down; confirm.\n",
    "target_column = 'future_return'\n",
    "# Protected division (PyTorch version), re-declared so this cell does not depend on another cell's definition.\n",
    "def protected_div_torch(left, right):\n",
    "    \"\"\"Element-wise left / right; positions where right == 0 yield 1 instead.\"\"\"\n",
    "    denominator_ok = right != 0\n",
    "    quotient = left / right\n",
    "    fallback = torch.ones_like(left)\n",
    "    return torch.where(denominator_ok, quotient, fallback)\n",
    "\n",
    "def protected_div_np(left, right):\n",
    "    \"\"\"Element-wise left / right as a NumPy array; positions where right == 0 become NaN.\"\"\"\n",
    "    denominator_ok = right != 0\n",
    "    quotient = left / right\n",
    "    nan_fill = np.ones_like(left) * np.nan  # NaN marks a division by zero\n",
    "    return np.where(denominator_ok, quotient, nan_fill)\n",
    "\n",
    "def calculate_factor_4(df: pd.DataFrame) -> pd.Series:\n",
    "    \"\"\"\n",
    "    Evaluate one factor expression with NumPy/pandas:\n",
    "    sub(add(add(protected_div_torch(pow(pct_chg, std_return_90), cost_95pct), protected_div_torch(industry_act_factor6, cost_95pct)), pow(protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct), protected_div_torch(protected_div_torch(act_factor6, cost_95pct), cost_95pct))), cos(industry_act_factor1)).\n",
    "\n",
    "    Every division goes through protected_div_np rather than protected_div_torch.\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): Frame with the columns pct_chg, std_return_90, cost_95pct,\n",
    "            industry_act_factor6, act_factor6 and industry_act_factor1.\n",
    "\n",
    "    Returns:\n",
    "        pd.Series: The factor values.\n",
    "    \"\"\"\n",
    "    pct_chg, std_return_90, cost_95pct, industry_act_factor6, act_factor6, industry_act_factor1 = (\n",
    "        df[name] for name in ('pct_chg', 'std_return_90', 'cost_95pct',\n",
    "                              'industry_act_factor6', 'act_factor6', 'industry_act_factor1')\n",
    "    )\n",
    "\n",
    "    # pow(pct_chg, std_return_90) / cost_95pct\n",
    "    price_term = protected_div_np(np.power(pct_chg, std_return_90), cost_95pct)\n",
    "\n",
    "    # industry_act_factor6 / cost_95pct\n",
    "    industry_term = protected_div_np(industry_act_factor6, cost_95pct)\n",
    "\n",
    "    # x ** x with x = (act_factor6 / cost_95pct) / cost_95pct: base and exponent are the same expression.\n",
    "    scaled_act = protected_div_np(protected_div_np(act_factor6, cost_95pct), cost_95pct)\n",
    "    self_power_term = np.power(scaled_act, scaled_act)\n",
    "\n",
    "    # Sum of the three terms minus cos(industry_act_factor1).\n",
    "    return (price_term + industry_term + self_power_term) - np.cos(industry_act_factor1)\n",
    "\n",
    "df['generated_factor'] = calculate_factor_4(df)\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.stats import spearmanr, ttest_1samp\n",
    "\n",
    "def comprehensive_factor_analysis(df: pd.DataFrame, factor_column: str, target_column: str = 'future_return', date_column: str = 'trade_date', rolling_window: str = 'D', n_deciles: int = 10):\n",
    "    \"\"\"\n",
    "    对 DataFrame 中的一个特征因子进行全面分析。\n",
    "\n",
    "    Args:\n",
    "        df (pd.DataFrame): 包含因子和目标变量的数据框。\n",
    "        factor_column (str): 要分析的特征因子的列名。\n",
    "        target_column (str): 目标变量的列名，默认为 'future_return'。\n",
    "        date_column (str): 包含日期信息的列名，默认为 'trade_date'。\n",
    "        rolling_window (str): 滚动 Rank IC 的时间窗口（例如 'D' 表示按天，'W' 表示按周）。\n",
    "        n_deciles (int): 进行分位数分析时使用的分位数数量，默认为 10。\n",
    "    \"\"\"\n",
    "    if factor_column not in df.columns:\n",
    "        print(f\"错误: 特征因子列 '{factor_column}' 不存在于 DataFrame 中。\")\n",
    "        return\n",
    "    if target_column not in df.columns:\n",
    "        print(f\"错误: 目标列 '{target_column}' 不存在于 DataFrame 中。\")\n",
    "        return\n",
    "    if date_column not in df.columns:\n",
    "        print(f\"错误: 日期列 '{date_column}' 不存在于 DataFrame 中。\")\n",
    "        return\n",
    "\n",
    "    # 确保日期列是 datetime 类型并设置为索引\n",
    "    df_analy = df.copy()\n",
    "    df_analy[date_column] = pd.to_datetime(df_analy[date_column])\n",
    "    df_analy = df_analy.set_index(date_column)\n",
    "\n",
    "    # 移除因子或目标变量为 NaN 的行\n",
    "    df_analy = df_analy.dropna(subset=[factor_column, target_column])\n",
    "\n",
    "    if len(df_analy) < 2:\n",
    "        print(\"警告: 有效数据点太少，无法进行分析。\")\n",
    "        return\n",
    "\n",
    "    print(f\"全面因子分析报告 - 特征因子: '{factor_column}'\")\n",
    "    print(\"-\" * 60)\n",
    "\n",
    "    # 1. 计算整体 Rank IC\n",
    "    overall_rank_ic, overall_p_value = spearmanr(df_analy[factor_column], df_analy[target_column])\n",
    "    print(f\"整体 Rank IC: {overall_rank_ic:.4f}\")\n",
    "    print(f\"整体 P-value: {overall_p_value:.4f}\")\n",
    "    print(\"-\" * 60)\n",
    "\n",
    "    # 2. 计算滚动 Rank IC (按指定时间窗口)\n",
    "    print(f\"计算滚动 Rank IC (按 '{rolling_window}' 窗口)...\")\n",
    "    rolling_ics = df_analy.groupby(df_analy.index.to_period(rolling_window)).apply(\n",
    "        lambda x: spearmanr(x[factor_column], x[target_column])[0] if len(x) >= 2 else np.nan\n",
    "    ).dropna()\n",
    "\n",
    "    if len(rolling_ics) < 2:\n",
    "        print(\"警告: 滚动 Rank IC 有效周期太少，无法计算统计量。\")\n",
    "    else:\n",
    "        # 3. 滚动 IC 统计量\n",
    "        ic_mean = rolling_ics.mean()\n",
    "        ic_std = rolling_ics.std()\n",
    "        ic_sharpe = ic_mean / ic_std if ic_std != 0 else np.nan\n",
    "        t_statistic, p_value_t = ttest_1samp(rolling_ics, 0) # 检验均值是否显著不为零\n",
    "\n",
    "        print(f\"滚动 Rank IC 统计量 ({rolling_window}):\")\n",
    "        print(f\"  均值: {ic_mean:.4f}\")\n",
    "        print(f\"  标准差: {ic_std:.4f}\")\n",
    "        print(f\"  夏普比率 (IC Mean / IC Std): {ic_sharpe:.4f}\")\n",
    "        print(f\"  T-statistic: {t_statistic:.4f}\")\n",
    "        print(f\"  T-statistic P-value: {p_value_t:.4f}\")\n",
    "        print(\"-\" * 60)\n",
    "\n",
    "        # 4. Hit Ratio (正向 Rank IC 的比例)\n",
    "        hit_ratio = (rolling_ics > 0).sum() / len(rolling_ics)\n",
    "        print(f\"Hit Ratio (正向 Rank IC 比例): {hit_ratio:.4f}\")\n",
    "        print(\"-\" * 60)\n",
    "\n",
    "    # 5. 分位数分析 (在整个数据集上进行)\n",
    "    print(f\"因子 {n_deciles} 分位数分析 (按因子值从小到大排序):\")\n",
    "    df_analy['decile'] = pd.qcut(df_analy[factor_column], q=n_deciles, labels=False, duplicates='drop')\n",
    "    decile_analysis = df_analy.groupby('decile')[target_column].mean().sort_index()\n",
    "\n",
    "    if len(decile_analysis) > 0:\n",
    "         for decile, avg_return in decile_analysis.items():\n",
    "             print(f\"  第 {decile + 1} 分位数: 平均 '{target_column}' = {avg_return:.4f}\")\n",
    "\n",
    "         # 打印每个分位数的因子值范围\n",
    "         percentiles = np.linspace(0, 100, n_deciles + 1)\n",
    "         factor_percentiles = df_analy[factor_column].quantile(percentiles / 100)\n",
    "         print(\"\\n因子值的分位数范围:\")\n",
    "         # 修复分位数范围打印的 KeyError\n",
    "         for i in range(len(decile_analysis)): # 确保只打印实际存在的分位数\n",
    "             lower_bound = factor_percentiles[percentiles[i] / 100]\n",
    "             upper_bound = factor_percentiles[percentiles[i+1] / 100]\n",
    "             print(f\"  第 {i + 1} 分位数: [{lower_bound:.4f}, {upper_bound:.4f}]\")\n",
    "    else:\n",
    "        print(\"警告: 分位数分析无法执行，可能是因为数据点不足或因子值分布问题。\")\n",
    "\n",
    "\n",
    "    print(\"-\" * 60)\n",
    "    print(\"分析完成。\")\n",
    "\n",
    "comprehensive_factor_analysis(df, factor_column='generated_factor', target_column='future_return', date_column='trade_date', rolling_window='D', n_deciles=10)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "stock",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}