{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-09-04T03:09:51.064325Z",
     "start_time": "2025-09-04T03:09:50.789242Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import talib as ta  # Make sure TA-Lib is installed: pip install TA-Lib\n",
    "import statsmodels.api as sm\n",
    "\n",
    "import warnings\n",
    "\n",
    "# 忽略所有警告\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "# --- 0. Configure your file path ---\n",
    "# Please replace 'your_futures_data.csv' with the actual path to your CSV file\n",
    "file_path = '/mnt/d/PyProject/NewQuant/data/data/KQ_m@SHFE_rb/KQ_m@SHFE_rb_min15.csv'\n"
   ],
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-04T03:09:51.149053Z",
     "start_time": "2025-09-04T03:09:51.081682Z"
    }
   },
   "cell_type": "code",
   "source": [
    "\n",
    "# --- 1. Data Loading and Preprocessing ---\n",
    "def load_and_preprocess_data(file_path):\n",
    "    \"\"\"\n",
    "    Loads historical futures data and performs basic preprocessing.\n",
    "    Assumes data contains 'datetime', 'open', 'high', 'low', 'close', 'volume' columns.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        df = pd.read_csv(file_path, parse_dates=['datetime'], index_col='datetime')\n",
    "        # Ensure data is sorted by time\n",
    "        df = df.sort_index()\n",
    "        # Check and handle missing values\n",
    "        initial_rows = len(df)\n",
    "        df.dropna(inplace=True)\n",
    "        if len(df) < initial_rows:\n",
    "            print(f\"Warning: Missing values found in data, deleted {initial_rows - len(df)} rows.\")\n",
    "\n",
    "        # Check if necessary columns exist\n",
    "        required_columns = ['open', 'high', 'low', 'close', 'volume']\n",
    "        if not all(col in df.columns for col in required_columns):\n",
    "            raise ValueError(f\"CSV file is missing required columns. Please ensure it contains: {required_columns}\")\n",
    "\n",
    "        print(f\"Successfully loaded {len(df)} rows of data.\")\n",
    "        print(\"First 5 rows of data:\")\n",
    "        print(df.head())\n",
    "        return df\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Error: File '{file_path}' not found. Please check the path.\")\n",
    "        return None\n",
    "    except Exception as e:\n",
    "        print(f\"Error during data loading or preprocessing: {e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "df_raw = load_and_preprocess_data(file_path)"
   ],
   "id": "548c68daa68af8c1",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully loaded 25470 rows of data.\n",
      "First 5 rows of data:\n",
      "                       open    high     low   close    volume    open_oi  \\\n",
      "datetime                                                                   \n",
      "2020-12-31 14:45:00  4352.0  4400.0  4345.0  4388.0  213731.0  1221661.0   \n",
      "2021-01-04 09:00:00  4356.0  4368.0  4309.0  4336.0  338332.0  1217327.0   \n",
      "2021-01-04 09:15:00  4336.0  4342.0  4307.0  4318.0  144479.0  1197881.0   \n",
      "2021-01-04 09:30:00  4318.0  4329.0  4312.0  4317.0   85679.0  1194567.0   \n",
      "2021-01-04 09:45:00  4317.0  4338.0  4316.0  4338.0   66461.0  1194592.0   \n",
      "\n",
      "                      close_oi underlying_symbol  \n",
      "datetime                                          \n",
      "2020-12-31 14:45:00  1217327.0       SHFE.rb2105  \n",
      "2021-01-04 09:00:00  1197881.0       SHFE.rb2105  \n",
      "2021-01-04 09:15:00  1194567.0       SHFE.rb2105  \n",
      "2021-01-04 09:30:00  1194592.0       SHFE.rb2105  \n",
      "2021-01-04 09:45:00  1198035.0       SHFE.rb2105  \n"
     ]
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-04T03:12:29.108246Z",
     "start_time": "2025-09-04T03:12:26.136815Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import talib\n",
    "from scipy.stats import jarque_bera, bootstrap\n",
    "from statsmodels.tsa.stattools import adfuller, acf\n",
    "from statsmodels.stats.diagnostic import het_arch\n",
    "\n",
    "\n",
    "def rsi_stats(df_raw: pd.DataFrame,\n",
    "              period: int = 14,\n",
    "              lookback: int = 252 * 4):\n",
    "    \"\"\"\n",
    "    返回 RSI 多维度统计量\n",
    "    \"\"\"\n",
    "    rsi = talib.RSI(df_raw['close'], timeperiod=period)\n",
    "    rsi = rsi[~np.isnan(rsi)]\n",
    "    if len(rsi) < 100:\n",
    "        raise ValueError('RSI 样本不足')\n",
    "\n",
    "    # 基本描述\n",
    "    stats = {\n",
    "        'period': period,\n",
    "        'mean': np.mean(rsi),\n",
    "        'std': np.std(rsi, ddof=1),\n",
    "        'skew': pd.Series(rsi).skew(),\n",
    "        'kurt': pd.Series(rsi).kurtosis(),\n",
    "        'q10': np.percentile(rsi, 10),\n",
    "        'q90': np.percentile(rsi, 90),\n",
    "    }\n",
    "\n",
    "    # 均值回归 / 动量\n",
    "    adf_res = adfuller(rsi, regression='c')\n",
    "    stats['adf_stat'] = adf_res[0]\n",
    "    stats['adf_p']    = adf_res[1]\n",
    "    stats['acf_1'] = acf(rsi, nlags=1, fft=False)[1]\n",
    "\n",
    "    # 波动聚集（RSI 变化率的 ARCH）\n",
    "    delta_rsi = np.diff(rsi)\n",
    "    arch_res = het_arch(delta_rsi)\n",
    "    arch_lm, arch_p = arch_res[0], arch_res[1]\n",
    "\n",
    "    stats['arch_lm'] = arch_lm\n",
    "    stats['arch_p'] = arch_p\n",
    "\n",
    "    # 重尾指数（Hill）\n",
    "    tail = sorted(rsi)[-int(np.ceil(0.05 * len(rsi))):]\n",
    "    stats['hill_tail'] = len(tail) / np.sum(np.log(tail) - np.log(min(tail)))\n",
    "\n",
    "    # 均值回归区间收益\n",
    "    extreme_low = stats['q10']\n",
    "    extreme_high = stats['q90']\n",
    "    # 事件：RSI 穿越极值后 5-bar 收益\n",
    "    events = []\n",
    "    for i in range(1, len(rsi)):\n",
    "        if rsi[i - 1] < extreme_low and rsi[i] >= extreme_low:\n",
    "            events.append(i)\n",
    "        elif rsi[i - 1] > extreme_high and rsi[i] <= extreme_high:\n",
    "            events.append(-i)\n",
    "\n",
    "    if len(events) > 20:\n",
    "        rets = []\n",
    "        for idx in events:\n",
    "            idx_abs = abs(idx)\n",
    "            if idx_abs + 5 >= len(rsi):\n",
    "                continue\n",
    "            direc = 1 if idx > 0 else -1\n",
    "            start = df_raw['close'].iloc[idx_abs]\n",
    "            end = df_raw['close'].iloc[idx_abs + 5]\n",
    "            rets.append((end - start) * direc)\n",
    "        rets = np.array(rets)\n",
    "        stats['mean_ret_5'] = np.mean(rets)\n",
    "        stats['win_rate_5'] = np.mean(rets > 0)\n",
    "        stats['events_5'] = len(rets)\n",
    "    else:\n",
    "        stats.update({'mean_ret_5': np.nan, 'win_rate_5': np.nan, 'events_5': 0})\n",
    "\n",
    "    return pd.Series(stats)\n",
    "\n",
    "\n",
    "# ---------- 一键 ----------\n",
    "for p in [6, 14, 21]:\n",
    "    try:\n",
    "        res = rsi_stats(df_raw, period=p)\n",
    "        print(res.to_frame().T.to_string(index=False))\n",
    "    except ValueError as e:\n",
    "        e.with_traceback().print_exc()"
   ],
   "id": "c566a4757b4f6456",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " period      mean       std      skew      kurt       q10       q90   adf_stat  adf_p    acf_1    arch_lm       arch_p  hill_tail  mean_ret_5  win_rate_5  events_5\n",
      "    6.0 50.076761 18.780546 -0.007016 -0.633046 25.069754 75.082648 -39.303166    0.0 0.848566 360.978412 1.861253e-71   16.95949  -36.215042    0.005014    1795.0\n",
      " period      mean       std    skew      kurt       q10       q90   adf_stat  adf_p    acf_1    arch_lm       arch_p  hill_tail  mean_ret_5  win_rate_5  events_5\n",
      "   14.0 50.091278 12.856068 -0.0578 -0.260539 32.889392 66.554807 -27.311287    0.0 0.933965 102.255697 1.925344e-17  16.462544  -16.821584    0.240209    1149.0\n",
      " period      mean       std      skew      kurt       q10       q90   adf_stat  adf_p    acf_1   arch_lm       arch_p  hill_tail  mean_ret_5  win_rate_5  events_5\n",
      "   21.0 50.067891 10.675715 -0.080938 -0.156658 35.637464 63.528908 -22.804102    0.0 0.955594 55.768629 2.268265e-08  16.935726  -13.032393     0.30721     957.0\n"
     ]
    }
   ],
   "execution_count": 7
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}