{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "notebook-title",
   "metadata": {},
   "source": [
    "# RSI statistical profile of 15-minute futures bars\n",
    "\n",
    "This notebook loads the bars from the CSV configured in `file_path`, computes the RSI for several look-back periods and summarises, for each period:\n",
    "\n",
    "- the distribution of the RSI (mean, dispersion, skewness, kurtosis, 10%/90% quantiles),\n",
    "- its persistence (ADF test, lag-1 autocorrelation),\n",
    "- volatility clustering of RSI changes (ARCH-LM test),\n",
    "- a Hill-type tail index of the highest RSI values,\n",
    "- the signed 5-bar price change after the RSI re-enters the 10%-90% band.\n",
    "\n",
    "Requires TA-Lib, statsmodels and SciPy. Run with **Restart Kernel → Run All**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import traceback\n",
    "import warnings\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import statsmodels.api as sm\n",
    "import talib\n",
    "import talib as ta  # Make sure TA-Lib is installed: pip install TA-Lib\n",
    "from scipy.stats import jarque_bera, bootstrap\n",
    "from statsmodels.stats.diagnostic import het_arch\n",
    "from statsmodels.tsa.stattools import adfuller, acf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "notebook-config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ignore all warnings so library deprecation notices do not flood the output.\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# --- 0. Configure your file path ---\n",
    "# Point this at the CSV file that holds the bars to analyse.\n",
    "file_path = '/mnt/d/PyProject/NewQuant/data/data/KQ_m@SHFE_rb/KQ_m@SHFE_rb_min15.csv'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "load-data-heading",
   "metadata": {},
   "source": [
    "## 1. Load data\n",
    "\n",
    "The loader requires a `datetime` column plus `open`, `high`, `low`, `close` and `volume`. The file used so far also carries `open_oi`, `close_oi` and `underlying_symbol` columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "548c68daa68af8c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- 1. Data Loading and Preprocessing ---\n",
    "def load_and_preprocess_data(file_path):\n",
    "    \"\"\"\n",
    "    Load historical futures bars from a CSV file and clean them.\n",
    "\n",
    "    The CSV must contain a 'datetime' column (parsed and used as the index)\n",
    "    plus 'open', 'high', 'low', 'close' and 'volume' columns.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file_path : str\n",
    "        Path of the CSV file to read.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame or None\n",
    "        Rows sorted by datetime with every row that contains a missing value\n",
    "        removed, or None when the file cannot be read or validated (the\n",
    "        reason is printed instead of raised).\n",
    "    \"\"\"\n",
    "    required_columns = ['open', 'high', 'low', 'close', 'volume']\n",
    "    try:\n",
    "        df = pd.read_csv(file_path, parse_dates=['datetime'], index_col='datetime')\n",
    "\n",
    "        # Validate the schema first so a malformed file is reported before\n",
    "        # any rows are dropped.\n",
    "        if not all(col in df.columns for col in required_columns):\n",
    "            raise ValueError(f'CSV file is missing required columns. Please ensure it contains: {required_columns}')\n",
    "\n",
    "        # Ensure data is sorted by time\n",
    "        df = df.sort_index()\n",
    "\n",
    "        # Drop rows with missing values and report how many were removed\n",
    "        initial_rows = len(df)\n",
    "        df = df.dropna()\n",
    "        if len(df) < initial_rows:\n",
    "            print(f'Warning: Missing values found in data, deleted {initial_rows - len(df)} rows.')\n",
    "\n",
    "        print(f'Successfully loaded {len(df)} rows of data.')\n",
    "        print('First 5 rows of data:')\n",
    "        print(df.head())\n",
    "        return df\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Error: File '{file_path}' not found. Please check the path.\")\n",
    "        return None\n",
    "    except Exception as e:\n",
    "        # Best-effort loader: report the problem and let the caller check for None.\n",
    "        print(f'Error during data loading or preprocessing: {e}')\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "load-data-run",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_raw = load_and_preprocess_data(file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "rsi-stats-heading",
   "metadata": {},
   "source": [
    "## 2. RSI statistics\n",
    "\n",
    "`rsi_stats` keeps the RSI and the close prices aligned bar-for-bar, so the 5-bar price change of every event is measured strictly *after* the bar on which the RSI crossed back inside the band."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c566a4757b4f6456",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rsi_stats(df_raw: pd.DataFrame,\n",
    "              period: int = 14,\n",
    "              lookback: int = 252 * 4):\n",
    "    \"\"\"\n",
    "    Summarise the behaviour of the RSI of `df_raw['close']`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_raw : pd.DataFrame\n",
    "        Bars in chronological order with a numeric 'close' column.\n",
    "    period : int\n",
    "        RSI look-back, passed to `talib.RSI` as `timeperiod`.\n",
    "    lookback : int\n",
    "        Currently unused; kept so existing callers keep working.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.Series\n",
    "        period, mean, std, skew, kurt, q10, q90 : descriptive statistics.\n",
    "        adf_stat, adf_p : ADF test (constant only) on the RSI level.\n",
    "        acf_1 : lag-1 autocorrelation of the RSI.\n",
    "        arch_lm, arch_p : ARCH-LM test on the first differences of the RSI.\n",
    "        hill_tail : Hill-type tail index of the top 5% of RSI values\n",
    "            (NaN when that tail is degenerate).\n",
    "        mean_ret_5, win_rate_5, events_5 : signed 5-bar close-to-close price\n",
    "            change (in price units, not percent) after the RSI re-enters the\n",
    "            q10-q90 band: long after crossing up through q10, short after\n",
    "            crossing down through q90. NaN / NaN / 0 when there are 20 or\n",
    "            fewer crossings.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If fewer than 100 RSI values are available.\n",
    "    \"\"\"\n",
    "    horizon = 5  # bars held after each event; matches the '*_5' output keys\n",
    "\n",
    "    close = df_raw['close'].to_numpy(dtype=float)\n",
    "    rsi_full = np.asarray(talib.RSI(close, timeperiod=period), dtype=float)\n",
    "\n",
    "    # Drop the warm-up NaNs from the RSI *and* from the closes with the same\n",
    "    # mask. Position i then refers to the same bar in both arrays; indexing the\n",
    "    # unfiltered closes with positions of the filtered RSI would shift every\n",
    "    # event `period` bars into the past.\n",
    "    valid = ~np.isnan(rsi_full)\n",
    "    rsi = rsi_full[valid]\n",
    "    close = close[valid]\n",
    "    if len(rsi) < 100:\n",
    "        raise ValueError('RSI 样本不足')\n",
    "\n",
    "    # Basic description\n",
    "    stats = {\n",
    "        'period': period,\n",
    "        'mean': np.mean(rsi),\n",
    "        'std': np.std(rsi, ddof=1),\n",
    "        'skew': pd.Series(rsi).skew(),\n",
    "        'kurt': pd.Series(rsi).kurtosis(),\n",
    "        'q10': np.percentile(rsi, 10),\n",
    "        'q90': np.percentile(rsi, 90),\n",
    "    }\n",
    "\n",
    "    # Mean reversion / momentum\n",
    "    adf_res = adfuller(rsi, regression='c')\n",
    "    stats['adf_stat'] = adf_res[0]\n",
    "    stats['adf_p'] = adf_res[1]\n",
    "    stats['acf_1'] = acf(rsi, nlags=1, fft=False)[1]\n",
    "\n",
    "    # Volatility clustering (ARCH effect in RSI changes)\n",
    "    arch_res = het_arch(np.diff(rsi))\n",
    "    stats['arch_lm'] = arch_res[0]\n",
    "    stats['arch_p'] = arch_res[1]\n",
    "\n",
    "    # Heavy-tail index (Hill) on the top 5% of RSI values\n",
    "    tail_size = int(np.ceil(0.05 * len(rsi)))\n",
    "    tail = np.sort(rsi)[-tail_size:]\n",
    "    log_excess = np.sum(np.log(tail) - np.log(tail[0]))\n",
    "    # A saturated tail (all values equal) has no log-excess; report NaN\n",
    "    # instead of dividing by zero.\n",
    "    stats['hill_tail'] = tail_size / log_excess if log_excess > 0 else np.nan\n",
    "\n",
    "    # Events: the RSI crosses back inside the [q10, q90] band\n",
    "    extreme_low = stats['q10']\n",
    "    extreme_high = stats['q90']\n",
    "    prev_rsi, curr_rsi = rsi[:-1], rsi[1:]\n",
    "    cross_up = (prev_rsi < extreme_low) & (curr_rsi >= extreme_low)\n",
    "    cross_down = (prev_rsi > extreme_high) & (curr_rsi <= extreme_high)\n",
    "    event_pos = np.flatnonzero(cross_up | cross_down) + 1\n",
    "    # +1: long after leaving the oversold zone, -1: short after leaving the\n",
    "    # overbought zone.\n",
    "    direction = np.where(cross_up[event_pos - 1], 1.0, -1.0)\n",
    "\n",
    "    if len(event_pos) > 20:\n",
    "        # Keep only events that still have `horizon` bars of data after them.\n",
    "        has_exit = event_pos + horizon < len(close)\n",
    "        entry = event_pos[has_exit]\n",
    "        rets = (close[entry + horizon] - close[entry]) * direction[has_exit]\n",
    "        stats['mean_ret_5'] = np.mean(rets)\n",
    "        stats['win_rate_5'] = np.mean(rets > 0)\n",
    "        stats['events_5'] = len(rets)\n",
    "    else:\n",
    "        stats.update({'mean_ret_5': np.nan, 'win_rate_5': np.nan, 'events_5': 0})\n",
    "\n",
    "    return pd.Series(stats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "rsi-stats-run",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---------- Run for every period of interest ----------\n",
    "if df_raw is None:\n",
    "    print('No data loaded; fix file_path and re-run the loading cell.')\n",
    "else:\n",
    "    rsi_summaries = []\n",
    "    for p in [6, 14, 21]:\n",
    "        try:\n",
    "            rsi_summaries.append(rsi_stats(df_raw, period=p))\n",
    "        except ValueError:\n",
    "            # Too few RSI values for this period: show why and move on.\n",
    "            traceback.print_exc()\n",
    "    if rsi_summaries:\n",
    "        print(pd.DataFrame(rsi_summaries).to_string(index=False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}