(exception)勉强赚钱rank

This commit is contained in:
liaozhaorun
2025-03-31 23:08:03 +08:00
parent ee35513935
commit 01092b8cae
14 changed files with 5561 additions and 2922 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

1215
code/train/Regression.ipynb Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -8,8 +8,8 @@
"source_hidden": true
},
"ExecuteTime": {
"end_time": "2025-03-29T17:43:30.876671Z",
"start_time": "2025-03-29T17:43:30.425776Z"
"end_time": "2025-03-31T14:33:30.607252Z",
"start_time": "2025-03-31T14:33:30.170544Z"
}
},
"source": [
@@ -32,8 +32,8 @@
"metadata": {
"scrolled": true,
"ExecuteTime": {
"end_time": "2025-03-29T17:44:18.824363Z",
"start_time": "2025-03-29T17:43:30.876671Z"
"end_time": "2025-03-31T14:34:19.160370Z",
"start_time": "2025-03-31T14:33:30.794750Z"
}
},
"source": [
@@ -73,15 +73,11 @@
"text": [
"daily data\n",
"daily basic\n",
"inner merge on ['ts_code', 'trade_date']\n",
"stk limit\n",
"left merge on ['ts_code', 'trade_date']\n",
"money flow\n",
"left merge on ['ts_code', 'trade_date']\n",
"cyq perf\n",
"left merge on ['ts_code', 'trade_date']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 8450470 entries, 0 to 8450469\n",
"RangeIndex: 8477357 entries, 0 to 8477356\n",
"Data columns (total 31 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
@@ -132,8 +128,8 @@
"source_hidden": true
},
"ExecuteTime": {
"end_time": "2025-03-29T17:44:28.421215Z",
"start_time": "2025-03-29T17:44:19.106345Z"
"end_time": "2025-03-31T14:34:30.996034Z",
"start_time": "2025-03-31T14:34:19.168375Z"
}
},
"source": [
@@ -200,8 +196,8 @@
"id": "c4e9e1d31da6dba6",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:44:28.620721Z",
"start_time": "2025-03-29T17:44:28.436697Z"
"end_time": "2025-03-31T14:34:31.276589Z",
"start_time": "2025-03-31T14:34:31.060910Z"
}
},
"source": [
@@ -292,8 +288,8 @@
"source_hidden": true
},
"ExecuteTime": {
"end_time": "2025-03-29T17:44:28.706766Z",
"start_time": "2025-03-29T17:44:28.650141Z"
"end_time": "2025-03-31T14:34:31.348068Z",
"start_time": "2025-03-31T14:34:31.304847Z"
}
},
"source": [
@@ -609,8 +605,8 @@
},
"scrolled": true,
"ExecuteTime": {
"end_time": "2025-03-29T17:44:33.959917Z",
"start_time": "2025-03-29T17:44:28.720764Z"
"end_time": "2025-03-31T14:34:36.714777Z",
"start_time": "2025-03-31T14:34:31.369443Z"
}
},
"source": [
@@ -668,8 +664,8 @@
"id": "dbe2fd8021b9417f",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:44:33.985360Z",
"start_time": "2025-03-29T17:44:33.975319Z"
"end_time": "2025-03-31T14:34:36.727797Z",
"start_time": "2025-03-31T14:34:36.724265Z"
}
},
"source": [
@@ -696,8 +692,8 @@
"id": "85c3e3d0235ffffa",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:46:27.764400Z",
"start_time": "2025-03-29T17:44:34.016244Z"
"end_time": "2025-03-31T14:37:04.071963Z",
"start_time": "2025-03-31T14:34:36.756415Z"
}
},
"source": [
@@ -736,7 +732,7 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 5102787 entries, 0 to 5102786\n",
"Index: 5118212 entries, 0 to 5118211\n",
"Columns: 115 entries, ts_code to mv_momentum\n",
"dtypes: bool(12), datetime64[ns](1), float64(98), int32(1), int64(1), object(2)\n",
"memory usage: 4.0+ GB\n",
@@ -751,8 +747,8 @@
"id": "92d84ce15a562ec6",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:46:29.644814Z",
"start_time": "2025-03-29T17:46:28.384214Z"
"end_time": "2025-03-31T14:37:05.401297Z",
"start_time": "2025-03-31T14:37:04.287413Z"
}
},
"source": [
@@ -795,8 +791,8 @@
"source_hidden": true
},
"ExecuteTime": {
"end_time": "2025-03-29T17:46:29.655148Z",
"start_time": "2025-03-29T17:46:29.646857Z"
"end_time": "2025-03-31T14:37:05.435586Z",
"start_time": "2025-03-31T14:37:05.429705Z"
}
},
"source": [
@@ -844,8 +840,8 @@
"id": "40e6b68a91b30c79",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:46:30.303881Z",
"start_time": "2025-03-29T17:46:29.698776Z"
"end_time": "2025-03-31T14:37:05.994210Z",
"start_time": "2025-03-31T14:37:05.479565Z"
}
},
"source": [
@@ -1013,8 +1009,8 @@
"id": "1c46817a-b5dd-4bec-8bb4-e6e80bfd9d66",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:46:30.320706Z",
"start_time": "2025-03-29T17:46:30.307889Z"
"end_time": "2025-03-31T14:37:06.026489Z",
"start_time": "2025-03-31T14:37:06.024035Z"
}
},
"source": "# print(test_data.head()[['act_factor1', 'act_factor2', 'ts_code', 'trade_date']])",
@@ -1026,8 +1022,8 @@
"id": "da2bb202843d9275",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:46:30.929501Z",
"start_time": "2025-03-29T17:46:30.343347Z"
"end_time": "2025-03-31T14:37:06.597135Z",
"start_time": "2025-03-31T14:37:06.031495Z"
}
},
"source": [
@@ -1133,14 +1129,50 @@
"execution_count": 13
},
{
"cell_type": "code",
"id": "20b7836efae720a3",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:46:31.021203Z",
"start_time": "2025-03-29T17:46:30.953273Z"
"end_time": "2025-03-31T14:37:11.087120Z",
"start_time": "2025-03-31T14:37:06.619979Z"
}
},
"cell_type": "code",
"source": [
"\n",
"days = 2\n",
"df['future_return'] = (df.groupby('ts_code')['close'].shift(-days) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
" df.groupby('ts_code')['open'].shift(-1)\n",
"df['future_volatility'] = (\n",
" df.groupby('ts_code')['future_return']\n",
" .transform(lambda x: x.rolling(days).std())\n",
")\n",
"\n",
"df['future_score'] = (\n",
" 0.7 * df['future_return'] +\n",
" 0.3 * df['future_volatility']\n",
")\n",
"\n",
"filter_index = df['future_return'].between(df['future_return'].quantile(0.01), df['future_return'].quantile(0.99))\n",
"filter_index = df['future_volatility'].between(df['future_volatility'].quantile(0.01),\n",
" df['future_volatility'].quantile(0.99)) | filter_index\n",
"filter_index2 = df['future_return'].between(df['future_return'].quantile(0.01), df['future_return'].quantile(0.99))\n",
"filter_index2 = df['future_volatility'].between(df['future_volatility'].quantile(0.01),\n",
" df['future_volatility'].quantile(0.99)) | filter_index2\n",
"df['label'] = df.groupby('trade_date', group_keys=False)['future_score'].transform(\n",
" lambda x: pd.qcut(x, q=50, labels=False, duplicates='drop')\n",
")\n"
],
"id": "81d4570663ae21d7",
"outputs": [],
"execution_count": 14
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-31T14:39:16.314466Z",
"start_time": "2025-03-31T14:39:15.609756Z"
}
},
"cell_type": "code",
"source": [
"# print('train data size: ', len(train_data))\n",
"\n",
@@ -1170,54 +1202,20 @@
"\n",
"gc.collect()"
],
"id": "92428d543f4727ad",
"outputs": [
{
"data": {
"text/plain": [
"0"
"6302"
]
},
"execution_count": 14,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 14
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:46:35.629560Z",
"start_time": "2025-03-29T17:46:31.046580Z"
}
},
"cell_type": "code",
"source": [
"\n",
"days = 2\n",
"df['future_return'] = (df.groupby('ts_code')['close'].shift(-days) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
" df.groupby('ts_code')['open'].shift(-1)\n",
"df['future_volatility'] = (\n",
" df.groupby('ts_code')['future_return']\n",
" .transform(lambda x: x.rolling(days).std())\n",
")\n",
"\n",
"df['future_score'] = (\n",
" 0.7 * df['future_return'] +\n",
" 0.3 * df['future_volatility']\n",
")\n",
"\n",
"filter_index = df['future_return'].between(df['future_return'].quantile(0.01), df['future_return'].quantile(0.99))\n",
"filter_index = df['future_volatility'].between(df['future_volatility'].quantile(0.01),\n",
" df['future_volatility'].quantile(0.99)) | filter_index\n",
"\n",
"df['label'] = df.groupby('trade_date', group_keys=False)['future_score'].transform(\n",
" lambda x: pd.qcut(x, q=50, labels=False, duplicates='drop')\n",
")\n"
],
"id": "81d4570663ae21d7",
"outputs": [],
"execution_count": 15
"execution_count": 21
},
{
"cell_type": "code",
@@ -1227,8 +1225,8 @@
"source_hidden": true
},
"ExecuteTime": {
"end_time": "2025-03-29T17:46:35.745784Z",
"start_time": "2025-03-29T17:46:35.675465Z"
"end_time": "2025-03-31T14:39:16.430821Z",
"start_time": "2025-03-31T14:39:16.321471Z"
}
},
"source": [
@@ -1255,7 +1253,7 @@
"\n",
" # 根据日期筛选数据\n",
" train_data = df[filter_index & df['trade_date'].isin(train_dates)]\n",
" test_data = df[filter_index & df['trade_date'].isin(test_dates)]\n",
" test_data = df[filter_index2 & df['trade_date'].isin(test_dates)]\n",
"\n",
" train_data = train_data.sort_values('trade_date')\n",
" test_data = test_data.sort_values('trade_date')\n",
@@ -1403,15 +1401,15 @@
" return final_predictions\n"
],
"outputs": [],
"execution_count": 16
"execution_count": 22
},
{
"cell_type": "code",
"id": "63235069-dc59-48fb-961a-e80373e41a61",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:52:27.309071Z",
"start_time": "2025-03-29T17:46:35.756531Z"
"end_time": "2025-03-31T14:45:27.262907Z",
"start_time": "2025-03-31T14:39:16.454548Z"
}
},
"source": [
@@ -1429,172 +1427,171 @@
"text": [
"去极值\n",
"去极值\n",
"检测到 20 个可能漂移的特征: ['pct_chg', 'turnover_rate', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'act_factor3', 'log(circ_mv)', 'cov', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'log_close', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2020-03-11\n",
"最大日期: 2022-03-30\n",
"最小日期: 2022-03-31\n",
"最大日期: 2022-06-30\n",
"原始训练集大小: 402534\n",
"划分后的训练集大小: 307694, 验证集大小: 94840\n",
"检测到 21 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'act_factor3', 'log(circ_mv)', 'cov', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'log_close', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2020-03-18\n",
"最大日期: 2022-04-08\n",
"最小日期: 2022-04-11\n",
"最大日期: 2022-07-07\n",
"原始训练集大小: 402509\n",
"划分后的训练集大小: 307874, 验证集大小: 94635\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[15]\ttrain's ndcg@1: 0.662414\tvalid's ndcg@1: 0.613612\n",
"[1]\ttrain's ndcg@1: 0.519951\tvalid's ndcg@1: 0.628242\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 26 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_kurtosis', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'act_factor3', 'log(circ_mv)', 'cov', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'resonance_factor', 'log_close', 'obv-maobv_6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2020-06-09\n",
"最大日期: 2022-06-30\n",
"最小日期: 2022-07-01\n",
"最大日期: 2022-09-23\n",
"原始训练集大小: 400974\n",
"划分后的训练集大小: 306295, 验证集大小: 94679\n",
"最小日期: 2020-06-16\n",
"最大日期: 2022-07-07\n",
"最小日期: 2022-07-08\n",
"最大日期: 2022-09-30\n",
"原始训练集大小: 401052\n",
"划分后的训练集大小: 306109, 验证集大小: 94943\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[29]\ttrain's ndcg@1: 0.705891\tvalid's ndcg@1: 0.608979\n",
"[19]\ttrain's ndcg@1: 0.656354\tvalid's ndcg@1: 0.628231\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 14 个可能漂移的特征: ['turnover_rate', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'log(circ_mv)', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'log_close', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small']\n",
"最小日期: 2020-09-03\n",
"最大日期: 2022-09-23\n",
"最小日期: 2022-09-26\n",
"最大日期: 2022-12-23\n",
"原始训练集大小: 398352\n",
"划分后的训练集大小: 303767, 验证集大小: 94585\n",
"检测到 14 个可能漂移的特征: ['vol', 'turnover_rate', 'vol_spike', 'atr_14', 'atr_6', 'log(circ_mv)', 'alpha_22_improved', 'turnover_std', 'log_close', 'obv-maobv_6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_elg_vol_minus_sell_elg_vol']\n",
"最小日期: 2020-09-10\n",
"最大日期: 2022-09-30\n",
"最小日期: 2022-10-10\n",
"最大日期: 2022-12-30\n",
"原始训练集大小: 398374\n",
"划分后的训练集大小: 303799, 验证集大小: 94575\n",
"Training until validation scores don't improve for 50 rounds\n",
"[100]\ttrain's ndcg@1: 0.827415\tvalid's ndcg@1: 0.630578\n",
"Early stopping, best iteration is:\n",
"[71]\ttrain's ndcg@1: 0.78919\tvalid's ndcg@1: 0.66253\n",
"[8]\ttrain's ndcg@1: 0.624175\tvalid's ndcg@1: 0.60186\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 18 个可能漂移的特征: ['vol', 'turnover_rate', 'return_kurtosis', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'log(circ_mv)', 'turnover_std', 'log_close', 'obv-maobv_6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_elg_vol_minus_sell_elg_vol', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2020-12-04\n",
"最大日期: 2022-12-23\n",
"最小日期: 2022-12-26\n",
"最大日期: 2023-03-27\n",
"原始训练集大小: 395407\n",
"划分后的训练集大小: 305189, 验证集大小: 90218\n",
"检测到 17 个可能漂移的特征: ['vol', 'turnover_rate', 'return_kurtosis', 'vol_spike', 'atr_14', 'atr_6', 'act_factor3', 'log(circ_mv)', 'turnover_std', 'log_close', 'obv-maobv_6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_elg_vol_minus_sell_elg_vol', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2020-12-11\n",
"最大日期: 2022-12-30\n",
"最小日期: 2023-01-03\n",
"最大日期: 2023-04-03\n",
"原始训练集大小: 395305\n",
"划分后的训练集大小: 305409, 验证集大小: 89896\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[49]\ttrain's ndcg@1: 0.808651\tvalid's ndcg@1: 0.637922\n",
"[23]\ttrain's ndcg@1: 0.711527\tvalid's ndcg@1: 0.58914\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 19 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_skew', 'return_kurtosis', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'act_factor3', 'resonance_factor', 'obv-maobv_6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2021-03-08\n",
"最大日期: 2023-03-27\n",
"最小日期: 2023-03-28\n",
"最大日期: 2023-06-27\n",
"原始训练集大小: 393886\n",
"划分后的训练集大小: 303266, 验证集大小: 90620\n",
"检测到 18 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_kurtosis', 'vol_spike', 'obv', 'maobv_6', 'rsi_3', 'act_factor3', 'delta_cov', 'resonance_factor', 'obv-maobv_6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_elg_vol_minus_sell_elg_vol', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2021-03-15\n",
"最大日期: 2023-04-03\n",
"最小日期: 2023-04-04\n",
"最大日期: 2023-07-04\n",
"原始训练集大小: 394279\n",
"划分后的训练集大小: 303561, 验证集大小: 90718\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[8]\ttrain's ndcg@1: 0.622492\tvalid's ndcg@1: 0.606022\n",
"[1]\ttrain's ndcg@1: 0.570726\tvalid's ndcg@1: 0.62316\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 23 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_skew', 'return_kurtosis', 'vol_spike', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'act_factor3', 'log(circ_mv)', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'resonance_factor', 'log_close', 'obv-maobv_6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2021-06-04\n",
"最大日期: 2023-06-27\n",
"最小日期: 2023-06-28\n",
"最大日期: 2023-09-19\n",
"原始训练集大小: 393201\n",
"划分后的训练集大小: 300541, 验证集大小: 92660\n",
"检测到 25 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_skew', 'return_kurtosis', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'act_factor2', 'act_factor3', 'log(circ_mv)', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'resonance_factor', 'log_close', 'obv-maobv_6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2021-06-11\n",
"最大日期: 2023-07-04\n",
"最小日期: 2023-07-05\n",
"最大日期: 2023-09-26\n",
"原始训练集大小: 392902\n",
"划分后的训练集大小: 300091, 验证集大小: 92811\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[3]\ttrain's ndcg@1: 0.623432\tvalid's ndcg@1: 0.604043\n",
"[4]\ttrain's ndcg@1: 0.623501\tvalid's ndcg@1: 0.608889\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 14 个可能漂移的特征: ['vol', 'pct_chg', 'return_kurtosis', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'act_factor3', 'log(circ_mv)', 'cov', 'alpha_22_improved', 'resonance_factor', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2021-08-30\n",
"最大日期: 2023-09-19\n",
"最小日期: 2023-09-20\n",
"最大日期: 2023-12-20\n",
"原始训练集大小: 386612\n",
"划分后的训练集大小: 296498, 验证集大小: 90114\n",
"检测到 11 个可能漂移的特征: ['pct_chg', 'return_skew', 'return_kurtosis', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'act_factor3', 'delta_cov', 'alpha_22_improved', 'resonance_factor']\n",
"最小日期: 2021-09-06\n",
"最大日期: 2023-09-26\n",
"最小日期: 2023-09-27\n",
"最大日期: 2023-12-27\n",
"原始训练集大小: 386164\n",
"划分后的训练集大小: 296301, 验证集大小: 89863\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[12]\ttrain's ndcg@1: 0.663139\tvalid's ndcg@1: 0.652339\n",
"[38]\ttrain's ndcg@1: 0.76403\tvalid's ndcg@1: 0.675597\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 14 个可能漂移的特征: ['pct_chg', 'turnover_rate', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'act_factor3', 'log(circ_mv)', 'cov', 'alpha_22_improved', 'turnover_std', 'resonance_factor', 'log_close']\n",
"最小日期: 2021-12-01\n",
"最大日期: 2023-12-20\n",
"最小日期: 2023-12-21\n",
"最大日期: 2024-03-22\n",
"原始训练集大小: 379352\n",
"划分后的训练集大小: 293416, 验证集大小: 85936\n",
"检测到 14 个可能漂移的特征: ['pct_chg', 'turnover_rate', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'act_factor3', 'log(circ_mv)', 'cov', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'log_close']\n",
"最小日期: 2021-12-08\n",
"最大日期: 2023-12-27\n",
"最小日期: 2023-12-28\n",
"最大日期: 2024-03-29\n",
"原始训练集大小: 379125\n",
"划分后的训练集大小: 293170, 验证集大小: 85955\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[5]\ttrain's ndcg@1: 0.634019\tvalid's ndcg@1: 0.638831\n",
"[17]\ttrain's ndcg@1: 0.707121\tvalid's ndcg@1: 0.6345\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 12 个可能漂移的特征: ['vol', 'turnover_rate', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'log(circ_mv)', 'cov', 'log_close', 'obv-maobv_6', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2022-03-03\n",
"最大日期: 2024-03-22\n",
"最小日期: 2024-03-25\n",
"最大日期: 2024-06-24\n",
"原始训练集大小: 379932\n",
"划分后的训练集大小: 290249, 验证集大小: 89683\n",
"检测到 13 个可能漂移的特征: ['vol', 'turnover_rate', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'log(circ_mv)', 'turnover_std', 'log_close', 'obv-maobv_6', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2022-03-10\n",
"最大日期: 2024-03-29\n",
"最小日期: 2024-04-01\n",
"最大日期: 2024-07-01\n",
"原始训练集大小: 379627\n",
"划分后的训练集大小: 290158, 验证集大小: 89469\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[34]\ttrain's ndcg@1: 0.759349\tvalid's ndcg@1: 0.64614\n",
"[44]\ttrain's ndcg@1: 0.776797\tvalid's ndcg@1: 0.610802\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 18 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'vol_spike', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'resonance_factor', 'log_close', 'obv-maobv_6', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2022-06-02\n",
"最大日期: 2024-06-24\n",
"最小日期: 2024-06-25\n",
"最大日期: 2024-09-18\n",
"原始训练集大小: 381303\n",
"划分后的训练集大小: 284860, 验证集大小: 96443\n",
"检测到 18 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_kurtosis', 'vol_spike', 'atr_6', 'maobv_6', 'rsi_3', 'act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'resonance_factor', 'log_close', 'obv-maobv_6', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2022-06-10\n",
"最大日期: 2024-07-01\n",
"最小日期: 2024-07-02\n",
"最大日期: 2024-09-25\n",
"原始训练集大小: 381403\n",
"划分后的训练集大小: 284687, 验证集大小: 96716\n",
"Training until validation scores don't improve for 50 rounds\n",
"Early stopping, best iteration is:\n",
"[40]\ttrain's ndcg@1: 0.745108\tvalid's ndcg@1: 0.589855\n",
"[49]\ttrain's ndcg@1: 0.787077\tvalid's ndcg@1: 0.579916\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 21 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_kurtosis', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'act_factor1', 'act_factor3', 'log(circ_mv)', 'cov', 'delta_cov', 'alpha_22_improved', 'resonance_factor', 'log_close', 'obv-maobv_6', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2022-08-26\n",
"最大日期: 2024-09-18\n",
"最小日期: 2024-09-19\n",
"最大日期: 2024-12-18\n",
"原始训练集大小: 379192\n",
"划分后的训练集大小: 285332, 验证集大小: 93860\n",
"检测到 20 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'vol_spike', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'act_factor3', 'log(circ_mv)', 'cov', 'delta_cov', 'alpha_22_improved', 'turnover_std', 'resonance_factor', 'log_close', 'obv-maobv_6', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2022-09-02\n",
"最大日期: 2024-09-25\n",
"最小日期: 2024-09-26\n",
"最大日期: 2024-12-25\n",
"原始训练集大小: 378304\n",
"划分后的训练集大小: 284976, 验证集大小: 93328\n",
"Training until validation scores don't improve for 50 rounds\n",
"[100]\ttrain's ndcg@1: 0.840198\tvalid's ndcg@1: 0.629064\n",
"Early stopping, best iteration is:\n",
"[10]\ttrain's ndcg@1: 0.629259\tvalid's ndcg@1: 0.646921\n",
"[98]\ttrain's ndcg@1: 0.836808\tvalid's ndcg@1: 0.644301\n",
"Evaluated only: ndcg@1\n",
"去极值\n",
"去极值\n",
"检测到 18 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_skew', 'return_kurtosis', 'obv', 'maobv_6', 'rsi_3', 'act_factor1', 'log(circ_mv)', 'cov', 'delta_cov', 'alpha_22_improved', 'resonance_factor', 'log_close', 'obv-maobv_6', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2022-11-28\n",
"最大日期: 2024-12-18\n",
"最小日期: 2024-12-19\n",
"最大日期: 2025-03-19\n",
"原始训练集大小: 371676\n",
"划分后的训练集大小: 289037, 验证集大小: 82639\n",
"检测到 17 个可能漂移的特征: ['vol', 'pct_chg', 'turnover_rate', 'return_skew', 'obv', 'maobv_6', 'rsi_3', 'act_factor1', 'log(circ_mv)', 'cov', 'delta_cov', 'alpha_22_improved', 'resonance_factor', 'log_close', 'obv-maobv_6', 'mv_adjusted_volume', 'nonlinear_mv_volume']\n",
"最小日期: 2022-12-05\n",
"最大日期: 2024-12-25\n",
"最小日期: 2024-12-26\n",
"最大日期: 2025-03-26\n",
"原始训练集大小: 371843\n",
"划分后的训练集大小: 289769, 验证集大小: 82074\n",
"Training until validation scores don't improve for 50 rounds\n",
"[100]\ttrain's ndcg@1: 0.844744\tvalid's ndcg@1: 0.580519\n",
"Early stopping, best iteration is:\n",
"[54]\ttrain's ndcg@1: 0.792906\tvalid's ndcg@1: 0.599084\n",
"[2]\ttrain's ndcg@1: 0.592644\tvalid's ndcg@1: 0.596307\n",
"Evaluated only: ndcg@1\n"
]
}
],
"execution_count": 17
"execution_count": 23
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-29T17:52:27.393208Z",
"start_time": "2025-03-29T17:52:27.388403Z"
"end_time": "2025-03-31T14:45:27.345941Z",
"start_time": "2025-03-31T14:45:27.342565Z"
}
},
"cell_type": "code",
@@ -1609,208 +1606,17 @@
]
}
],
"execution_count": 18
},
{
"cell_type": "code",
"id": "d86af99d15cb3bdd",
"metadata": {
"scrolled": true
},
"source": [
"import pandas as pd\n",
"\n",
"gc.collect()\n",
"def rolling_train_predict(df, train_days, test_days, industry_df, index_df, days=5, use_pca=False, validation_days=60):\n",
"\n",
" # 1. 按照交易日期排序\n",
" unique_dates = df[df['trade_date'] >= '2020-01-01']['trade_date'].unique().tolist()\n",
" unique_dates = sorted(unique_dates)\n",
" n = len(unique_dates)\n",
" \n",
" # 2. 计算需要跳过的天数,使后续窗口对齐\n",
" extra_days = (n - train_days) % test_days \n",
" start_index = extra_days # 从此索引开始滚动\n",
" \n",
" predictions_list = []\n",
"\n",
"\n",
" for start in range(start_index, n - train_days - test_days + 1, test_days):\n",
" gc.collect()\n",
"\n",
" train_dates = unique_dates[start : start + train_days]\n",
" test_dates = unique_dates[start + train_days : start + train_days + test_days]\n",
"\n",
" # 根据日期筛选数据\n",
" train_data = df[df['trade_date'].isin(train_dates)]\n",
" test_data = df[df['trade_date'].isin(test_dates)]\n",
"\n",
" train_data = train_data.sort_values('trade_date')\n",
" test_data = test_data.sort_values('trade_date')\n",
"\n",
" \n",
" def select_pre_zt_stocks_dynamic(\n",
" stock_df,\n",
" vol_spike_multiplier=1.5,\n",
" min_return=0.03, # 最小累计涨幅(例如 3%\n",
" min_main_net_inflow=1e6, # 最小主力资金净流入(例如 100 万元)\n",
" window=30, # 计算历史均值的窗口大小\n",
" signal_days=1 # 异动信号需要连续出现的天数\n",
" ):\n",
" \n",
" # 排序数据\n",
" stock_df = stock_df.sort_values(by=['trade_date', 'ts_code'])\n",
" \n",
" # stock_df = stock_df[\n",
" # (stock_df['vol'] > vol_spike_multiplier * stock_df['avg_vol_20'])\n",
" # ]\n",
" cd1 = stock_df[\"close\"] > stock_df[\"close\"].shift(1)\n",
"\n",
" cd2 = stock_df[\"close\"] > stock_df[\"close\"].rolling(window=10).mean()\n",
"\n",
" cd3 = (stock_df[\"vol\"] > stock_df[\"vol\"].shift(1)) & (stock_df[\"vol\"] < 10 * stock_df[\"vol\"].shift(1))\n",
"\n",
" stock_df = stock_df[cd1 & cd2 & cd3]\n",
" stock_df = stock_df.groupby('trade_date', group_keys=False).apply(\n",
" lambda x: x.nlargest(1000, 'return_20')\n",
" )\n",
" \n",
" return stock_df\n",
" \n",
" train_data = select_pre_zt_stocks_dynamic(train_data)\n",
" test_data = select_pre_zt_stocks_dynamic(test_data)\n",
"\n",
" \n",
" # train_data, _ = get_simple_factor(train_data)\n",
" # test_data, _ = get_simple_factor(test_data)\n",
"\n",
" # df['future_return'] = (df.groupby('ts_code')['close'].shift(-days) - df.groupby('ts_code')['open'].shift(-1)) / \\\n",
" # df.groupby('ts_code')['open'].shift(-1)\n",
" \n",
" def symmetric_log_transform(values):\n",
" return np.sign(values) * np.log1p(np.abs(values))\n",
"\n",
" train_data['future_return'] = train_data.groupby('ts_code', group_keys=False)['close'].apply(lambda x: x.shift(-days) / x - 1)\n",
" train_data['future_score'] = calculate_score(train_data, days=days, lambda_param=0.3)\n",
" # train_data['future_score'] = symmetric_log_transform(train_data['future_score'])\n",
"\n",
" test_data['future_return'] = test_data.groupby('ts_code', group_keys=False)['close'].apply(lambda x: x.shift(-days) / x - 1)\n",
" test_data['future_score'] = calculate_score(test_data, days=days, lambda_param=0.3)\n",
" # test_data['future_score'] = symmetric_log_transform(test_data['future_score'])\n",
" \n",
" train_data['label'] = train_data.groupby('trade_date', group_keys=False)['future_score'].transform(\n",
" lambda x: pd.qcut(x, q=10, labels=False, duplicates='drop')\n",
" )\n",
" test_data['label'] = test_data.groupby('trade_date', group_keys=False)['future_score'].transform(\n",
" lambda x: pd.qcut(x, q=10, labels=False, duplicates='drop')\n",
" )\n",
" \n",
" industry_df = industry_df.sort_values(by=['trade_date'])\n",
" index_df = index_df.sort_values(by=['trade_date'])\n",
" \n",
" train_data = train_data.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n",
" # train_data = train_data.merge(index_df, on='trade_date', how='left')\n",
" test_data = test_data.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n",
" # test_data = test_data.merge(index_df, on='trade_date', how='left')\n",
" \n",
" train_data, test_data = train_data.replace([np.inf, -np.inf], np.nan), test_data.replace([np.inf, -np.inf], np.nan)\n",
" \n",
" feature_columns = [col for col in train_data.columns if col not in ['trade_date',\n",
" 'ts_code',\n",
" 'label']]\n",
" feature_columns = [col for col in feature_columns if 'future' not in col]\n",
" feature_columns = [col for col in feature_columns if 'score' not in col]\n",
" feature_columns = [col for col in feature_columns if col not in origin_columns]\n",
" feature_columns = [col for col in feature_columns if not col.startswith('_')]\n",
" # print(feature_columns)\n",
"\n",
" feature_columns_o = feature_columns[:]\n",
" train_data, feature_columns = create_deviation_within_dates(train_data, feature_columns_o)\n",
" test_data, _ = create_deviation_within_dates(test_data, feature_columns_o)\n",
" print(f'feature_columns size: {len(feature_columns)}')\n",
" \n",
" train_data = train_data.dropna(subset=feature_columns)\n",
" train_data = train_data.dropna(subset=['label'])\n",
" train_data = train_data.reset_index(drop=True)\n",
" \n",
" # print(test_data.tail())\n",
" # test_data = test_data.dropna(subset=feature_columns_new)\n",
" # test_data = test_data.dropna(subset=['label'])\n",
" test_data = test_data.reset_index(drop=True)\n",
" \n",
" # print(len(train_data))\n",
" print(f\"最小日期: {train_data['trade_date'].min().strftime('%Y-%m-%d')}\")\n",
" print(f\"最大日期: {train_data['trade_date'].max().strftime('%Y-%m-%d')}\")\n",
" # print(len(test_data))\n",
" print(f\"最小日期: {test_data['trade_date'].min().strftime('%Y-%m-%d')}\")\n",
" print(f\"最大日期: {test_data['trade_date'].max().strftime('%Y-%m-%d')}\")\n",
" \n",
" cat_columns = [col for col in df.columns if col.startswith('cat')]\n",
" for col in cat_columns:\n",
" train_data[col] = train_data[col].astype('category')\n",
" test_data[col] = test_data[col].astype('category')\n",
"\n",
"\n",
" feature_columns = remove_highly_correlated_features(train_data[train_data['label'] == 9], feature_columns)\n",
" feature_columns, _ = remove_shifted_features(train_data[train_data['label'] == 9], test_data[test_data['label'] == 9], feature_columns)\n",
" keep_columns = [col for col in train_data.columns if\n",
" col in feature_columns or col in ['ts_code', 'trade_date', 'label', 'future_return', 'future_score']]\n",
" train_data = train_data[keep_columns]\n",
"\n",
" label_gain = list(range(len(train_data['label'].unique())))\n",
" label_gain = [gain * 2 for gain in label_gain]\n",
" light_params['label_gain'] = label_gain\n",
" \n",
" ud = train_data[\"trade_date\"].unique()\n",
" date_weights = {date: weight for date, weight in zip(ud, np.linspace(1, 2, len(unique_dates)))}\n",
" light_params['weight'] = train_data[\"trade_date\"].map(date_weights).tolist()\n",
"\n",
" print(f'feature_columns: {feature_columns}')\n",
" model, scaler, pca = train_light_model(train_data.dropna(subset=['label']),\n",
" light_params, feature_columns,\n",
" [lgb.log_evaluation(period=100),\n",
" lgb.callback.record_evaluation(evals),\n",
" lgb.early_stopping(50, first_metric_only=True)\n",
" ], evals,\n",
" num_boost_round=3000, validation_days=validation_days,\n",
" print_feature_importance=False, use_pca=False)\n",
"\n",
" score_df = test_data.copy()\n",
" numeric_columns = score_df.select_dtypes(include=['float64', 'int64']).columns\n",
" numeric_columns = [col for col in numeric_columns if col in feature_columns]\n",
" score_df.loc[:, numeric_columns] = scaler.transform(score_df[numeric_columns])\n",
" if use_pca and pca is not None:\n",
" score_df.loc[:, numeric_columns] = pca.transform(score_df[numeric_columns])\n",
" score_df['score'] = model.predict(score_df[feature_columns])\n",
" # train_data['score'] = catboost_model.predict(train_data[feature_columns_new])\n",
" score_df = score_df.loc[score_df.groupby('trade_date')['score'].idxmax()]\n",
" # score_df = score_df[score_df['score'] > 0]\n",
" score_df = score_df[['trade_date', 'score', 'ts_code']]\n",
" predictions_list.append(score_df)\n",
" final_predictions = pd.concat(predictions_list, ignore_index=True)\n",
" return final_predictions\n",
"\n",
"\n",
"final_predictions = rolling_train_predict(df.sort_values(['trade_date'], ascending=[True]), 500, 60, industry_df, index_df, days=5, validation_days=100)\n",
"final_predictions.to_csv('predictions_test.tsv', index=False)\n"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "7ed645f2-7755-496e-8a6d-c64adc9080ac",
"metadata": {},
"source": [
"print('finish')"
],
"outputs": [],
"execution_count": null
"execution_count": 24
},
{
"cell_type": "code",
"id": "0dc75517-c857-4f1d-8815-e807400a6d33",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-31T14:45:27.395705Z",
"start_time": "2025-03-31T14:45:27.393319Z"
}
},
"source": [],
"outputs": [],
"execution_count": null

1393
code/train/UpdateSGD.ipynb Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -1,21 +1,8 @@
import numpy as np
import pandas as pd
def read_and_merge_h5_data(h5_filename, key, columns, df=None):
"""
读取 HDF5 文件中的数据,根据指定的 columns 筛选数据,
如果传入 df 参数,则将其与读取的数据根据 ts_code 和 trade_date 合并。
参数:
- h5_filename: HDF5 文件名
- key: 数据存储在 HDF5 文件中的 key
- columns: 要读取的列名列表
- df: 需要合并的 DataFrame如果为空则不进行合并
返回:
- 合并后的 DataFrame
"""
# 处理 _ 开头的列名
def read_and_merge_h5_data(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date'], prefix=None):
processed_columns = []
for col in columns:
if col.startswith('_'):
@@ -32,14 +19,22 @@ def read_and_merge_h5_data(h5_filename, key, columns, df=None):
new_col = f'_{col}'
data.rename(columns={col: new_col}, inplace=True)
if prefix is not None:
for col in data.columns:
if col not in ['ts_code', 'trade_date']: # 只有不在 columns 中的列才需要加下划线
new_col = f'{prefix}_{col}'
data.rename(columns={col: new_col}, inplace=True)
# 如果传入的 df 不为空,则进行合并
if df is not None and not df.empty:
# 确保两个 DataFrame 都有 ts_code 和 trade_date 列
df['trade_date'] = pd.to_datetime(df['trade_date'], format='%Y%m%d')
data['trade_date'] = pd.to_datetime(data['trade_date'], format='%Y%m%d')
print(f'{join} merge on {on}')
if 'trade_date' in on:
# 确保两个 DataFrame 都有 ts_code 和 trade_date 列
df['trade_date'] = pd.to_datetime(df['trade_date'], format='%Y%m%d')
data['trade_date'] = pd.to_datetime(data['trade_date'], format='%Y%m%d')
# 根据 ts_code 和 trade_date 合并
merged_df = pd.merge(df, data, on=['ts_code', 'trade_date'], how='left')
merged_df = pd.merge(df, data, on=on, how=join)
else:
# 如果 df 为空,则直接返回读取的数据
merged_df = data
@@ -84,4 +79,42 @@ def calculate_risk_adjusted_return(df, days=1, method='ratio', lambda_=0.5, eps=
else:
raise ValueError("Invalid method. Use 'ratio' or 'difference'.")
return df
return df
# import polars as pl
#
# def read_and_merge_h5_data_polars(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date']):
# processed_columns = []
# for col in columns:
# if col.startswith('_'):
# processed_columns.append(col[1:]) # 去掉下划线
# else:
# processed_columns.append(col)
#
# # 从 HDF5 文件读取数据,选择需要的列
# pd_df = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
#
# # 将 Pandas DataFrame 转换为 Polars DataFrame
# data = pl.from_pandas(pd_df)
#
# # 修改列名,如果列名以前有 _加上 _
# data = data.rename({col: f'_{col}' for col in data.columns if col not in columns})
#
# # 如果传入的 df 不为空,则进行合并
# if df is not None and not df.is_empty():
# print(f'{join} merge on {on}')
#
# # 确保两个 DataFrame 都有 ts_code 和 trade_date 列
# # df = df.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
# # data = data.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
#
# # 根据 ts_code 和 trade_date 合并
# merged_df = df.join(data, on=on, how=join)
# else:
# # 如果 df 为空,则直接返回读取的数据
# merged_df = data
#
# return merged_df