Classify2

This commit is contained in:
liaozhaorun
2025-05-13 15:30:06 +08:00
parent 791c84aba6
commit a4b05bb62f
20 changed files with 10737 additions and 7456 deletions

View File

@@ -44,7 +44,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\liaozhaorun\\AppData\\Local\\Temp\\ipykernel_28220\\1832869062.py:13: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
"C:\\Users\\liaozhaorun\\AppData\\Local\\Temp\\ipykernel_16940\\1832869062.py:13: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" final_df = pd.concat(all_data, ignore_index=True)\n"
]
}
@@ -86,32 +86,32 @@
"output_type": "stream",
"text": [
" ts_code trade_date close open high low \\\n",
"0 000905.SH 20250506 5740.3338 5668.8762 5740.3338 5666.4698 \n",
"1 000905.SH 20250430 5631.8249 5604.6537 5647.7821 5603.1718 \n",
"2 000905.SH 20250429 5604.9057 5583.7186 5622.0220 5571.2363 \n",
"3 000905.SH 20250428 5598.2951 5624.4166 5628.0778 5587.7857 \n",
"4 000905.SH 20250425 5627.1804 5613.1407 5661.5869 5596.5266 \n",
"0 000905.SH 20250509 5721.7225 5770.4410 5770.4410 5705.1654 \n",
"1 000905.SH 20250508 5773.8056 5731.7157 5783.7915 5724.9511 \n",
"2 000905.SH 20250507 5750.2911 5805.6560 5819.2422 5713.2734 \n",
"3 000905.SH 20250506 5740.3338 5668.8762 5740.3338 5666.4698 \n",
"4 000905.SH 20250430 5631.8249 5604.6537 5647.7821 5603.1718 \n",
"... ... ... ... ... ... ... \n",
"13492 399006.SZ 20100607 1069.4680 1005.0280 1075.2250 1001.7020 \n",
"13493 399006.SZ 20100604 1027.6810 989.6810 1027.6810 986.5040 \n",
"13494 399006.SZ 20100603 998.3940 1002.3550 1026.7020 997.7750 \n",
"13495 399006.SZ 20100602 997.1190 967.6090 997.1190 952.6110 \n",
"13496 399006.SZ 20100601 973.2330 986.0150 994.7930 948.1180 \n",
"13501 399006.SZ 20100607 1069.4680 1005.0280 1075.2250 1001.7020 \n",
"13502 399006.SZ 20100604 1027.6810 989.6810 1027.6810 986.5040 \n",
"13503 399006.SZ 20100603 998.3940 1002.3550 1026.7020 997.7750 \n",
"13504 399006.SZ 20100602 997.1190 967.6090 997.1190 952.6110 \n",
"13505 399006.SZ 20100601 973.2330 986.0150 994.7930 948.1180 \n",
"\n",
" pre_close change pct_chg vol amount \n",
"0 5631.8249 108.5089 1.9267 1.627736e+08 2.170600e+08 \n",
"1 5604.9057 26.9192 0.4803 1.383866e+08 1.816166e+08 \n",
"2 5598.2951 6.6106 0.1181 1.267429e+08 1.580330e+08 \n",
"3 5627.1804 -28.8853 -0.5133 1.362181e+08 1.676163e+08 \n",
"4 5605.8796 21.3008 0.3800 1.400008e+08 1.719338e+08 \n",
"0 5773.8056 -52.0831 -0.9021 1.239390e+08 1.781623e+08 \n",
"1 5750.2911 23.5145 0.4089 1.361403e+08 1.870326e+08 \n",
"2 5740.3338 9.9573 0.1735 1.710118e+08 2.275662e+08 \n",
"3 5631.8249 108.5089 1.9267 1.627736e+08 2.170600e+08 \n",
"4 5604.9057 26.9192 0.4803 1.383866e+08 1.816166e+08 \n",
"... ... ... ... ... ... \n",
"13492 1027.6810 41.7870 4.0661 2.655275e+06 9.106095e+06 \n",
"13493 998.3940 29.2870 2.9334 1.500295e+06 5.269441e+06 \n",
"13494 997.1190 1.2750 0.1279 1.616805e+06 6.240835e+06 \n",
"13495 973.2330 23.8860 2.4543 1.074628e+06 4.001206e+06 \n",
"13496 1000.0000 -26.7670 -2.6767 1.356285e+06 4.924177e+06 \n",
"13501 1027.6810 41.7870 4.0661 2.655275e+06 9.106095e+06 \n",
"13502 998.3940 29.2870 2.9334 1.500295e+06 5.269441e+06 \n",
"13503 997.1190 1.2750 0.1279 1.616805e+06 6.240835e+06 \n",
"13504 973.2330 23.8860 2.4543 1.074628e+06 4.001206e+06 \n",
"13505 1000.0000 -26.7670 -2.6767 1.356285e+06 4.924177e+06 \n",
"\n",
"[13497 rows x 11 columns]\n"
"[13506 rows x 11 columns]\n"
]
}
],

View File

@@ -39,15 +39,15 @@
"3 000006.SZ 20250312\n",
"4 000007.SZ 20250312\n",
"... ... ...\n",
"5381 920445.BJ 20250506\n",
"5382 920489.BJ 20250506\n",
"5383 920682.BJ 20250506\n",
"5384 920799.BJ 20250506\n",
"5385 920819.BJ 20250506\n",
"5384 920445.BJ 20250508\n",
"5385 920489.BJ 20250508\n",
"5386 920682.BJ 20250508\n",
"5387 920799.BJ 20250508\n",
"5388 920819.BJ 20250508\n",
"\n",
"[7654317 rows x 2 columns]\n",
"20250506\n",
"start_date: 20250507\n"
"[7665071 rows x 2 columns]\n",
"20250508\n",
"start_date: 20250509\n"
]
}
],
@@ -88,8 +88,8 @@
"text": [
"任务 20250619 完成\n",
"任务 20250620 完成\n",
"任务 20250617 完成\n",
"任务 20250618 完成\n",
"任务 20250617 完成\n",
"任务 20250616 完成\n",
"任务 20250613 完成\n",
"任务 20250612 完成\n",
@@ -104,20 +104,18 @@
"任务 20250529 完成\n",
"任务 20250528 完成\n",
"任务 20250527 完成\n",
"任务 20250526 完成\n",
"任务 20250523 完成\n",
"任务 20250522 完成\n",
"任务 20250526 完成\n",
"任务 20250521 完成\n",
"任务 20250520 完成\n",
"任务 20250522 完成\n",
"任务 20250519 完成\n",
"任务 20250520 完成\n",
"任务 20250516 完成\n",
"任务 20250515 完成\n",
"任务 20250514 完成\n",
"任务 20250513 完成\n",
"任务 20250512 完成\n",
"任务 20250509 完成\n",
"任务 20250508 完成\n",
"任务 20250507 完成\n"
"任务 20250509 完成\n"
]
}
],

View File

@@ -39,15 +39,15 @@
"3 801005.SI 20250221\n",
"4 801010.SI 20250221\n",
".. ... ...\n",
"434 859811.SI 20250506\n",
"435 859821.SI 20250506\n",
"436 859822.SI 20250506\n",
"437 859852.SI 20250506\n",
"438 859951.SI 20250506\n",
"434 859811.SI 20250508\n",
"435 859821.SI 20250508\n",
"436 859822.SI 20250508\n",
"437 859852.SI 20250508\n",
"438 859951.SI 20250508\n",
"\n",
"[1065465 rows x 2 columns]\n",
"20250506\n",
"start_date: 20250507\n"
"[1066343 rows x 2 columns]\n",
"20250508\n",
"start_date: 20250509\n"
]
}
],
@@ -88,8 +88,8 @@
"text": [
"任务 20250619 完成\n",
"任务 20250620 完成\n",
"任务 20250618 完成\n",
"任务 20250617 完成\n",
"任务 20250618 完成\n",
"任务 20250616 完成\n",
"任务 20250613 完成\n",
"任务 20250612 完成\n",
@@ -101,23 +101,21 @@
"任务 20250604 完成\n",
"任务 20250603 完成\n",
"任务 20250530 完成\n",
"任务 20250528 完成\n",
"任务 20250529 完成\n",
"任务 20250526 完成\n",
"任务 20250528 完成\n",
"任务 20250527 完成\n",
"任务 20250526 完成\n",
"任务 20250523 完成\n",
"任务 20250522 完成\n",
"任务 20250521 完成\n",
"任务 20250520 完成\n",
"任务 20250519 完成\n",
"任务 20250515 完成\n",
"任务 20250516 完成\n",
"任务 20250514 完成\n",
"任务 20250515 完成\n",
"任务 20250513 完成\n",
"任务 20250512 完成\n",
"任务 20250509 完成\n",
"任务 20250508 完成\n",
"任务 20250507 完成\n"
"任务 20250509 完成\n"
]
}
],

View File

@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "14671a7f72de2564",
"metadata": {
"ExecuteTime": {
@@ -80,7 +80,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"id": "e7f8cce2f80e2f20",
"metadata": {
"ExecuteTime": {
@@ -94,17 +94,17 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8599138 entries, 0 to 8599137\n",
"Index: 8615301 entries, 0 to 5388\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 196.8+ MB\n",
"memory usage: 197.2+ MB\n",
"None\n",
"20250430\n",
"20250506\n"
"20250508\n",
"20250509\n"
]
}
],
@@ -130,7 +130,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"ExecuteTime": {
@@ -148,8 +148,8 @@
"任务 20250717 完成\n",
"任务 20250716 完成\n",
"任务 20250715 完成\n",
"任务 20250714 完成\n",
"任务 20250711 完成\n",
"任务 20250714 完成\n",
"任务 20250710 完成\n",
"任务 20250709 完成\n",
"任务 20250708 完成\n",
@@ -178,14 +178,14 @@
"任务 20250605 完成\n",
"任务 20250604 完成\n",
"任务 20250603 完成\n",
"任务 20250529 完成\n",
"任务 20250530 完成\n",
"任务 20250527 完成\n",
"任务 20250529 完成\n",
"任务 20250528 完成\n",
"任务 20250527 完成\n",
"任务 20250526 完成\n",
"任务 20250523 完成\n",
"任务 20250521 完成\n",
"任务 20250522 完成\n",
"任务 20250521 完成\n",
"任务 20250520 完成\n",
"任务 20250519 完成\n",
"任务 20250516 完成\n",
@@ -193,10 +193,7 @@
"任务 20250514 完成\n",
"任务 20250513 完成\n",
"任务 20250512 完成\n",
"任务 20250509 完成\n",
"任务 20250508 完成\n",
"任务 20250507 完成\n",
"任务 20250506 完成\n"
"任务 20250509 完成\n"
]
}
],
@@ -253,7 +250,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "919023c693d7a47a",
"metadata": {
"ExecuteTime": {
@@ -266,59 +263,59 @@
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 301261.SZ 20250507 97.25 15.5042 19.6511 \n",
"1 002643.SZ 20250507 11.12 1.3481 2.3303 \n",
"2 001211.SZ 20250507 22.11 3.5506 6.1239 \n",
"3 002466.SZ 20250507 28.98 1.0588 1.5771 \n",
"4 603005.SH 20250507 29.32 5.1961 6.1690 \n",
"... ... ... ... ... ... \n",
"10769 000551.SZ 20250506 12.39 2.0213 3.1432 \n",
"10770 600792.SH 20250506 3.17 0.8036 2.3531 \n",
"10771 300176.SZ 20250506 6.62 1.7530 2.5325 \n",
"10772 000016.SZ 20250506 5.57 13.9545 20.7669 \n",
"10773 300339.SZ 20250506 56.53 11.3184 11.9579 \n",
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 300575.SZ 20250509 6.05 1.9284 2.1880 \n",
"1 300247.SZ 20250509 3.77 2.1735 2.5437 \n",
"2 603038.SH 20250509 15.80 17.5702 32.3972 \n",
"3 002030.SZ 20250509 5.82 0.8252 1.2070 \n",
"4 600157.SH 20250509 1.36 0.8369 1.0222 \n",
"... ... ... ... ... ... \n",
"5384 600841.SH 20250509 5.57 1.0271 3.2670 \n",
"5385 300968.SZ 20250509 14.76 1.2857 2.7636 \n",
"5386 300634.SZ 20250509 25.79 5.2551 9.4581 \n",
"5387 300295.SZ 20250509 15.73 3.0347 3.2458 \n",
"5388 688370.SH 20250509 19.15 1.2008 1.2008 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 0.84 122.6810 146.2352 5.5730 8.2774 8.3189 0.4627 \n",
"1 0.79 41.9902 45.3885 1.4569 2.8000 2.8594 2.6982 \n",
"2 0.83 56.0080 58.9563 1.8078 1.1637 1.1399 0.0000 \n",
"3 0.92 NaN NaN 1.1380 3.6409 3.6410 4.6569 \n",
"4 1.35 75.6520 71.1174 4.4020 16.9225 16.2060 0.1570 \n",
"... ... ... ... ... ... ... ... \n",
"10769 1.20 19.9692 18.7030 1.8602 1.1939 1.1927 0.5650 \n",
"10770 0.89 NaN NaN 1.1995 0.5271 0.5777 2.1767 \n",
"10771 1.12 92.1443 96.5538 2.7208 1.4839 1.4627 0.0000 \n",
"10772 3.66 NaN NaN 5.6643 1.2067 1.1979 0.0000 \n",
"10773 2.40 279.4392 270.1037 12.8967 13.2445 13.0061 0.0000 \n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 0.71 239.8914 NaN 1.3451 1.1608 1.1259 1.9835 \n",
"1 0.96 64.6952 53.1680 2.7649 4.4008 3.9673 0.0000 \n",
"2 4.47 183.7603 154.4297 3.1047 4.0259 3.7692 0.2434 \n",
"3 0.62 NaN NaN 1.0296 9.5754 9.9145 0.2577 \n",
"4 0.55 19.3625 26.3896 0.6394 1.0656 1.1327 0.4044 \n",
"... ... ... ... ... ... ... ... \n",
"5384 0.77 NaN NaN 2.3362 1.1952 1.2860 0.0000 \n",
"5385 0.71 115.0812 181.8721 3.2254 4.9990 5.1146 0.3388 \n",
"5386 1.01 50.5639 52.9222 4.1166 7.0433 6.7806 0.8063 \n",
"5387 0.65 NaN NaN 2.6398 24.2982 28.1758 0.0000 \n",
"5388 1.25 29.1668 36.1111 0.9812 4.4106 4.4983 NaN \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 0.4627 8789.0196 3748.3321 2957.3203 8.547322e+05 \n",
"1 2.6982 92996.9005 90932.5570 52604.5851 1.034126e+06 \n",
"2 NaN 7200.0000 6699.6575 3884.4502 1.591920e+05 \n",
"3 4.6569 164122.1583 147584.5634 99084.9325 4.756260e+06 \n",
"4 0.1570 65217.1706 65217.1706 54932.1940 1.912167e+06 \n",
"... ... ... ... ... ... \n",
"10769 0.5650 40394.4205 40263.2044 25893.0990 5.004869e+05 \n",
"10770 2.1767 110992.3600 105986.8113 36194.3684 3.518458e+05 \n",
"10771 NaN 38728.0800 38728.0800 26808.2764 2.563799e+05 \n",
"10772 NaN 240794.5408 159659.3800 107284.6868 1.341226e+06 \n",
"10773 NaN 79641.0841 77768.6667 73609.4256 4.502110e+06 \n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 1.9835 4.647564e+04 3.427082e+04 3.020469e+04 2.811776e+05 \n",
"1 NaN 8.040403e+04 8.032753e+04 6.863630e+04 3.031232e+05 \n",
"2 0.2434 2.686771e+04 2.686771e+04 1.457134e+04 4.245098e+05 \n",
"3 0.2577 1.403446e+05 1.403446e+05 9.595371e+04 8.168056e+05 \n",
"4 0.4044 2.221776e+06 2.221776e+06 1.819047e+06 3.021616e+06 \n",
"... ... ... ... ... ... \n",
"5384 NaN 1.387822e+05 1.043024e+05 3.279094e+04 7.730167e+05 \n",
"5385 0.3388 4.133800e+04 4.133800e+04 1.923185e+04 6.101489e+05 \n",
"5386 0.8063 4.512109e+04 4.346809e+04 2.415175e+04 1.163673e+06 \n",
"5387 NaN 1.896137e+04 1.675486e+04 1.566518e+04 2.982624e+05 \n",
"5388 NaN 1.371079e+04 4.374912e+03 4.374912e+03 2.625616e+05 \n",
"\n",
" circ_mv is_st \n",
"0 3.645253e+05 False \n",
"1 1.011170e+06 False \n",
"2 1.481294e+05 False \n",
"3 4.277001e+06 False \n",
"4 1.912167e+06 False \n",
"... ... ... \n",
"10769 4.988611e+05 False \n",
"10770 3.359782e+05 False \n",
"10771 2.563799e+05 False \n",
"10772 8.893027e+05 False \n",
"10773 4.396263e+06 False \n",
" circ_mv is_st \n",
"0 2.073385e+05 False \n",
"1 3.028348e+05 False \n",
"2 4.245098e+05 False \n",
"3 8.168056e+05 False \n",
"4 3.021616e+06 False \n",
"... ... ... \n",
"5384 5.809646e+05 False \n",
"5385 6.101489e+05 False \n",
"5386 1.121042e+06 False \n",
"5387 2.635540e+05 False \n",
"5388 8.377956e+04 False \n",
"\n",
"[10774 rows x 19 columns]\n"
"[5389 rows x 19 columns]\n"
]
}
],
@@ -329,7 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "28cb78d032671b20",
"metadata": {
"ExecuteTime": {
@@ -342,59 +339,59 @@
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"8 300147.SZ 20250507 6.58 5.3209 6.8857 \n",
"19 002501.SZ 20250507 2.10 2.8874 3.7273 \n",
"52 600238.SH 20250507 4.55 11.2843 13.8699 \n",
"63 300391.SZ 20250507 5.58 5.5505 7.0395 \n",
"73 600421.SH 20250507 4.99 2.8571 6.1511 \n",
"... ... ... ... ... ... \n",
"10647 600243.SH 20250506 2.43 6.7484 8.1172 \n",
"10652 002528.SZ 20250506 2.35 2.0592 4.3961 \n",
"10682 300044.SZ 20250506 3.31 12.8866 13.4490 \n",
"10712 300097.SZ 20250506 4.36 2.5814 3.0107 \n",
"10733 600200.SH 20250506 3.04 0.2013 0.2433 \n",
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"54 002496.SZ 20250509 1.43 3.1262 3.2341 \n",
"148 603828.SH 20250509 5.04 3.5674 7.1692 \n",
"166 600599.SH 20250509 7.70 10.8623 27.2882 \n",
"193 000820.SZ 20250509 2.16 5.5698 5.7239 \n",
"203 300506.SZ 20250509 3.28 0.6710 0.9449 \n",
"... ... ... ... ... ... \n",
"5204 002602.SZ 20250509 8.00 1.3867 1.7044 \n",
"5253 300147.SZ 20250509 7.37 7.2159 9.3379 \n",
"5264 002501.SZ 20250509 2.08 2.4301 3.1371 \n",
"5317 600421.SH 20250509 5.27 2.7391 5.8971 \n",
"5345 600289.SH 20250509 5.78 1.3847 2.0115 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"8 1.62 NaN NaN 4.4991 2.3410 2.5434 0.0 \n",
"19 1.28 NaN NaN 22.7988 22.3498 26.2757 0.0 \n",
"52 2.57 NaN NaN 20.0224 11.6394 12.3461 0.0 \n",
"63 1.35 NaN NaN NaN 17.5129 12.5138 0.0 \n",
"73 0.80 NaN NaN 135.5854 8.3301 8.4697 0.0 \n",
"... ... ... ... ... ... ... ... \n",
"10647 0.73 NaN NaN 1.6685 4.5071 4.6210 0.0 \n",
"10652 1.52 NaN NaN 15.5269 2.9812 3.6083 0.0 \n",
"10682 2.91 NaN NaN 24.3171 17.6463 26.1361 0.0 \n",
"10712 0.99 NaN NaN 2.7137 3.2758 3.8102 0.0 \n",
"10733 0.05 30.7156 NaN 1.2351 1.3543 1.7858 0.0 \n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"54 0.73 NaN NaN 1.6044 7.6992 7.2633 0.0 \n",
"148 1.65 349.9490 1691.0271 3.9734 1.2211 1.3170 0.0 \n",
"166 4.51 NaN NaN 11.5933 3.9468 4.0472 0.0 \n",
"193 1.00 NaN NaN 9.5443 11.2714 14.3393 0.0 \n",
"203 0.87 NaN NaN 28.5909 19.5183 19.3088 0.0 \n",
"... ... ... ... ... ... ... ... \n",
"5204 0.78 49.1432 31.1887 2.2169 2.6358 2.2496 0.0 \n",
"5253 1.74 NaN NaN 5.0393 2.6221 2.8487 0.0 \n",
"5264 0.87 NaN NaN 22.5816 22.1370 26.0255 0.0 \n",
"5317 0.74 NaN NaN 143.1934 8.7976 8.9449 0.0 \n",
"5345 0.55 NaN NaN 2.9752 11.3890 11.6628 0.0 \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"8 NaN 66127.9045 65745.9042 50804.9121 435121.6116 \n",
"19 NaN 355000.0000 354999.9006 274999.9006 745500.0000 \n",
"52 NaN 44820.0000 44500.1580 36204.3908 203931.0000 \n",
"63 NaN 35033.6112 35033.6112 27623.1259 195487.5505 \n",
"73 NaN 19560.0000 19560.0000 9085.2748 97604.4000 \n",
"... ... ... ... ... ... \n",
"10647 NaN 43885.0000 43885.0000 36485.0000 106640.5500 \n",
"10652 NaN 119867.5082 104974.0608 49171.2582 281688.6443 \n",
"10682 NaN 76386.9228 76375.7508 73182.1277 252840.7145 \n",
"10712 NaN 28854.9669 27000.9948 23150.5534 125807.6557 \n",
"10733 NaN 71215.1832 71087.9480 58808.3718 216494.1569 \n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"54 NaN 150758.9677 118138.6559 114196.4999 2.155853e+05 \n",
"148 NaN 59596.0158 59593.9625 29654.2988 3.003639e+05 \n",
"166 NaN 16600.0000 16600.0000 6607.7948 1.278200e+05 \n",
"193 NaN 64362.0201 29403.1899 28611.4718 1.390220e+05 \n",
"203 NaN 69559.6569 57572.5450 40880.9749 2.281557e+05 \n",
"... ... ... ... ... ... \n",
"5204 NaN 745255.6968 687870.8273 559649.7754 5.962046e+06 \n",
"5253 NaN 66127.9045 65745.9042 50804.9121 4.873627e+05 \n",
"5264 NaN 355000.0000 354999.9006 274999.9006 7.384000e+05 \n",
"5317 NaN 19560.0000 19560.0000 9085.2748 1.030812e+05 \n",
"5345 NaN 63105.2069 56592.2684 38956.2787 3.647481e+05 \n",
"\n",
" circ_mv is_st \n",
"8 432608.0496 True \n",
"19 745499.7913 True \n",
"52 202475.7189 True \n",
"63 195487.5505 True \n",
"73 97604.4000 True \n",
"54 1.689383e+05 True \n",
"148 3.003536e+05 True \n",
"166 1.278200e+05 True \n",
"193 6.351089e+04 True \n",
"203 1.888379e+05 True \n",
"... ... ... \n",
"10647 106640.5500 True \n",
"10652 246689.0429 True \n",
"10682 252803.7351 True \n",
"10712 117724.3373 True \n",
"10733 216107.3619 True \n",
"5204 5.502967e+06 True \n",
"5253 4.845473e+05 True \n",
"5264 7.383998e+05 True \n",
"5317 1.030812e+05 True \n",
"5345 3.271033e+05 True \n",
"\n",
"[394 rows x 19 columns]\n"
"[197 rows x 19 columns]\n"
]
}
],
@@ -404,7 +401,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"id": "692b58674b7462c9",
"metadata": {
"ExecuteTime": {
@@ -430,7 +427,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"id": "d7a773fc20293477",
"metadata": {
"ExecuteTime": {
@@ -444,7 +441,7 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8609912 entries, 0 to 10773\n",
"Index: 8620690 entries, 0 to 5388\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
@@ -452,7 +449,7 @@
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
"memory usage: 205.3+ MB\n",
"memory usage: 205.5+ MB\n",
"None\n"
]
}

File diff suppressed because it is too large Load Diff

View File

@@ -34,17 +34,17 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8440821 entries, 0 to 5120\n",
"Index: 8451068 entries, 0 to 5123\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 193.2+ MB\n",
"memory usage: 193.4+ MB\n",
"None\n",
"20250506\n",
"start_date: 20250507\n"
"20250508\n",
"start_date: 20250509\n"
]
}
],
@@ -84,30 +84,30 @@
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20250718 完成\n",
"任务 20250717 完成\n",
"任务 20250716 完成\n",
"任务 20250718 完成\n",
"任务 20250715 完成\n",
"任务 20250716 完成\n",
"任务 20250714 完成\n",
"任务 20250711 完成\n",
"任务 20250709 完成\n",
"任务 20250710 完成\n",
"任务 20250709 完成\n",
"任务 20250708 完成\n",
"任务 20250707 完成\n",
"任务 20250703 完成\n",
"任务 20250704 完成\n",
"任务 20250703 完成\n",
"任务 20250702 完成\n",
"任务 20250701 完成\n",
"任务 20250630 完成\n",
"任务 20250627 完成\n",
"任务 20250626 完成\n",
"任务 20250625 完成\n",
"任务 20250626 完成\n",
"任务 20250624 完成\n",
"任务 20250623 完成\n",
"任务 20250620 完成\n",
"任务 20250619 完成\n",
"任务 20250617 完成\n",
"任务 20250618 完成\n",
"任务 20250617 完成\n",
"任务 20250616 完成\n",
"任务 20250613 完成\n",
"任务 20250612 完成\n",
@@ -126,16 +126,14 @@
"任务 20250523 完成\n",
"任务 20250522 完成\n",
"任务 20250521 完成\n",
"任务 20250519 完成\n",
"任务 20250520 完成\n",
"任务 20250519 完成\n",
"任务 20250516 完成\n",
"任务 20250515 完成\n",
"任务 20250514 完成\n",
"任务 20250513 完成\n",
"任务 20250512 完成\n",
"任务 20250509 完成\n",
"任务 20250508 完成\n",
"任务 20250507 完成\n"
"任务 20250509 完成\n"
]
}
],

View File

@@ -34,23 +34,23 @@
"output_type": "stream",
"text": [
" ts_code trade_date\n",
"4745 600276.SH 20250506\n",
"4746 600278.SH 20250506\n",
"4747 600279.SH 20250506\n",
"4736 600262.SH 20250506\n",
"281 000791.SZ 20250506\n",
"2364 300067.SZ 20250508\n",
"2363 300066.SZ 20250508\n",
"2362 300065.SZ 20250508\n",
"2373 300076.SZ 20250508\n",
"7111 920819.BJ 20250508\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 10436295 entries, 0 to 113592\n",
"Index: 10450519 entries, 0 to 7111\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 238.9+ MB\n",
"memory usage: 239.2+ MB\n",
"None\n",
"20250506\n",
"20250507\n"
"20250508\n",
"20250509\n"
]
}
],
@@ -123,11 +123,11 @@
"任务 20250609 完成\n",
"任务 20250606 完成\n",
"任务 20250605 完成\n",
"任务 20250603 完成\n",
"任务 20250604 完成\n",
"任务 20250529 完成\n",
"任务 20250603 完成\n",
"任务 20250530 完成\n",
"任务 20250528 完成\n",
"任务 20250529 完成\n",
"任务 20250527 完成\n",
"任务 20250526 完成\n",
"任务 20250523 完成\n",
@@ -140,9 +140,7 @@
"任务 20250514 完成\n",
"任务 20250513 完成\n",
"任务 20250512 完成\n",
"任务 20250509 完成\n",
"任务 20250508 完成\n",
"任务 20250507 完成\n"
"任务 20250509 完成\n"
]
}
],
@@ -194,19 +192,19 @@
"output_type": "stream",
"text": [
"[ trade_date ts_code up_limit down_limit\n",
"0 20250507 000001.SZ 12.06 9.86\n",
"1 20250507 000002.SZ 7.51 6.15\n",
"2 20250507 000004.SZ 7.95 7.19\n",
"3 20250507 000006.SZ 7.11 5.81\n",
"4 20250507 000007.SZ 7.50 6.14\n",
"0 20250509 000001.SZ 12.19 9.97\n",
"1 20250509 000002.SZ 7.57 6.19\n",
"2 20250509 000004.SZ 7.86 7.12\n",
"3 20250509 000006.SZ 7.33 5.99\n",
"4 20250509 000007.SZ 7.66 6.26\n",
"... ... ... ... ...\n",
"7107 20250507 920445.BJ 13.42 7.24\n",
"7108 20250507 920489.BJ 31.69 17.07\n",
"7109 20250507 920682.BJ 16.41 8.85\n",
"7110 20250507 920799.BJ 78.58 42.32\n",
"7111 20250507 920819.BJ 5.82 3.14\n",
"7109 20250509 920445.BJ 13.14 7.08\n",
"7110 20250509 920489.BJ 31.70 17.08\n",
"7111 20250509 920682.BJ 16.17 8.71\n",
"7112 20250509 920799.BJ 78.39 42.21\n",
"7113 20250509 920819.BJ 5.74 3.10\n",
"\n",
"[7112 rows x 4 columns]]\n"
"[7114 rows x 4 columns]]\n"
]
}
],

View File

@@ -2547,3 +2547,305 @@ def limit_factor(df: pd.DataFrame) -> pd.DataFrame:
lambda x: calculate_consecutive_limits(x)[0]
)
return df
import pandas as pd
import numpy as np
# 假设 df 已经加载并包含 'ts_code', 'trade_date', 'pct_chg' 列
# 并且已经按照 'ts_code' 和 'trade_date' 进行了排序
def daily_momentum_benchmark(df):
    """
    Compute market-wide daily momentum benchmarks from the 'pct_chg' column.

    For every trade date, the mean 'pct_chg' of all rising rows (pct_chg > 0)
    and the mean 'pct_chg' of all falling rows (pct_chg < 0) are computed and
    attached to every row of that date.

    Args:
        df (pd.DataFrame): daily data with at least 'trade_date' and 'pct_chg'.

    Returns:
        pd.DataFrame: a new frame (result of a left merge, so the index is
        renumbered) with 'daily_positive_benchmark' and
        'daily_negative_benchmark' added. A date with no rising / no falling
        rows gets 0 for the corresponding benchmark. If 'pct_chg' is missing,
        an error is printed and the input frame is returned unchanged.
    """
    print("--- 计算日级别动量基准 (使用 pct_chg) ---")

    # Without pct_chg there is nothing to aggregate.
    if 'pct_chg' not in df.columns:
        print("错误: DataFrame中没有'pct_chg'列,无法计算日级别动量基准。")
        return df

    benchmark_cols = ['daily_positive_benchmark', 'daily_negative_benchmark']

    # Mask out the rows of the opposite sign, then let the (NaN-skipping)
    # group mean do the work. Equivalent to a per-date x[x > 0].mean() /
    # x[x < 0].mean(), without calling a Python lambda for every group.
    pct = df['pct_chg']
    signed = df[['trade_date']].assign(
        daily_positive_benchmark=pct.where(pct > 0),
        daily_negative_benchmark=pct.where(pct < 0),
    )
    daily_benchmarks = signed.groupby('trade_date', as_index=False).mean()

    # Broadcast the per-date benchmarks back onto every row.
    df = pd.merge(
        df,
        daily_benchmarks,
        on='trade_date',
        how='left'
    )

    # 0 means "no same-direction benchmark on that date". Assign the result
    # back instead of calling fillna(inplace=True) on a column selection:
    # the chained in-place form is deprecated and does nothing under pandas
    # Copy-on-Write.
    df[benchmark_cols] = df[benchmark_cols].fillna(0)

    print("日级别动量基准计算完成 (使用 pct_chg)。")
    return df
def daily_deviation(df):
    """
    Compute each row's deviation from the market-wide daily momentum benchmark.

    The benchmarks are obtained by calling daily_momentum_benchmark(df) here,
    so the caller only needs to supply the columns that function requires.

    Rule:
        * pct_chg > 0 and the positive benchmark > 0 -> pct_chg - positive benchmark
        * pct_chg < 0 and the negative benchmark < 0 -> pct_chg - negative benchmark
        * anything else                              -> 0

    Args:
        df (pd.DataFrame): daily data containing 'trade_date' and 'pct_chg'.

    Returns:
        pd.DataFrame: frame with a 'daily_deviation' column added; the two
        helper benchmark columns are dropped again before returning. If the
        required columns are unavailable, an error is printed and the frame
        is returned without the new column.
    """
    print("--- 计算日级别偏离度 (使用 pct_chg) ---")

    # Attach the per-date benchmarks first.
    df = daily_momentum_benchmark(df)

    required_cols = ['pct_chg', 'daily_positive_benchmark', 'daily_negative_benchmark']
    absent = [name for name in required_cols if name not in df.columns]
    if absent:
        print(f"错误: 计算日级别偏离度需要以下列: {required_cols}。请先运行 daily_momentum_benchmark(df)。")
        return df

    pct = df['pct_chg']
    up_ref = df['daily_positive_benchmark']
    down_ref = df['daily_negative_benchmark']

    # Compare a row only against the benchmark of its own direction.
    rising = (pct > 0) & (up_ref > 0)
    falling = (pct < 0) & (down_ref < 0)

    df['daily_deviation'] = np.select(
        [rising, falling],
        [pct - up_ref, pct - down_ref],
        default=0,
    )

    # The benchmarks were only needed as intermediates.
    df = df.drop(columns=['daily_positive_benchmark', 'daily_negative_benchmark'])

    print("日级别偏离度计算完成 (使用 pct_chg)。")
    return df
def daily_industry_momentum_benchmark(df):
    """
    Compute per-industry daily momentum benchmarks from 'pct_chg'.

    For every (trade_date, cat_l2_code) pair, the mean 'pct_chg' of the rising
    rows (pct_chg > 0) and the mean 'pct_chg' of the falling rows
    (pct_chg < 0) are computed and attached to every row of that pair.

    Args:
        df (pd.DataFrame): daily data containing 'ts_code', 'trade_date',
            'pct_chg' and 'cat_l2_code'.

    Returns:
        pd.DataFrame: a new frame (result of a left merge, so the index is
        renumbered) with 'daily_industry_positive_benchmark' and
        'daily_industry_negative_benchmark' added. A date/industry pair with
        no rising / no falling rows gets 0 for the corresponding benchmark.
        If a required column is missing, an error is printed and the input
        frame is returned unchanged.
    """
    print("--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---")

    required_cols = ['pct_chg', 'cat_l2_code', 'trade_date', 'ts_code']
    if not all(col in df.columns for col in required_cols):
        print(f"错误: 计算日级别行业动量基准需要以下列: {required_cols}。")
        return df

    group_keys = ['trade_date', 'cat_l2_code']
    benchmark_cols = ['daily_industry_positive_benchmark', 'daily_industry_negative_benchmark']

    # Mask out the rows of the opposite sign and take the (NaN-skipping)
    # group mean. Equivalent to a per-group x[x > 0].mean() / x[x < 0].mean(),
    # without a Python lambda call for every date/industry group.
    pct = df['pct_chg']
    signed = df[group_keys].assign(
        daily_industry_positive_benchmark=pct.where(pct > 0),
        daily_industry_negative_benchmark=pct.where(pct < 0),
    )
    industry_daily_benchmarks = signed.groupby(group_keys, as_index=False).mean()

    # Broadcast the per-date, per-industry benchmarks back onto every row.
    df = pd.merge(
        df,
        industry_daily_benchmarks,
        on=group_keys,
        how='left'
    )

    # 0 means "this industry had no same-direction stocks on that date".
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form is deprecated and does
    # nothing under pandas Copy-on-Write.
    df[benchmark_cols] = df[benchmark_cols].fillna(0)

    print("日级别行业动量基准计算完成 (使用 pct_chg 和 cat_l2_code)。")
    return df
def daily_industry_deviation(df):
    """
    Compute each row's deviation from its industry's daily momentum benchmark.

    The benchmarks are obtained by calling daily_industry_momentum_benchmark(df)
    here, so the caller only needs to supply the columns that function requires.

    Rule:
        * pct_chg > 0 and the industry positive benchmark > 0
              -> pct_chg - industry positive benchmark
        * pct_chg < 0 and the industry negative benchmark < 0
              -> pct_chg - industry negative benchmark
        * anything else -> 0

    Args:
        df (pd.DataFrame): daily data containing 'ts_code', 'trade_date',
            'pct_chg' and 'cat_l2_code'.

    Returns:
        pd.DataFrame: frame with a 'daily_industry_deviation' column added;
        the two helper benchmark columns are dropped again before returning.
        If the required columns are unavailable, an error is printed and the
        frame is returned without the new column.
    """
    print("--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---")

    # Attach the per-date, per-industry benchmarks first.
    df = daily_industry_momentum_benchmark(df)

    required_cols = ['pct_chg', 'daily_industry_positive_benchmark', 'daily_industry_negative_benchmark']
    absent = [name for name in required_cols if name not in df.columns]
    if absent:
        print(f"错误: 计算日级别行业偏离度需要以下列: {required_cols}。请先运行 daily_industry_momentum_benchmark(df)。")
        return df

    pct = df['pct_chg']
    up_ref = df['daily_industry_positive_benchmark']
    down_ref = df['daily_industry_negative_benchmark']

    # Compare a row only against the industry benchmark of its own direction.
    rising = (pct > 0) & (up_ref > 0)
    falling = (pct < 0) & (down_ref < 0)

    df['daily_industry_deviation'] = np.select(
        [rising, falling],
        [pct - up_ref, pct - down_ref],
        default=0,
    )

    # The benchmarks were only needed as intermediates.
    df = df.drop(columns=['daily_industry_positive_benchmark', 'daily_industry_negative_benchmark'])

    print("日级别行业偏离度计算完成 (使用 pct_chg 和行业基准)。")
    return df
def sentiment_panic_greed_index(df: pd.DataFrame, window_atr: int = 14, window_smooth: int = 5, factor_name: str = 'senti_panic_greed'):
    """
    Compute a panic/greed sentiment factor and store it in df[factor_name].

    raw = (true range / ATR) * sign(pct_chg) + 2 * opening gap
    factor = rolling mean of raw over `window_smooth` rows (min_periods=1)

    WARNING: modifies df in place (adds/overwrites the factor column).

    NOTE(review): the shift/rolling calls are not grouped, so this presumably
    expects a single instrument sorted by date ascending - confirm with callers.

    Args:
        df: frame with 'open', 'high', 'low', 'close', 'vol' and 'pct_chg'.
        window_atr: ATR look-back period passed to TA-Lib.
        window_smooth: window of the final rolling mean.
        factor_name: name of the output column.

    Returns:
        pd.DataFrame: the same df object, on every path. If the OHLCV columns
        are missing, or the computation raises, the factor column is filled
        with NaN and a message is printed.
    """
    print(f"Calculating {factor_name}...")

    if not all(col in df.columns for col in ['open', 'high', 'low', 'close', 'vol']):
        print(f"Error: DataFrame 缺少必需的 OHLCV 列。将为 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
        # Return the frame so `df = sentiment_panic_greed_index(df)` never
        # rebinds the caller's variable to None.
        return df

    try:
        # TRANGE/ATR look back one bar on their own, so they must receive the
        # unshifted close; feeding a pre-shifted close would measure the range
        # against the close from two bars earlier.
        # np.asarray: older TA-Lib wrappers return ndarrays, newer ones return
        # Series - normalise so the arithmetic below is positional either way.
        true_range = np.asarray(talib.TRANGE(df['high'], df['low'], df['close']))
        atr = np.asarray(talib.ATR(df['high'], df['low'], df['close'], timeperiod=window_atr))

        # Opening gap relative to the previous close (0 where there is none).
        prev_close = df['close'].shift(1)
        gap = (df['open'] / prev_close - 1).fillna(0)

        # Direction of the day: +1 / 0 / -1.
        direction = np.sign(df['pct_chg'].fillna(0))

        # Range relative to recent ATR, signed by direction, plus the gap
        # weighted x2 to amplify its influence.
        raw_senti = (true_range / (atr + epsilon)) * direction + gap * 2

        df[factor_name] = raw_senti.rolling(window_smooth, min_periods=1).mean()

    except Exception as e:
        # Best effort: a failing factor must not abort the whole pipeline.
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan

    print(f"Finished {factor_name}.")
    return df
def sentiment_market_breadth_proxy(df: pd.DataFrame, window_vol: int = 20, window_smooth: int = 3, factor_name: str = 'senti_breadth_proxy'):
    """
    Compute a price/volume agreement factor and store it in df[factor_name].

    raw = pct_chg * (vol / rolling mean of vol over `window_vol` rows)
    factor = rolling mean of raw over `window_smooth` rows (min_periods=1)

    WARNING: modifies df in place (adds/overwrites the factor column).

    Args:
        df: frame with 'pct_chg' and 'vol'.
        window_vol: window of the volume baseline; at least
            max(1, window_vol // 2) observations are required.
        window_smooth: window of the final rolling mean.
        factor_name: name of the output column.

    Returns:
        pd.DataFrame: the same df object, on every path. If a required column
        is missing, or the computation raises, the factor column is filled
        with NaN and a message is printed.
    """
    print(f"Calculating {factor_name}...")

    if not all(col in df.columns for col in ['pct_chg', 'vol']):
        print(f"Error: DataFrame 缺少 'pct_chg' 或 'vol' 列。将为 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
        # Return the frame so `df = sentiment_market_breadth_proxy(df)` never
        # rebinds the caller's variable to None.
        return df

    try:
        # Recent average volume as the baseline for "relative volume".
        rolling_avg_vol = df['vol'].rolling(window_vol, min_periods=max(1, window_vol // 2)).mean()

        # Price change weighted by how heavy today's volume is vs. the baseline.
        raw_breadth = df['pct_chg'] * (df['vol'] / (rolling_avg_vol + epsilon))

        df[factor_name] = raw_breadth.rolling(window_smooth, min_periods=1).mean()

    except Exception as e:
        # Best effort: a failing factor must not abort the whole pipeline.
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan

    print(f"Finished {factor_name}.")
    return df
def sentiment_reversal_indicator(df: pd.DataFrame, window_ret: int = 5, window_vol: int = 5, factor_name: str = 'senti_reversal'):
    """
    Compute a short-term reversal factor and store it in df[factor_name].

    factor = -(close / close.shift(window_ret) - 1) * rolling std of pct_chg

    WARNING: modifies df in place (adds/overwrites the factor column).

    NOTE(review): the shift/rolling calls are not grouped, so this presumably
    expects a single instrument sorted by date ascending - confirm with callers.

    Args:
        df: frame with 'close' and 'pct_chg'.
        window_ret: look-back (in rows) of the cumulative return.
        window_vol: window of the rolling standard deviation of 'pct_chg';
            at least max(1, window_vol // 2) observations are required.
        factor_name: name of the output column.

    Returns:
        pd.DataFrame: the same df object, on every path. If 'pct_chg' is
        missing, or the computation raises (e.g. 'close' is missing), the
        factor column is filled with NaN and a message is printed.
    """
    print(f"Calculating {factor_name}...")

    if 'pct_chg' not in df.columns:
        print(f"Error: DataFrame 缺少 'pct_chg' 列。将为 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
        # Return the frame so `df = sentiment_reversal_indicator(df)` never
        # rebinds the caller's variable to None.
        return df

    try:
        # Cumulative return over the last `window_ret` rows.
        return_m = (df['close'] / df['close'].shift(window_ret)) - 1

        # Realised volatility of daily changes over the last `window_vol` rows.
        volatility_m = df['pct_chg'].rolling(window_vol, min_periods=max(1, window_vol // 2)).std()

        # Negative sign: strong recent moves in volatile conditions are
        # scored as candidates for reversal.
        df[factor_name] = -return_m * volatility_m

    except Exception as e:
        # Best effort: a failing factor must not abort the whole pipeline.
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan

    print(f"Finished {factor_name}.")
    return df

File diff suppressed because one or more lines are too long

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"id": "79a7758178bafdd3",
"metadata": {
"ExecuteTime": {
@@ -18,6 +18,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n",
"e:\\PyProject\\NewStock\\main\\train\n"
]
}
@@ -44,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 9,
"id": "a79cafb06a7e0e43",
"metadata": {
"ExecuteTime": {
@@ -68,7 +70,7 @@
"cyq perf\n",
"left merge on ['ts_code', 'trade_date']\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 8601132 entries, 0 to 8601131\n",
"RangeIndex: 8611848 entries, 0 to 8611847\n",
"Data columns (total 32 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
@@ -143,7 +145,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 10,
"id": "cac01788dac10678",
"metadata": {
"ExecuteTime": {
@@ -211,7 +213,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 11,
"id": "c4e9e1d31da6dba6",
"metadata": {
"ExecuteTime": {
@@ -224,6 +226,8 @@
},
"outputs": [],
"source": [
"from main.factor.factor import *\n",
"\n",
"def calculate_indicators(df):\n",
" \"\"\"\n",
" 计算四个指标当日涨跌幅、5日移动平均、RSI、MACD。\n",
@@ -261,6 +265,10 @@
" df['amount_mean'] = df['amount'].rolling(window=20).mean() # 过去20天的平均成交额\n",
" df['amount_change_rate'] = (df['amount'] - df['amount_mean']) / df['amount_mean'] * 100 # 成交额变化率\n",
"\n",
" # df = sentiment_panic_greed_index(df)\n",
" # df = sentiment_market_breadth_proxy(df)\n",
" # df = sentiment_reversal_indicator(df)\n",
"\n",
" return df\n",
"\n",
"\n",
@@ -283,8 +291,10 @@
" df_final = df_all_indicators.pivot_table(\n",
" index='trade_date',\n",
" columns='ts_code',\n",
" values=['daily_return', 'RSI', 'MACD', 'Signal_line',\n",
" 'MACD_hist', 'up_ratio_20d', 'volume_change_rate', 'volatility',\n",
" values=['daily_return', \n",
" 'RSI', 'MACD', 'Signal_line', 'MACD_hist', \n",
" # 'sentiment_panic_greed_index',\n",
" 'up_ratio_20d', 'volume_change_rate', 'volatility',\n",
" 'amount_change_rate', 'amount_mean'],\n",
" aggfunc='last'\n",
" )\n",
@@ -303,7 +313,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 12,
"id": "a735bc02ceb4d872",
"metadata": {
"ExecuteTime": {
@@ -319,7 +329,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 30,
"id": "53f86ddc0677a6d7",
"metadata": {
"ExecuteTime": {
@@ -368,6 +378,10 @@
" lambda x: x.rank(pct=True))\n",
" industry_data['return_20_percentile'] = industry_data.groupby('trade_date')['return_20'].transform(\n",
" lambda x: x.rank(pct=True))\n",
"\n",
" # cs_rank_intraday_range(industry_data)\n",
" # cs_rank_close_pos_in_range(industry_data)\n",
"\n",
" industry_data = industry_data.drop(columns=['open', 'close', 'high', 'low', 'pe', 'pb', 'vol'])\n",
"\n",
" industry_data = industry_data.rename(\n",
@@ -382,7 +396,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 14,
"id": "dbe2fd8021b9417f",
"metadata": {
"ExecuteTime": {
@@ -410,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 15,
"id": "85c3e3d0235ffffa",
"metadata": {
"ExecuteTime": {
@@ -433,7 +447,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 16,
"id": "92d84ce15a562ec6",
"metadata": {
"ExecuteTime": {
@@ -459,6 +473,14 @@
"使用 'ann_date' 作为财务数据生效日期。\n",
"警告: 从 financial_data_subset 中移除了 366 行,因为其 'ts_code' 或 'ann_date' 列存在空值。\n",
"计算 BBI...\n",
"--- 计算日级别偏离度 (使用 pct_chg) ---\n",
"--- 计算日级别动量基准 (使用 pct_chg) ---\n",
"日级别动量基准计算完成 (使用 pct_chg)。\n",
"日级别偏离度计算完成 (使用 pct_chg)。\n",
"--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---\n",
"--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---\n",
"错误: 计算日级别行业动量基准需要以下列: ['pct_chg', 'cat_l2_code', 'trade_date', 'ts_code']。\n",
"错误: 计算日级别行业偏离度需要以下列: ['pct_chg', 'daily_industry_positive_benchmark', 'daily_industry_negative_benchmark']。请先运行 daily_industry_momentum_benchmark(df)。\n",
"Index(['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol',\n",
" 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv',\n",
" 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol',\n",
@@ -468,9 +490,9 @@
" 'winner_rate', 'l2_code', 'undist_profit_ps', 'ocfps', 'AR', 'BR',\n",
" 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio',\n",
" 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor',\n",
" 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol',\n",
" 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol',\n",
" 'lg_elg_buy_prop', 'flow_struct_buy_change',\n",
" 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity',\n",
" 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio',\n",
" 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change',\n",
" 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel',\n",
" 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy',\n",
" 'cost_support_15pct_change', 'cat_winner_price_zone',\n",
@@ -519,7 +541,7 @@
"Calculating cs_rank_flow_divergence...\n",
"Finished cs_rank_flow_divergence.\n",
"Calculating cs_rank_ind_adj_lg_flow...\n",
"Error calculating cs_rank_ind_adj_lg_flow: Missing 'cat_l2_code' column. Assigning NaN.\n",
"Finished cs_rank_ind_adj_lg_flow.\n",
"Calculating cs_rank_elg_buy_ratio...\n",
"Finished cs_rank_elg_buy_ratio.\n",
"Calculating cs_rank_rel_profit_margin...\n",
@@ -555,12 +577,12 @@
"Calculating cs_rank_size...\n",
"Finished cs_rank_size.\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 4503567 entries, 0 to 4503566\n",
"Columns: 177 entries, ts_code to cs_rank_size\n",
"dtypes: bool(10), datetime64[ns](1), float64(161), int32(3), object(2)\n",
"RangeIndex: 4509585 entries, 0 to 4509584\n",
"Columns: 178 entries, ts_code to cs_rank_size\n",
"dtypes: bool(10), datetime64[ns](1), float64(162), int32(3), object(2)\n",
"memory usage: 5.6+ GB\n",
"None\n",
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'cat_l2_code', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'price_cost_divergence', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 
'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_flow_divergence', 'cs_rank_ind_adj_lg_flow', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_opening_gap', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_ind_cap_neutral_pe', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size']\n"
"['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg', 'turnover_rate', 'pe_ttm', 'circ_mv', 'total_mv', 'volume_ratio', 'is_st', 'up_limit', 'down_limit', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol', 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct', 'cost_50pct', 'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate', 'cat_l2_code', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'log_circ_mv', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'flow_divergence_diff', 'flow_divergence_ratio', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'price_cost_divergence', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 
'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_flow_divergence', 'cs_rank_ind_adj_lg_flow', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_opening_gap', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_ind_cap_neutral_pe', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size']\n"
]
}
],
@@ -595,9 +617,14 @@
"df = turnover_rate_n(df, n=5)\n",
"df = variance_n(df, n=20)\n",
"df = bbi_ratio_factor(df)\n",
"df = daily_deviation(df)\n",
"df = daily_industry_deviation(df)\n",
"df, _ = get_rolling_factor(df)\n",
"df, _ = get_simple_factor(df)\n",
"\n",
"df = df.rename(columns={'l1_code': 'cat_l1_code'})\n",
"df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
"\n",
"lg_flow_mom_corr(df, N=20, M=60)\n",
"lg_flow_accel(df)\n",
"profit_pressure(df)\n",
@@ -636,8 +663,6 @@
"cs_rank_cost_dist_vol_ratio(df) # Needs volume_ratio\n",
"cs_rank_size(df) # Needs circ_mv\n",
"\n",
"df = df.rename(columns={'l1_code': 'cat_l1_code'})\n",
"df = df.rename(columns={'l2_code': 'cat_l2_code'})\n",
"\n",
"# df = df.merge(index_data, on='trade_date', how='left')\n",
"\n",
@@ -647,7 +672,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 17,
"id": "b87b938028afa206",
"metadata": {
"ExecuteTime": {
@@ -685,7 +710,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 18,
"id": "f4f16d63ad18d1bc",
"metadata": {
"ExecuteTime": {
@@ -931,7 +956,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 19,
"id": "40e6b68a91b30c79",
"metadata": {
"ExecuteTime": {
@@ -1217,12 +1242,48 @@
" n_actual = min(n, len(rankic_series))\n",
" top_features = rankic_series.sort_values(ascending=False).head(n_actual).index.tolist()\n",
" top_features = [col for col in feature_columns if col in top_features or col not in numeric_columns]\n",
" return top_features"
" return top_features\n",
"\n",
"def create_deviation_within_dates(df, feature_columns):\n",
" groupby_col = 'cat_l2_code' # 使用 trade_date 进行分组\n",
" new_columns = {}\n",
" ret_feature_columns = feature_columns[:]\n",
"\n",
" # 自动选择所有数值型特征\n",
" num_features = [col for col in feature_columns if 'cat' not in col and 'index' not in col]\n",
"\n",
" # num_features = ['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'cat_vol_spike', 'obv', 'maobv_6', 'return_5', 'return_10', 'return_20', 'std_return_5', 'std_return_15', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'act_factor5', 'act_factor6', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'alpha_022', 'alpha_003', 'alpha_007', 'alpha_013']\n",
" num_features = [col for col in num_features if 'cat' not in col and 'industry' not in col]\n",
" num_features = [col for col in num_features if 'limit' not in col]\n",
" num_features = [col for col in num_features if 'cyq' not in col]\n",
"\n",
" # 遍历所有数值型特征\n",
" for feature in num_features:\n",
" if feature == 'trade_date': # 不需要对 'trade_date' 计算偏差\n",
" continue\n",
"\n",
" # grouped_mean = df.groupby(['trade_date'])[feature].transform('mean')\n",
" # deviation_col_name = f'deviation_mean_{feature}'\n",
" # new_columns[deviation_col_name] = df[feature] - grouped_mean\n",
" # ret_feature_columns.append(deviation_col_name)\n",
"\n",
" grouped_mean = df.groupby(['trade_date', groupby_col])[feature].transform('mean')\n",
" deviation_col_name = f'deviation_mean_{feature}'\n",
" new_columns[deviation_col_name] = df[feature] - grouped_mean\n",
" ret_feature_columns.append(deviation_col_name)\n",
"\n",
" # 将新计算的偏差特征与原始 DataFrame 合并\n",
" df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)\n",
"\n",
" # for feature in ['obv', 'return_20', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4']:\n",
" # df[f'deviation_industry_{feature}'] = df[feature] - df[f'industry_{feature}']\n",
"\n",
" return df, ret_feature_columns\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 20,
"id": "47c12bb34062ae7a",
"metadata": {
"ExecuteTime": {
@@ -1256,7 +1317,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "29221dde",
"metadata": {},
"outputs": [],
@@ -1288,13 +1349,13 @@
"\n",
"# df = fill_nan_with_daily_median(df, feature_columns)\n",
"for feature_col in [col for col in feature_columns if col in df.columns]:\n",
" median_val = df[feature_col].median()\n",
" # median_val = df[feature_col].median()\n",
" df[feature_col].fillna(0, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 22,
"id": "b76ea08a",
"metadata": {},
"outputs": [
@@ -1302,11 +1363,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date log_circ_mv\n",
"0 000001.SZ 2019-01-02 16.574219\n",
"2738 000001.SZ 2019-01-03 16.583965\n",
"5477 000001.SZ 2019-01-04 16.633371\n",
"['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 
'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n",
" ts_code trade_date log_circ_mv\n",
"0 000001.SZ 2019-01-02 16.574219\n",
"1 000001.SZ 2019-01-03 16.583965\n",
"2 000001.SZ 2019-01-04 16.633371\n",
"['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 
'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', 'industry_cs_rank_intraday_range', 'industry_cs_rank_close_pos_in_range', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', 
'000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n",
"去除极值\n",
"开始截面 MAD 去极值处理 (k=3.0)...\n"
]
@@ -1315,7 +1376,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"MAD Filtering: 100%|██████████| 130/130 [00:28<00:00, 4.62it/s]\n"
"MAD Filtering: 100%|██████████| 131/131 [00:27<00:00, 4.69it/s]\n"
]
},
{
@@ -1330,7 +1391,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"MAD Filtering: 100%|██████████| 130/130 [00:23<00:00, 5.55it/s]\n"
"MAD Filtering: 100%|██████████| 131/131 [00:23<00:00, 5.52it/s]\n"
]
},
{
@@ -1368,25 +1429,26 @@
"output_type": "stream",
"text": [
"截面 MAD 去极值处理完成。\n",
"feature_columns: ['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 
'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', 
'399006.SZ_volume_change_rate']\n",
"feature_columns: ['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 
'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', 'industry_cs_rank_intraday_range', 'industry_cs_rank_close_pos_in_range', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', 
'000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n",
"df最小日期: 2019-01-02\n",
"df最大日期: 2025-05-07\n",
"2057678\n",
"df最大日期: 2025-05-09\n",
"2057671\n",
"train_data最小日期: 2020-01-02\n",
"train_data最大日期: 2022-12-30\n",
"1730630\n",
"1736644\n",
"test_data最小日期: 2023-01-03\n",
"test_data最大日期: 2025-05-07\n",
" ts_code trade_date log_circ_mv\n",
"0 000001.SZ 2019-01-02 16.574219\n",
"2738 000001.SZ 2019-01-03 16.583965\n",
"5477 000001.SZ 2019-01-04 16.633371\n"
"test_data最大日期: 2025-05-09\n",
" ts_code trade_date log_circ_mv\n",
"0 000001.SZ 2019-01-02 16.574219\n",
"1 000001.SZ 2019-01-03 16.583965\n",
"2 000001.SZ 2019-01-04 16.633371\n"
]
}
],
"source": [
"train_data = df[filter_index & (df['trade_date'] <= '2023-01-01') & (df['trade_date'] >= '2020-01-01')]\n",
"test_data = df[(df['trade_date'] >= '2023-01-01')]\n",
"split_date = '2023-01-01'\n",
"train_data = df[filter_index & (df['trade_date'] <= split_date) & (df['trade_date'] >= '2020-01-01')]\n",
"test_data = df[(df['trade_date'] >= split_date)]\n",
"\n",
"print(df[['ts_code', 'trade_date', 'log_circ_mv']].head(3))\n",
"\n",
@@ -1401,8 +1463,8 @@
"train_data, test_data = train_data.replace([np.inf, -np.inf], np.nan), test_data.replace([np.inf, -np.inf], np.nan)\n",
"\n",
"# feature_columns_new = feature_columns[:]\n",
"# train_data, _ = create_deviation_within_dates(train_data, feature_columns)\n",
"# test_data, _ = create_deviation_within_dates(test_data, feature_columns)\n",
"# train_data, _ = create_deviation_within_dates(train_data, [col for col in feature_columns if col in train_data.columns])\n",
"# test_data, _ = create_deviation_within_dates(test_data, [col for col in feature_columns if col in train_data.columns])\n",
"\n",
"# feature_columns = [\n",
"# 'undist_profit_ps', \n",
@@ -1511,75 +1573,10 @@
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e23d1759",
"execution_count": 34,
"id": "3ff2d1c5",
"metadata": {},
"outputs": [],
"source": [
"# feature_columns = [\n",
"# 'undist_profit_ps', \n",
"# 'AR_BR', \n",
"# # 'pe_ttm',\n",
"# # 'alpha_22_improved', \n",
"# # 'alpha_003', \n",
"# # 'alpha_007', \n",
"# # 'alpha_013', \n",
"# # 'cat_up_limit', \n",
"# # 'cat_down_limit', \n",
"# # 'up_limit_count_10d', \n",
"# # 'down_limit_count_10d', \n",
"# # 'consecutive_up_limit', \n",
"# # 'vol_break', \n",
"# # 'weight_roc5', \n",
"# # 'price_cost_divergence', \n",
"# # 'smallcap_concentration', \n",
"# # 'cost_stability', \n",
"# # 'high_cost_break_days', \n",
"# # 'liquidity_risk', \n",
"# # 'turnover_std', \n",
"# # 'mv_volatility', \n",
"# # 'volume_growth', \n",
"# # 'mv_growth', \n",
"# # 'lg_flow_mom_corr_20_60', \n",
"# # 'lg_flow_accel', \n",
"# # 'profit_pressure', \n",
"# # 'underwater_resistance', \n",
"# # 'cost_conc_std_20', \n",
"# # 'profit_decay_20', \n",
"# # 'vol_amp_loss_20', \n",
"# # 'vol_drop_profit_cnt_5', \n",
"# # 'lg_flow_vol_interact_20', \n",
"# # 'cost_break_confirm_cnt_5', \n",
"# # 'atr_norm_channel_pos_14', \n",
"# # 'turnover_diff_skew_20', \n",
"# # 'lg_sm_flow_diverge_20', \n",
"# # 'pullback_strong_20_20', \n",
"# # 'vol_wgt_hist_pos_20', \n",
"# # 'vol_adj_roc_20',\n",
"# 'cashflow_to_ev_factor',\n",
"# 'ocfps',\n",
"# 'book_to_price_ratio',\n",
"# 'turnover_rate_mean_5',\n",
"# 'variance_20',\n",
"# 'bbi_ratio_factor'\n",
"# ]\n",
"# feature_columns = [col for col in feature_columns if col in train_data.columns]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8f134d435f71e9e2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-03T14:57:51.050696Z",
"start_time": "2025-04-03T14:57:51.034030Z"
},
"jupyter": {
"source_hidden": true
}
},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LogisticRegression\n",
@@ -1590,12 +1587,12 @@
"import datetime # 用于日期计算\n",
"from catboost import CatBoostClassifier\n",
"from catboost import Pool\n",
"\n",
"import lightgbm as lgb\n",
"\n",
"def train_model(train_data_df, feature_columns,\n",
" print_info=True, # 调整参数名,更通用\n",
" validation_days=180, use_pca=False, split_date=None,\n",
" target_column='label'): # 增加目标列参数\n",
" target_column='label', type='light'): # 增加目标列参数\n",
"\n",
" print('train data size: ', len(train_data_df))\n",
" print(train_data_df[['ts_code', 'trade_date', 'log_circ_mv']])\n",
@@ -1636,38 +1633,80 @@
" \n",
" # # 使用处理后的特征和样本权重进行训练\n",
" # model.fit(X_train, y_train)\n",
" \n",
" \n",
" cat_features = [i for i, col in enumerate(feature_columns) if col.startswith('cat')]\n",
" print(f'cat_features: {cat_features}')\n",
" # cat_features = []\n",
"\n",
" params = {\n",
" 'loss_function': 'Logloss', # 适用于二分类\n",
" 'eval_metric': 'Logloss', # 评估指标\n",
" 'iterations': 1500,\n",
" 'learning_rate': 0.01,\n",
" 'depth': 8, # 控制模型复杂度\n",
" 'l2_leaf_reg': 5, # L2 正则化\n",
" 'verbose': 5000,\n",
" 'early_stopping_rounds': 3000,\n",
" 'one_hot_max_size': 50,\n",
" 'class_weights': [0.6, 1.2],\n",
" 'task_type': 'GPU',\n",
" 'has_time': True,\n",
" 'random_seed': 7\n",
" }\n",
"\n",
" train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)\n",
" val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)\n",
"\n",
"\n",
" model = CatBoostClassifier(**params)\n",
" model.fit(train_pool,\n",
" eval_set=val_pool, \n",
" plot=True, \n",
" use_best_model=True\n",
" )\n",
" if type == 'cat':\n",
" params = {\n",
" 'loss_function': 'Logloss', # 适用于二分类\n",
" 'eval_metric': 'Logloss', # 评估指标\n",
" 'iterations': 1500,\n",
" 'learning_rate': 0.01,\n",
" 'depth': 10, # 控制模型复杂度\n",
" 'l2_leaf_reg': 50, # L2 正则化\n",
" 'verbose': 5000,\n",
" 'early_stopping_rounds': 3000,\n",
" 'one_hot_max_size': 50,\n",
" 'class_weights': [0.6, 1.2],\n",
" 'task_type': 'GPU',\n",
" 'has_time': True,\n",
" 'random_seed': 7\n",
" }\n",
" cat_features = [i for i, col in enumerate(feature_columns) if col.startswith('cat')]\n",
" train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)\n",
" val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)\n",
"\n",
"\n",
" model = CatBoostClassifier(**params)\n",
" model.fit(train_pool,\n",
" eval_set=val_pool, \n",
" plot=True, \n",
" use_best_model=True\n",
" )\n",
" elif type == 'light':\n",
" params = {\n",
" 'objective': 'binary',\n",
" 'metric': 'average_precision',\n",
" 'learning_rate': 0.01,\n",
" 'is_unbalance': True,\n",
" 'num_leaves': 2048,\n",
" 'min_data_in_leaf': 1024,\n",
" 'max_depth': 32,\n",
" 'max_bin': 1024,\n",
" 'feature_fraction': 0.5,\n",
" 'bagging_fraction': 0.5,\n",
" 'bagging_freq': 1,\n",
" 'lambda_l1': 50,\n",
" 'lambda_l2': 50,\n",
" 'verbosity': -1,\n",
" 'num_threads' : 8\n",
" }\n",
" categorical_feature = [col for col in feature_columns if 'cat' in col]\n",
" train_dataset = lgb.Dataset(\n",
" X_train, label=y_train,\n",
" categorical_feature=categorical_feature\n",
" )\n",
" val_dataset = lgb.Dataset(\n",
" X_val, label=y_val,\n",
" categorical_feature=categorical_feature\n",
" )\n",
"\n",
" evals = {}\n",
" callbacks = [lgb.log_evaluation(period=1000),\n",
" lgb.callback.record_evaluation(evals),\n",
" lgb.early_stopping(100, first_metric_only=True)\n",
" ]\n",
" # 训练模型\n",
" model = lgb.train(\n",
" params, train_dataset, num_boost_round=1000,\n",
" valid_sets=[train_dataset, val_dataset], valid_names=['train', 'valid'],\n",
" callbacks=callbacks\n",
" )\n",
"\n",
" # 打印特征重要性(如果需要)\n",
" if True:\n",
" lgb.plot_metric(evals)\n",
" lgb.plot_importance(model, importance_type='split', max_num_features=20)\n",
" plt.show()\n",
"\n",
"\n",
" return model, scaler, None # 返回训练好的模型、scaler 和 pca 对象"
@@ -1675,7 +1714,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 35,
"id": "c6eb5cd4-e714-420a-ac48-39af3e11ee81",
"metadata": {
"ExecuteTime": {
@@ -1703,14 +1742,13 @@
"36399 600561.SH 2022-12-30 11.858571\n",
"\n",
"[36400 rows x 3 columns]\n",
"原始样本数: 36400, 去除标签为空后样本数: 36400\n",
"cat_features: [27, 30, 37, 39, 41, 80, 86, 87, 88, 100, 102, 141]\n"
"原始样本数: 36400, 去除标签为空后样本数: 36400\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "73e4fa876f004bafb847cea54620f732",
"model_id": "0acc9aa66b564c16ba0dfdaa7dab6a9e",
"version_major": 2,
"version_minor": 0
},
@@ -1725,11 +1763,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0:\tlearn: 0.6886803\ttest: 0.6892921\tbest: 0.6892921 (0)\ttotal: 141ms\tremaining: 3m 32s\n",
"1499:\tlearn: 0.3359066\ttest: 0.5174742\tbest: 0.5163721 (873)\ttotal: 1m 51s\tremaining: 0us\n",
"bestTest = 0.5163720682\n",
"bestIteration = 873\n",
"Shrink model to first 874 iterations.\n"
"0:\tlearn: 0.6886094\ttest: 0.6894541\tbest: 0.6894541 (0)\ttotal: 255ms\tremaining: 6m 22s\n",
"1499:\tlearn: 0.3197977\ttest: 0.5228570\tbest: 0.5197799 (414)\ttotal: 5m 23s\tremaining: 0us\n",
"bestTest = 0.5197798592\n",
"bestIteration = 414\n",
"Shrink model to first 415 iterations.\n"
]
}
],
@@ -1738,6 +1776,7 @@
"gc.collect()\n",
"\n",
"use_pca = False\n",
"type = 'cat'\n",
"# feature_contri = [2 if feat.startswith('act_factor') or 'buy' in feat or 'sell' in feat else 1 for feat in feature_columns]\n",
"# light_params['feature_contri'] = feature_contri\n",
"# print(f'feature_contri: {feature_contri}')\n",
@@ -1745,71 +1784,12 @@
" .dropna(subset=['label']).groupby('trade_date', group_keys=False)\n",
" .apply(lambda x: x.nsmallest(50, 'total_mv'))\n",
" .merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n",
" .merge(index_data, on='trade_date', how='left'), feature_columns)\n"
" .merge(index_data, on='trade_date', how='left'), feature_columns, type=type)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ec189398",
"metadata": {},
"outputs": [],
"source": [
"# if True:\n",
"# train_data_df = train_data.dropna(subset=['label']).groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(50, 'total_mv'))\n",
"# # 识别数值型特征列\n",
"\n",
"# # 去除标签为空的样本\n",
"# initial_len = len(train_data_df)\n",
"# train_data_df = train_data_df.dropna(subset=['label'])\n",
"\n",
"\n",
"# # 提取特征和标签,只取数值型特征用于线性回归\n",
" \n",
"# all_dates = train_data_df['trade_date'].unique() # 获取所有唯一的 trade_date\n",
"# split_date = all_dates[-validation_days] # 划分点为倒数第 validation_days 天\n",
"# val_data_split = train_data_df[train_data_df['trade_date'] >= split_date] # 验证集\n",
" \n",
"\n",
"# score_df = val_data_split\n",
"# score_df = fill_nan_with_daily_median(score_df, ['pe_ttm'])\n",
"# score_df = score_df[score_df['pe_ttm'] > 0]\n",
"# score_df = score_df.merge(industry_df, on=['cat_l2_code', 'trade_date'], how='left')\n",
"# score_df = score_df.merge(index_data, on='trade_date', how='left')\n",
"# # score_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(50, 'total_mv')).reset_index()\n",
"# numeric_columns = score_df.select_dtypes(include=['float64', 'int64']).columns\n",
"# numeric_columns = [col for col in feature_columns if col in numeric_columns]\n",
"# # score_df.loc[:, numeric_columns] = scaler.transform(score_df[numeric_columns])\n",
"# # score_df = cross_sectional_standardization(score_df, numeric_columns)\n",
"# print(score_df.columns.tolist())\n",
"\n",
"# score_df['score'] = model.predict_proba(score_df[feature_columns])[:, 1]\n",
"# score_df['score_ranks'] = score_df.groupby('trade_date')['score'].rank(ascending=True)\n",
"\n",
"# score_df = score_df.groupby('trade_date', group_keys=False).apply(\n",
"# lambda x: x[x['score'] >= x['score'].quantile(0.90)] # 计算90%分位数作为阈值,筛选分数>=阈值的行\n",
"# ).reset_index(drop=True) # drop=True 避免添加旧索引列\n",
"# # save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nlargest(1, 'score')).reset_index()\n",
"# save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(1, 'total_mv')).reset_index()\n",
"# # save_df[['trade_date', 'score', 'ts_code']].to_csv('predictions_test.tsv', index=False)\n",
"# import pandas as pd\n",
"# from sklearn.metrics import accuracy_score\n",
"\n",
"# # 假设 df 是你的 DataFrame\n",
"# # df = pd.read_csv('your_data.csv')\n",
"\n",
"# # 将预测分数转换为类别预测例如0.5 为阈值)\n",
"# save_df['pred'] = (save_df['score'] >= 0.5).astype(int)\n",
"\n",
"# # 计算准确率\n",
"# acc = accuracy_score(save_df['label'], save_df['pred'])\n",
"\n",
"# print(f\"准确率为:{acc:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 36,
"id": "5d1522a7538db91b",
"metadata": {
"ExecuteTime": {
@@ -1819,13 +1799,6 @@
},
"outputs": [],
"source": [
"# train_data = train_data.sort_values(by='trade_date')\n",
"# all_dates = train_data['trade_date'].unique() # 获取所有唯一的 trade_date\n",
"# split_date = all_dates[-120] # 划分点为倒数第 validation_days 天\n",
"# print(split_date)\n",
"# print(all_dates)\n",
"# val_data_split = train_data[train_data['trade_date'] >= split_date] # 验证集\n",
"\n",
"score_df = test_data.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(500, 'total_mv'))\n",
"# score_df = fill_nan_with_daily_median(score_df, ['pe_ttm'])\n",
"# score_df = score_df[score_df['pe_ttm'] > 0]\n",
@@ -1837,21 +1810,24 @@
"# score_df.loc[:, numeric_columns] = scaler.transform(score_df[numeric_columns])\n",
"# score_df = cross_sectional_standardization(score_df, numeric_columns)\n",
"\n",
"score_df['score'] = model.predict_proba(score_df[feature_columns])[:, 1]\n",
"if type == 'cat':\n",
" score_df['score'] = model.predict_proba(score_df[feature_columns])[:, 1]\n",
"elif type == 'light':\n",
" score_df['score'] = model.predict(score_df[feature_columns])\n",
"score_df['score_ranks'] = score_df.groupby('trade_date')['score'].rank(ascending=True)\n",
"\n",
"score_df = score_df.groupby('trade_date', group_keys=False).apply(\n",
" lambda x: x[x['score'] >= x['score'].quantile(0.90)] # 计算90%分位数作为阈值,筛选分数>=阈值的行\n",
").reset_index(drop=True) # drop=True 避免添加旧索引列\n",
"# save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nlargest(1, 'score')).reset_index()\n",
"save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(1, 'total_mv')).reset_index()\n",
"save_df = score_df.groupby('trade_date', group_keys=False).apply(lambda x: x.nsmallest(2, 'total_mv')).reset_index()\n",
"save_df = save_df.sort_values(['trade_date', 'score'])\n",
"save_df[['trade_date', 'score', 'ts_code']].to_csv('predictions_test.tsv', index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 37,
"id": "09b1799e",
"metadata": {},
"outputs": [
@@ -1859,8 +1835,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"190\n",
"['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 'lg_flow_mom_corr_20_60', 
'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', '399006.SZ_volume_change_rate']\n"
"191\n",
"['vol', 'pct_chg', 'turnover_rate', 'volume_ratio', 'winner_rate', 'undist_profit_ps', 'ocfps', 'AR', 'BR', 'AR_BR', 'cashflow_to_ev_factor', 'book_to_price_ratio', 'turnover_rate_mean_5', 'variance_20', 'bbi_ratio_factor', 'daily_deviation', 'lg_elg_net_buy_vol', 'flow_lg_elg_intensity', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop', 'flow_struct_buy_change', 'lg_elg_net_buy_vol_change', 'flow_lg_elg_accel', 'chip_concentration_range', 'chip_skewness', 'floating_chip_proxy', 'cost_support_15pct_change', 'cat_winner_price_zone', 'flow_chip_consistency', 'profit_taking_vs_absorb', 'cat_is_positive', 'upside_vol', 'downside_vol', 'vol_ratio', 'return_skew', 'return_kurtosis', 'volume_change_rate', 'cat_volume_breakout', 'turnover_deviation', 'cat_turnover_spike', 'avg_volume_ratio', 'cat_volume_ratio_breakout', 'vol_spike', 'vol_std_5', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'rsi_3', 'return_5', 'return_20', 'std_return_5', 'std_return_90', 'std_return_90_2', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'cov', 'delta_cov', 'alpha_22_improved', 'alpha_003', 'alpha_007', 'alpha_013', 'vol_break', 'weight_roc5', 'smallcap_concentration', 'cost_stability', 'high_cost_break_days', 'liquidity_risk', 'turnover_std', 'mv_volatility', 'volume_growth', 'mv_growth', 'momentum_factor', 'resonance_factor', 'log_close', 'cat_vol_spike', 'up', 'down', 'obv_maobv_6', 'std_return_5_over_std_return_90', 'std_return_90_minus_std_return_90_2', 'cat_af2', 'cat_af3', 'cat_af4', 'act_factor5', 'act_factor6', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol_minus_sell_lg_vol', 'buy_elg_vol_minus_sell_elg_vol', 'ctrl_strength', 'low_cost_dev', 'asymmetry', 'lock_factor', 'cat_vol_break', 'cost_atr_adj', 'cat_golden_resonance', 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover', 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum', 
'lg_flow_mom_corr_20_60', 'lg_flow_accel', 'profit_pressure', 'underwater_resistance', 'cost_conc_std_20', 'profit_decay_20', 'vol_amp_loss_20', 'vol_drop_profit_cnt_5', 'lg_flow_vol_interact_20', 'cost_break_confirm_cnt_5', 'atr_norm_channel_pos_14', 'turnover_diff_skew_20', 'lg_sm_flow_diverge_20', 'pullback_strong_20_20', 'vol_wgt_hist_pos_20', 'vol_adj_roc_20', 'cs_rank_net_lg_flow_val', 'cs_rank_elg_buy_ratio', 'cs_rank_rel_profit_margin', 'cs_rank_cost_breadth', 'cs_rank_dist_to_upper_cost', 'cs_rank_winner_rate', 'cs_rank_intraday_range', 'cs_rank_close_pos_in_range', 'cs_rank_pos_in_hist_range', 'cs_rank_vol_x_profit_margin', 'cs_rank_lg_flow_price_concordance', 'cs_rank_turnover_per_winner', 'cs_rank_volume_ratio', 'cs_rank_elg_buy_sell_sm_ratio', 'cs_rank_cost_dist_vol_ratio', 'cs_rank_size', 'cat_up_limit', 'industry_obv', 'industry_return_5', 'industry_return_20', 'industry__ema_5', 'industry__ema_13', 'industry__ema_20', 'industry__ema_60', 'industry_act_factor1', 'industry_act_factor2', 'industry_act_factor3', 'industry_act_factor4', 'industry_act_factor5', 'industry_act_factor6', 'industry_rank_act_factor1', 'industry_rank_act_factor2', 'industry_rank_act_factor3', 'industry_return_5_percentile', 'industry_return_20_percentile', '000852.SH_MACD', '000905.SH_MACD', '399006.SZ_MACD', '000852.SH_MACD_hist', '000905.SH_MACD_hist', '399006.SZ_MACD_hist', '000852.SH_RSI', '000905.SH_RSI', '399006.SZ_RSI', '000852.SH_Signal_line', '000905.SH_Signal_line', '399006.SZ_Signal_line', '000852.SH_amount_change_rate', '000905.SH_amount_change_rate', '399006.SZ_amount_change_rate', '000852.SH_amount_mean', '000905.SH_amount_mean', '399006.SZ_amount_mean', '000852.SH_daily_return', '000905.SH_daily_return', '399006.SZ_daily_return', '000852.SH_up_ratio_20d', '000905.SH_up_ratio_20d', '399006.SZ_up_ratio_20d', '000852.SH_volatility', '000905.SH_volatility', '399006.SZ_volatility', '000852.SH_volume_change_rate', '000905.SH_volume_change_rate', 
'399006.SZ_volume_change_rate']\n"
]
}
],
@@ -1871,7 +1847,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 38,
"id": "7e9023cc",
"metadata": {},
"outputs": [],
@@ -2071,7 +2047,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 39,
"id": "a0000d75",
"metadata": {},
"outputs": [
@@ -2081,7 +2057,7 @@
"text": [
"开始分析 'score' 在 'circ_mv' 和 'future_return' 下的表现...\n",
"准备数据,处理 NaN 值...\n",
"原始数据 28200 行,移除 NaN 后剩余 27807 行用于分析。\n",
"原始数据 28300 行,移除 NaN 后剩余 27850 行用于分析。\n",
"对 'circ_mv' 和 'future_return' 进行 100 分位数分箱...\n",
"按二维分箱分组计算 Spearman Rank IC...\n",
"整理结果用于绘图...\n",
@@ -2319,7 +2295,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 40,
"id": "a436dba4",
"metadata": {},
"outputs": [

2471
main/train/Classify3.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -10,6 +10,6 @@ from main.factor.factor import calculate_arbr
ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')
pro = ts.pro_api()
df = pro.dc_member(trade_date='20250102', ts_code='BK1184.DC')
df = pro.dc_member(trade_date='20190105')
print(df.sort_values('end_date'))
print(df)