Classify2

This commit is contained in:
liaozhaorun
2025-05-06 23:42:40 +08:00
parent 721e72c599
commit b783a6f968
19 changed files with 9390 additions and 2774 deletions

View File

@@ -2,6 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
@@ -9,16 +10,16 @@
"start_time": "2025-04-09T14:57:36.159612Z"
}
},
"outputs": [],
"source": [
"import tushare as ts\n",
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
"pro = ts.pro_api()"
],
"outputs": [],
"execution_count": 1
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "14671a7f72de2564",
"metadata": {
"ExecuteTime": {
@@ -26,6 +27,7 @@
"start_time": "2025-04-09T14:57:36.918051Z"
}
},
"outputs": [],
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
@@ -70,15 +72,15 @@
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
" # 只保留 'ST' 和 '*ST' 的记录\n",
" st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
" # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
" st_data = group[group['name'].str.contains('ST')]\n",
" if not st_data.empty:\n",
" name_change_dict[ts_code] = filter_rows(st_data)"
],
"outputs": [],
"execution_count": 2
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e7f8cce2f80e2f20",
"metadata": {
"ExecuteTime": {
@@ -86,6 +88,26 @@
"start_time": "2025-04-09T14:57:39.339423Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8599138 entries, 0 to 8599137\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 196.8+ MB\n",
"None\n",
"20250430\n",
"20250506\n"
]
}
],
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
@@ -99,44 +121,85 @@
" max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250420')\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250720')\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
],
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.817010Z",
"start_time": "2025-04-09T14:58:09.326485Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8512911 entries, 0 to 5391\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 194.8+ MB\n",
"None\n",
"20250408\n",
"20250409\n"
"任务 20250718 完成\n",
"任务 20250717 完成\n",
"任务 20250715 完成\n",
"任务 20250716 完成\n",
"任务 20250711 完成\n",
"任务 20250714 完成\n",
"任务 20250709 完成\n",
"任务 20250710 完成\n",
"任务 20250707 完成\n",
"任务 20250708 完成\n",
"任务 20250704 完成\n",
"任务 20250703 完成\n",
"任务 20250702 完成\n",
"任务 20250701 完成\n",
"任务 20250630 完成\n",
"任务 20250627 完成\n",
"任务 20250626 完成\n",
"任务 20250625 完成\n",
"任务 20250624 完成\n",
"任务 20250623 完成\n",
"任务 20250619 完成\n",
"任务 20250620 完成\n",
"任务 20250618 完成\n",
"任务 20250617 完成\n",
"任务 20250616 完成\n",
"任务 20250613 完成\n",
"任务 20250612 完成\n",
"任务 20250611 完成\n",
"任务 20250610 完成\n",
"任务 20250609 完成\n",
"任务 20250606 完成\n",
"任务 20250605 完成\n",
"任务 20250604 完成\n",
"任务 20250603 完成\n",
"任务 20250530 完成\n",
"任务 20250529 完成\n",
"任务 20250528 完成\n",
"任务 20250527 完成\n",
"任务 20250526 完成\n",
"任务 20250523 完成\n",
"任务 20250522 完成\n",
"任务 20250521 完成\n",
"任务 20250520 完成\n",
"任务 20250519 完成\n",
"任务 20250516 完成\n",
"任务 20250515 完成\n",
"任务 20250514 完成\n",
"任务 20250513 完成\n",
"任务 20250512 完成\n",
"任务 20250509 完成\n",
"任务 20250508 完成\n",
"任务 20250507 完成\n",
"任务 20250506 完成\n"
]
}
],
"execution_count": 3
},
{
"cell_type": "code",
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"scrolled": true,
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.817010Z",
"start_time": "2025-04-09T14:58:09.326485Z"
}
},
"source": [
"\n",
"\n",
@@ -186,27 +249,11 @@
" # 重置批次起始时间\n",
" batch_start_time = time.time()\n",
"\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20250418 完成\n",
"任务 20250417 完成\n",
"任务 20250416 完成\n",
"任务 20250415 完成\n",
"任务 20250414 完成\n",
"任务 20250411 完成\n",
"任务 20250410 完成\n",
"任务 20250409 完成\n"
]
}
],
"execution_count": 4
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "919023c693d7a47a",
"metadata": {
"ExecuteTime": {
@@ -214,75 +261,75 @@
"start_time": "2025-04-09T14:58:16.855084Z"
}
},
"source": [
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 300285.SZ 20250409 16.61 2.1086 2.2506 \n",
"1 300458.SZ 20250409 44.48 9.9286 11.7046 \n",
"2 605090.SH 20250409 23.81 0.6834 1.1888 \n",
"3 688686.SH 20250409 69.52 1.6005 5.7492 \n",
"4 002057.SZ 20250409 7.18 4.7461 7.1088 \n",
"... ... ... ... ... ... \n",
"5390 301511.SZ 20250409 12.23 3.4040 4.6900 \n",
"5391 688355.SH 20250409 15.84 1.4154 4.4898 \n",
"5392 600019.SH 20250409 6.83 0.4729 1.2898 \n",
"5393 603507.SH 20250409 22.00 30.8936 42.4775 \n",
"5394 600886.SH 20250409 14.58 0.7795 2.4989 \n",
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 002390.SZ 20250506 3.48 0.7696 1.3833 \n",
"1 300708.SZ 20250506 11.64 2.8994 3.2217 \n",
"2 301171.SZ 20250506 27.73 9.9120 10.7228 \n",
"3 301662.SZ 20250506 52.50 17.0926 17.0926 \n",
"4 001309.SZ 20250506 129.63 5.7123 6.3388 \n",
"... ... ... ... ... ... \n",
"5381 000551.SZ 20250506 12.39 2.0213 3.1432 \n",
"5382 600792.SH 20250506 3.17 0.8036 2.3531 \n",
"5383 300176.SZ 20250506 6.62 1.7530 2.5325 \n",
"5384 000016.SZ 20250506 5.57 13.9545 20.7669 \n",
"5385 300339.SZ 20250506 56.53 11.3184 11.9579 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 1.11 29.0985 27.1266 2.5144 4.2913 4.1010 0.6020 \n",
"1 1.54 168.9309 168.9309 9.3966 12.3119 12.3119 0.3364 \n",
"2 1.00 11.8377 9.0427 1.7135 0.5819 0.6421 3.2226 \n",
"3 1.18 43.8690 61.1222 2.9105 9.0031 9.2377 NaN \n",
"4 1.35 19.8304 29.3370 1.7625 1.9656 2.0487 3.2191 \n",
"... ... ... ... ... ... ... ... \n",
"5390 1.36 58.1209 NaN 1.9116 1.1803 1.1129 0.3212 \n",
"5391 1.31 133.9017 29.7427 1.8103 3.6805 3.1067 NaN \n",
"5392 1.28 12.5281 15.7915 0.7518 0.4344 0.4503 4.4796 \n",
"5393 2.89 22.7537 22.7537 1.6401 1.0276 1.0276 1.3553 \n",
"5394 1.04 17.4059 16.1402 1.8424 2.0579 1.9930 3.1604 \n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 1.02 66.7242 80.7223 1.0020 1.1214 1.1483 2.5321 \n",
"1 1.14 40.4767 37.8935 2.9328 2.8689 2.7390 1.3334 \n",
"2 0.95 56.4451 55.0565 3.6159 5.1380 4.3691 0.4867 \n",
"3 0.79 20.2143 23.5423 2.7909 2.0091 2.2310 NaN \n",
"4 1.02 59.8205 243.9150 8.6523 4.3939 4.0221 0.0702 \n",
"... ... ... ... ... ... ... ... \n",
"5381 1.20 19.9692 18.7030 1.8602 1.1939 1.1927 0.5650 \n",
"5382 0.89 NaN NaN 1.1995 0.5271 0.5777 2.1767 \n",
"5383 1.12 92.1443 96.5538 2.7208 1.4839 1.4627 0.0000 \n",
"5384 3.66 NaN NaN 5.6643 1.2067 1.1979 0.0000 \n",
"5385 2.40 279.4392 270.1037 12.8967 13.2445 13.0061 0.0000 \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 0.6020 9.970483e+04 8.039498e+04 75323.2612 1.656097e+06 \n",
"1 0.3364 6.332851e+04 5.179696e+04 43937.3622 2.816852e+06 \n",
"2 3.2226 6.492580e+04 6.426965e+04 36946.4646 1.545883e+06 \n",
"3 NaN 1.222355e+04 1.222355e+04 3402.7889 8.497809e+05 \n",
"4 3.2191 7.584828e+04 7.501396e+04 50081.8345 5.445906e+05 \n",
"... ... ... ... ... ... \n",
"5390 0.3212 6.303220e+04 3.736720e+04 27120.6014 7.708838e+05 \n",
"5391 NaN 1.239561e+04 1.239561e+04 3907.6756 1.963464e+05 \n",
"5392 4.4796 2.190864e+06 2.178208e+06 798651.6922 1.496360e+07 \n",
"5393 1.3553 1.843013e+04 1.843013e+04 13404.1045 4.054629e+05 \n",
"5394 3.1604 8.004494e+05 7.454180e+05 232532.2636 1.167055e+07 \n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 2.5321 194385.1868 185230.5076 103045.2550 6.764605e+05 \n",
"1 1.3003 68015.2346 52260.4413 47031.2918 7.916973e+05 \n",
"2 0.4867 47188.5905 30877.5025 28542.8345 1.308540e+06 \n",
"3 NaN 8000.0000 1577.6325 1577.6325 4.200000e+05 \n",
"4 NaN 16177.0306 8763.6153 7897.4398 2.097028e+06 \n",
"... ... ... ... ... ... \n",
"5381 0.5650 40394.4205 40263.2044 25893.0990 5.004869e+05 \n",
"5382 2.1767 110992.3600 105986.8113 36194.3684 3.518458e+05 \n",
"5383 NaN 38728.0800 38728.0800 26808.2764 2.563799e+05 \n",
"5384 NaN 240794.5408 159659.3800 107284.6868 1.341226e+06 \n",
"5385 NaN 79641.0841 77768.6667 73609.4256 4.502110e+06 \n",
"\n",
" circ_mv is_st \n",
"0 1.335361e+06 False \n",
"1 2.303929e+06 False \n",
"2 1.530260e+06 False \n",
"3 8.497809e+05 False \n",
"4 5.386002e+05 False \n",
"0 6.446022e+05 False \n",
"1 6.083115e+05 False \n",
"2 8.562331e+05 False \n",
"3 8.282571e+04 False \n",
"4 1.136027e+06 False \n",
"... ... ... \n",
"5390 4.570009e+05 False \n",
"5391 1.963464e+05 False \n",
"5392 1.487716e+07 False \n",
"5393 4.054629e+05 False \n",
"5394 1.086819e+07 False \n",
"5381 4.988611e+05 False \n",
"5382 3.359782e+05 False \n",
"5383 2.563799e+05 False \n",
"5384 8.893027e+05 False \n",
"5385 4.396263e+06 False \n",
"\n",
"[5395 rows x 19 columns]\n"
"[5386 rows x 19 columns]\n"
]
}
],
"execution_count": 5
"source": [
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "28cb78d032671b20",
"metadata": {
"ExecuteTime": {
@@ -290,74 +337,74 @@
"start_time": "2025-04-09T14:58:16.871184Z"
}
},
"source": [
"print(all_daily_data_df[all_daily_data_df['is_st']])"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"85 002822.SZ 20250409 3.11 1.8467 1.9219 \n",
"123 603959.SH 20250409 3.27 1.7568 2.2420 \n",
"181 688282.SH 20250409 42.59 2.5546 3.0570 \n",
"259 600777.SH 20250409 2.66 1.9331 2.4597 \n",
"283 002052.SZ 20250409 6.15 1.5326 2.5481 \n",
"23 000820.SZ 20250506 2.04 11.8279 12.1552 \n",
"33 300506.SZ 20250506 3.27 0.6104 0.8597 \n",
"82 839680.BJ 20250506 7.25 34.6648 39.7153 \n",
"105 300159.SZ 20250506 1.83 3.6351 4.0740 \n",
"114 300301.SZ 20250506 1.82 1.3707 1.4819 \n",
"... ... ... ... ... ... \n",
"5286 002602.SZ 20250409 5.93 3.0376 3.5162 \n",
"5345 002501.SZ 20250409 1.89 4.3252 5.5834 \n",
"5364 600387.SH 20250409 2.34 0.0904 0.1163 \n",
"5366 002656.SZ 20250409 1.95 2.7047 3.0210 \n",
"5378 300013.SZ 20250409 3.57 2.8370 3.1107 \n",
"5259 600243.SH 20250506 2.43 6.7484 8.1172 \n",
"5264 002528.SZ 20250506 2.35 2.0592 4.3961 \n",
"5294 300044.SZ 20250506 3.31 12.8866 13.4490 \n",
"5324 300097.SZ 20250506 4.36 2.5814 3.0107 \n",
"5345 600200.SH 20250506 3.04 0.2013 0.2433 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"85 2.59 NaN NaN 1.2023 0.5923 0.7314 0.0 \n",
"123 2.22 NaN NaN 4.3282 0.7749 1.1811 0.0 \n",
"181 1.07 NaN NaN 2.9277 172.3150 21.9335 NaN \n",
"259 0.96 6.9694 7.6204 0.8381 2.0443 2.0567 0.0 \n",
"283 0.74 NaN NaN NaN 19.5551 17.1988 0.0 \n",
"... ... ... ... ... ... ... ... \n",
"5286 3.30 84.3318 49.2129 1.6993 3.3267 2.3228 0.0 \n",
"5345 1.75 NaN NaN 7.0441 14.0701 19.7111 0.0 \n",
"5364 1.33 NaN NaN 0.3818 0.5148 0.8454 0.0 \n",
"5366 1.75 NaN NaN 3.8456 4.7986 5.9354 0.0 \n",
"5378 0.90 NaN NaN 8.2438 4.8281 4.2666 0.0 \n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"23 3.99 NaN NaN 9.0141 10.6452 13.5427 0.0 \n",
"33 0.77 NaN NaN 28.5038 19.4588 19.2499 0.0 \n",
"82 1.96 NaN NaN 7.4242 9.3299 11.0451 NaN \n",
"105 1.34 NaN NaN NaN 4.1337 4.1261 0.0 \n",
"114 1.22 NaN NaN 120.9449 2.9900 3.1074 0.0 \n",
"... ... ... ... ... ... ... ... \n",
"5259 0.73 NaN NaN 1.6685 4.5071 4.6210 0.0 \n",
"5264 1.52 NaN NaN 15.5269 2.9812 3.6083 0.0 \n",
"5294 2.91 NaN NaN 24.3171 17.6463 26.1361 0.0 \n",
"5324 0.99 NaN NaN 2.7137 3.2758 3.8102 0.0 \n",
"5345 0.05 30.7156 NaN 1.2351 1.3543 1.7858 0.0 \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"85 NaN 73467.1821 56245.3696 54046.3738 2.284829e+05 \n",
"123 NaN 49029.8992 49029.8992 38419.3842 1.603278e+05 \n",
"181 NaN 8800.0000 3652.0000 3051.8414 3.747920e+05 \n",
"259 NaN 680049.5825 636615.2391 500325.8436 1.808932e+06 \n",
"283 NaN 74595.9694 74595.5944 44867.2806 4.587652e+05 \n",
"... ... ... ... ... ... \n",
"5286 NaN 745255.6968 687870.8273 594244.1179 4.419366e+06 \n",
"5345 NaN 355000.0000 354999.9006 274999.9006 6.709500e+05 \n",
"5364 NaN 46814.4464 40404.8492 31411.4405 1.095458e+05 \n",
"5366 NaN 71251.9844 60945.7555 54564.8212 1.389414e+05 \n",
"5378 NaN 55835.8894 44606.0865 40680.8215 1.993341e+05 \n",
" dv_ttm total_share float_share free_share total_mv circ_mv \\\n",
"23 NaN 64362.0201 29403.1899 28611.4718 131298.5210 59982.5074 \n",
"33 NaN 69559.6569 57572.5450 40880.9749 227460.0781 188262.2222 \n",
"82 NaN 6699.9900 4689.3344 4093.0077 48574.9275 33997.6744 \n",
"105 NaN 150196.5923 147183.9203 131325.6306 274859.7639 269346.5741 \n",
"114 NaN 82986.8769 78987.6719 73061.8561 151036.1160 143757.5629 \n",
"... ... ... ... ... ... ... \n",
"5259 NaN 43885.0000 43885.0000 36485.0000 106640.5500 106640.5500 \n",
"5264 NaN 119867.5082 104974.0608 49171.2582 281688.6443 246689.0429 \n",
"5294 NaN 76386.9228 76375.7508 73182.1277 252840.7145 252803.7351 \n",
"5324 NaN 28854.9669 27000.9948 23150.5534 125807.6557 117724.3373 \n",
"5345 NaN 71215.1832 71087.9480 58808.3718 216494.1569 216107.3619 \n",
"\n",
" circ_mv is_st \n",
"85 1.749231e+05 True \n",
"123 1.603278e+05 True \n",
"181 1.555387e+05 True \n",
"259 1.693397e+06 True \n",
"283 4.587629e+05 True \n",
"... ... ... \n",
"5286 4.079074e+06 True \n",
"5345 6.709498e+05 True \n",
"5364 9.454735e+04 True \n",
"5366 1.188442e+05 True \n",
"5378 1.592437e+05 True \n",
" is_st \n",
"23 True \n",
"33 True \n",
"82 True \n",
"105 True \n",
"114 True \n",
"... ... \n",
"5259 True \n",
"5264 True \n",
"5294 True \n",
"5324 True \n",
"5345 True \n",
"\n",
"[106 rows x 19 columns]\n"
"[196 rows x 19 columns]\n"
]
}
],
"execution_count": 6
"source": [
"print(all_daily_data_df[all_daily_data_df['is_st']])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "692b58674b7462c9",
"metadata": {
"ExecuteTime": {
@@ -365,12 +412,6 @@
"start_time": "2025-04-09T14:58:16.903459Z"
}
},
"source": [
"# 将数据保存为 HDF5 文件table 格式)\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
],
"outputs": [
{
"name": "stdout",
@@ -380,10 +421,16 @@
]
}
],
"execution_count": 7
"source": [
"# 将数据保存为 HDF5 文件table 格式)\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d7a773fc20293477",
"metadata": {
"ExecuteTime": {
@@ -391,18 +438,13 @@
"start_time": "2025-04-09T14:58:17.816332Z"
}
},
"source": [
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
" print(df.info())"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8518306 entries, 0 to 5394\n",
"Index: 8604524 entries, 0 to 5385\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
@@ -410,17 +452,21 @@
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
"memory usage: 203.1+ MB\n",
"memory usage: 205.1+ MB\n",
"None\n"
]
}
],
"execution_count": 8
"source": [
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
" print(df.info())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "new_trader",
"language": "python",
"name": "python3"
},