- 添加 is_scheduled_reconnecting 标志位协调重连逻辑 - 增强定时重连任务的日志前缀便于追踪 - 改进异常处理和资源清理日志 - 优化代码格式和注释
444 lines
17 KiB
Plaintext
444 lines
17 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:57:36.913044Z",
|
||
"start_time": "2025-04-09T14:57:36.159612Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import tushare as ts\n",
|
||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||
"pro = ts.pro_api()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "14671a7f72de2564",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:57:39.128278Z",
|
||
"start_time": "2025-04-09T14:57:36.918051Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from datetime import datetime\n",
|
||
"import pandas as pd\n",
|
||
"import warnings\n",
|
||
"\n",
|
||
"warnings.filterwarnings(\"ignore\")\n",
|
||
"def filter_rows(df):\n",
|
||
" # 按照 name 和 start_date 分组\n",
|
||
" def select_row(group):\n",
|
||
" # 如果有 end_date 不为 NaT 的行,优先保留这些行\n",
|
||
" valid_rows = group[group['end_date'].notna()]\n",
|
||
" if not valid_rows.empty:\n",
|
||
" return valid_rows.iloc[0] # 返回第一个有效行\n",
|
||
" else:\n",
|
||
" return group.iloc[0] # 如果没有有效行,返回第一行\n",
|
||
"\n",
|
||
" filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n",
|
||
" filtered_df = filtered_df.reset_index(drop=True)\n",
|
||
" return filtered_df\n",
|
||
"\n",
|
||
"def is_st(name_change_dict, stock_code, target_date):\n",
|
||
" target_date = datetime.strptime(target_date, '%Y%m%d')\n",
|
||
" if stock_code not in name_change_dict.keys():\n",
|
||
" return False\n",
|
||
" df = name_change_dict[stock_code]\n",
|
||
" for i in range(len(df)):\n",
|
||
" sds = df.iloc[i, 2]\n",
|
||
" eds = df.iloc[i, 3]\n",
|
||
" if eds is None or eds is pd.NaT:\n",
|
||
" eds = datetime.now()\n",
|
||
" if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
|
||
" return True\n",
|
||
" return False\n",
|
||
"\n",
|
||
"name_change_df = pd.read_hdf('/mnt/d/PyProject/NewStock/data/name_change.h5', key='name_change')\n",
|
||
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
|
||
"\n",
|
||
"# 确保 name_change_df 的日期格式正确\n",
|
||
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
|
||
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
|
||
"# name_change_df = name_change_df[name_change_df.name.str.contains('ST') ]\n",
|
||
"name_change_dict = {}\n",
|
||
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
|
||
" # 只保留名称中包含 'ST'(含 '*ST')或 '退' 的记录\n",
|
||
" # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
|
||
" st_data = group[(group['name'].str.contains('ST')) | (group['name'].str.contains('退'))]\n",
|
||
" if not st_data.empty:\n",
|
||
" name_change_dict[ts_code] = filter_rows(st_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "e7f8cce2f80e2f20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:09.296046Z",
|
||
"start_time": "2025-04-09T14:57:39.339423Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 9602310 entries, 0 to 27316\n",
|
||
"Data columns (total 2 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
"dtypes: object(2)\n",
|
||
"memory usage: 219.8+ MB\n",
|
||
"None\n",
|
||
"20260130\n",
|
||
"20260202\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import time\n",
|
||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||
"\n",
|
||
"h5_filename = '/mnt/d/PyProject/NewStock/data/daily_basic.h5'\n",
|
||
"key = '/daily_basic'\n",
|
||
"max_date = None\n",
|
||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||
" df = store[key][['ts_code', 'trade_date']]\n",
|
||
" print(df.info())\n",
|
||
" max_date = df['trade_date'].max()\n",
|
||
"\n",
|
||
"print(max_date)\n",
|
||
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20260310')\n",
|
||
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
|
||
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
|
||
"start_date = min(trade_dates)\n",
|
||
"print(start_date)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.817010Z",
|
||
"start_time": "2025-04-09T14:58:09.326485Z"
|
||
},
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"任务 20260309 完成\n",
|
||
"任务 20260310 完成\n",
|
||
"任务 20260305 完成\n",
|
||
"任务 20260306 完成\n",
|
||
"任务 20260303 完成\n",
|
||
"任务 20260304 完成\n",
|
||
"任务 20260302 完成\n",
|
||
"任务 20260227 完成\n",
|
||
"任务 20260226 完成\n",
|
||
"任务 20260225 完成\n",
|
||
"任务 20260224 完成\n",
|
||
"任务 20260213 完成\n",
|
||
"任务 20260212 完成\n",
|
||
"任务 20260211 完成\n",
|
||
"任务 20260210 完成\n",
|
||
"任务 20260209 完成\n",
|
||
"任务 20260206 完成\n",
|
||
"任务 20260205 完成\n",
|
||
"任务 20260204 完成\n",
|
||
"任务 20260203 完成\n",
|
||
"任务 20260202 完成\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"\n",
|
||
"# 用列表收集各交易日的数据,稍后合并并写入 HDF5 文件\n",
|
||
"all_daily_data = []\n",
|
||
"\n",
|
||
"# API 调用计数和时间控制变量\n",
|
||
"api_call_count = 0\n",
|
||
"batch_start_time = time.time()\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_data(trade_date):\n",
|
||
" daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
|
||
" if daily_basic_data is not None and not daily_basic_data.empty:\n",
|
||
" # 添加交易日期列标识\n",
|
||
" daily_basic_data['trade_date'] = trade_date\n",
|
||
" daily_basic_data['is_st'] = daily_basic_data.apply(\n",
|
||
" lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n",
|
||
" )\n",
|
||
" time.sleep(0.2)\n",
|
||
" # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
|
||
" return daily_basic_data\n",
|
||
"\n",
|
||
"\n",
|
||
"# 遍历每个交易日期并获取数据\n",
|
||
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
|
||
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
|
||
"\n",
|
||
" for future in as_completed(future_to_date):\n",
|
||
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
|
||
" try:\n",
|
||
" result = future.result() # 获取任务执行的结果\n",
|
||
" all_daily_data.append(result)\n",
|
||
" print(f\"任务 {trade_date} 完成\")\n",
|
||
" except Exception as e:\n",
|
||
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
|
||
" # 计数一次 API 调用\n",
|
||
" api_call_count += 1\n",
|
||
"\n",
|
||
" # 每调用 150 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n",
|
||
" if api_call_count % 150 == 0:\n",
|
||
" elapsed = time.time() - batch_start_time\n",
|
||
" if elapsed < 60:\n",
|
||
" sleep_time = 60 - elapsed\n",
|
||
" print(f\"已调用 150 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
|
||
" time.sleep(sleep_time)\n",
|
||
" # 重置批次起始时间\n",
|
||
" batch_start_time = time.time()\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "919023c693d7a47a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.864178Z",
|
||
"start_time": "2025-04-09T14:58:16.855084Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"0 001389.SZ 20260206 99.50 6.1259 6.1259 \n",
|
||
"1 600841.SH 20260206 9.55 4.4177 14.0519 \n",
|
||
"2 300968.SZ 20260206 13.71 1.0258 2.1909 \n",
|
||
"3 300634.SZ 20260206 28.46 4.0862 6.8510 \n",
|
||
"4 300295.SZ 20260206 11.99 3.5950 3.8451 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"27325 603766.SH 20260202 14.89 1.1630 1.9407 \n",
|
||
"27326 603408.SH 20260202 12.92 0.7963 3.0049 \n",
|
||
"27327 000004.SZ 20260202 11.99 0.2338 0.3161 \n",
|
||
"27328 601628.SH 20260202 48.34 0.1384 1.9210 \n",
|
||
"27329 301042.SZ 20260202 86.10 3.6207 5.4797 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"0 0.83 62.6474 46.6770 11.6304 11.3424 8.6639 0.4819 \n",
|
||
"1 1.02 NaN NaN 4.1908 2.0493 2.4531 NaN \n",
|
||
"2 0.80 106.8945 225.1215 2.9871 4.6434 4.4246 0.3647 \n",
|
||
"3 0.68 55.7987 55.5119 4.4068 7.7725 7.2020 0.5590 \n",
|
||
"4 0.40 NaN NaN 2.0169 18.5210 23.1598 NaN \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"27325 1.04 27.2701 16.9932 3.2106 1.8177 1.5958 2.6864 \n",
|
||
"27326 0.99 11.9989 13.3289 1.7275 1.1550 1.1674 3.8700 \n",
|
||
"27327 0.05 NaN NaN 42.9590 16.0879 21.5107 NaN \n",
|
||
"27328 0.95 12.7771 8.0270 2.2069 2.5849 2.1373 1.4233 \n",
|
||
"27329 1.92 84.1572 534.3244 5.7108 8.6920 12.7424 NaN \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv \\\n",
|
||
"0 0.4819 4.256875e+04 1.511580e+04 15115.7957 4.235591e+06 \n",
|
||
"1 NaN 1.387822e+05 1.043024e+05 32790.9410 1.325370e+06 \n",
|
||
"2 0.3647 4.133800e+04 4.133800e+04 19355.6537 5.667440e+05 \n",
|
||
"3 0.5590 4.512109e+04 4.345735e+04 25919.5274 1.284146e+06 \n",
|
||
"4 NaN 1.896137e+04 1.675259e+04 15662.9042 2.273468e+05 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"27325 4.0296 2.053542e+05 2.053542e+05 123065.9980 3.057724e+06 \n",
|
||
"27326 5.3406 4.475730e+04 4.475730e+04 11860.1633 5.782643e+05 \n",
|
||
"27327 NaN 1.323803e+04 1.262878e+04 9339.3580 1.587240e+05 \n",
|
||
"27328 1.4233 2.826470e+06 2.082353e+06 150000.0000 1.366316e+08 \n",
|
||
"27329 NaN 6.972358e+03 6.602083e+03 4362.2827 6.003200e+05 \n",
|
||
"\n",
|
||
" circ_mv is_st \n",
|
||
"0 1.504022e+06 False \n",
|
||
"1 9.960884e+05 False \n",
|
||
"2 5.667440e+05 False \n",
|
||
"3 1.236796e+06 False \n",
|
||
"4 2.008635e+05 False \n",
|
||
"... ... ... \n",
|
||
"27325 3.057724e+06 False \n",
|
||
"27326 5.782643e+05 False \n",
|
||
"27327 1.514190e+05 True \n",
|
||
"27328 1.006609e+08 False \n",
|
||
"27329 5.684393e+05 False \n",
|
||
"\n",
|
||
"[27330 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
|
||
"print(all_daily_data_df)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "28cb78d032671b20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.881685Z",
|
||
"start_time": "2025-04-09T14:58:16.871184Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"52 002713.SZ 20260206 9.52 4.6530 6.2693 \n",
|
||
"107 000609.SZ 20260206 8.23 0.2427 0.3213 \n",
|
||
"113 300052.SZ 20260206 13.86 3.5983 4.3260 \n",
|
||
"116 600624.SH 20260206 5.31 0.6260 0.8827 \n",
|
||
"119 300555.SZ 20260206 14.31 1.4520 1.7474 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"27182 000668.SZ 20260202 13.85 3.5014 5.9253 \n",
|
||
"27240 003032.SZ 20260202 6.37 2.2066 3.1223 \n",
|
||
"27305 000691.SZ 20260202 8.30 1.9108 2.2317 \n",
|
||
"27312 300167.SZ 20260202 4.65 2.1705 2.3552 \n",
|
||
"27327 000004.SZ 20260202 11.99 0.2338 0.3161 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio dv_ttm \\\n",
|
||
"52 1.02 NaN NaN NaN 6.9890 12.8336 NaN NaN \n",
|
||
"107 0.05 NaN NaN NaN 8.0655 15.8232 NaN NaN \n",
|
||
"113 1.51 NaN NaN 9.6004 15.9784 18.8191 NaN NaN \n",
|
||
"116 0.60 NaN NaN 5.7984 5.5900 5.4551 NaN NaN \n",
|
||
"119 1.24 NaN NaN 6.0224 16.1604 18.5602 NaN NaN \n",
|
||
"... ... .. ... ... ... ... ... ... \n",
|
||
"27182 1.32 NaN NaN 2.8792 15.2387 7.2708 NaN NaN \n",
|
||
"27240 1.15 NaN NaN 2.0451 10.4354 7.6772 NaN NaN \n",
|
||
"27305 0.69 NaN NaN NaN 9.0850 9.0002 NaN NaN \n",
|
||
"27312 0.71 NaN 134.128 63.0032 4.3517 3.8269 NaN NaN \n",
|
||
"27327 0.05 NaN NaN 42.9590 16.0879 21.5107 NaN NaN \n",
|
||
"\n",
|
||
" total_share float_share free_share total_mv circ_mv is_st \n",
|
||
"52 95140.5184 52650.0831 39075.6229 905737.7352 501228.7911 True \n",
|
||
"107 29926.5522 29105.8272 21991.3472 246295.5246 239540.9579 True \n",
|
||
"113 26185.8710 26185.1210 21780.6332 362936.1721 362925.7771 True \n",
|
||
"116 67934.6942 67934.6942 48180.1806 360733.2262 360733.2262 True \n",
|
||
"119 20000.0000 19798.4863 16450.7792 286200.0000 283316.3390 True \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"27182 14684.1890 14684.1890 8677.2104 203376.0177 203376.0177 True \n",
|
||
"27240 40244.7500 27621.9885 19520.7308 256359.0575 175952.0667 True \n",
|
||
"27305 48490.5000 32327.0000 27679.8405 402471.1500 268314.1000 True \n",
|
||
"27312 38937.4000 35889.3250 33073.6637 181058.9100 166885.3613 True \n",
|
||
"27327 13238.0282 12628.7768 9339.3580 158723.9581 151419.0338 True \n",
|
||
"\n",
|
||
"[872 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(all_daily_data_df[all_daily_data_df['is_st']])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "692b58674b7462c9",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:17.773453Z",
|
||
"start_time": "2025-04-09T14:58:16.903459Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"所有每日基础数据获取并保存完毕!\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 将数据保存为 HDF5 文件(table 格式)\n",
|
||
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
|
||
"\n",
|
||
"print(\"所有每日基础数据获取并保存完毕!\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "d7a773fc20293477",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:24.305403Z",
|
||
"start_time": "2025-04-09T14:58:17.816332Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 9629640 entries, 0 to 27329\n",
|
||
"Data columns (total 3 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
" 2 is_st bool \n",
|
||
"dtypes: bool(1), object(2)\n",
|
||
"memory usage: 229.6+ MB\n",
|
||
"None\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
|
||
" print(df.info())"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "stock",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|