Files
NewStock/main/data/update/update_daily_basic.ipynb
liaozhaorun e407225d29 feat(qmt): 优化定时重连机制避免与健康检查冲突
- 添加 is_scheduled_reconnecting 标志位协调重连逻辑
- 增强定时重连任务的日志前缀便于追踪
- 改进异常处理和资源清理日志
- 优化代码格式和注释
2026-02-09 22:12:14 +08:00

444 lines
17 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:57:36.913044Z",
"start_time": "2025-04-09T14:57:36.159612Z"
}
},
"outputs": [],
"source": [
"import tushare as ts\n",
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
"pro = ts.pro_api()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "14671a7f72de2564",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:57:39.128278Z",
"start_time": "2025-04-09T14:57:36.918051Z"
}
},
"outputs": [],
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"def filter_rows(df):\n",
" # 按照 name 和 start_date 分组\n",
" def select_row(group):\n",
" # 如果有 end_date 不为 NaT 的行,优先保留这些行\n",
" valid_rows = group[group['end_date'].notna()]\n",
" if not valid_rows.empty:\n",
" return valid_rows.iloc[0] # 返回第一个有效行\n",
" else:\n",
" return group.iloc[0] # 如果没有有效行,返回第一行\n",
"\n",
" filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n",
" filtered_df = filtered_df.reset_index(drop=True)\n",
" return filtered_df\n",
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
" target_date = datetime.strptime(target_date, '%Y%m%d')\n",
" if stock_code not in name_change_dict.keys():\n",
" return False\n",
" df = name_change_dict[stock_code]\n",
" for i in range(len(df)):\n",
" sds = df.iloc[i, 2]\n",
" eds = df.iloc[i, 3]\n",
" if eds is None or eds is pd.NaT:\n",
" eds = datetime.now()\n",
" if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
" return True\n",
" return False\n",
"\n",
"name_change_df = pd.read_hdf('/mnt/d/PyProject/NewStock/data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# 确保 name_change_df 的日期格式正确\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"# name_change_df = name_change_df[name_change_df.name.str.contains('ST') ]\n",
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
" # Keep only rows whose name contains 'ST' (which also matches '*ST') or '退'\n",
" # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
" st_data = group[(group['name'].str.contains('ST')) | (group['name'].str.contains('退'))]\n",
" if not st_data.empty:\n",
" name_change_dict[ts_code] = filter_rows(st_data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e7f8cce2f80e2f20",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:09.296046Z",
"start_time": "2025-04-09T14:57:39.339423Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 9602310 entries, 0 to 27316\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 219.8+ MB\n",
"None\n",
"20260130\n",
"20260202\n"
]
}
],
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"h5_filename = '/mnt/d/PyProject/NewStock/data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"max_date = None\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date']]\n",
" print(df.info())\n",
" max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20260310')\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.817010Z",
"start_time": "2025-04-09T14:58:09.326485Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20260309 完成\n",
"任务 20260310 完成\n",
"任务 20260305 完成\n",
"任务 20260306 完成\n",
"任务 20260303 完成\n",
"任务 20260304 完成\n",
"任务 20260302 完成\n",
"任务 20260227 完成\n",
"任务 20260226 完成\n",
"任务 20260225 完成\n",
"任务 20260224 完成\n",
"任务 20260213 完成\n",
"任务 20260212 完成\n",
"任务 20260211 完成\n",
"任务 20260210 完成\n",
"任务 20260209 完成\n",
"任务 20260206 完成\n",
"任务 20260205 完成\n",
"任务 20260204 完成\n",
"任务 20260203 完成\n",
"任务 20260202 完成\n"
]
}
],
"source": [
"\n",
"\n",
"# Collects the per-trade-date DataFrames returned by get_data; the HDF5 write happens in a later cell\n",
"all_daily_data = []\n",
"\n",
"# API 调用计数和时间控制变量\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
" daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
" if daily_basic_data is not None and not daily_basic_data.empty:\n",
" # 添加交易日期列标识\n",
" daily_basic_data['trade_date'] = trade_date\n",
" daily_basic_data['is_st'] = daily_basic_data.apply(\n",
" lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n",
" )\n",
" time.sleep(0.2)\n",
" # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
" return daily_basic_data\n",
"\n",
"\n",
"# 遍历每个交易日期并获取数据\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
"\n",
" for future in as_completed(future_to_date):\n",
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
" try:\n",
" result = future.result() # 获取任务执行的结果\n",
" all_daily_data.append(result)\n",
" print(f\"任务 {trade_date} 完成\")\n",
" except Exception as e:\n",
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
" # 计数一次 API 调用\n",
" api_call_count += 1\n",
"\n",
" # Every 150 calls, check whether less than 1 minute has elapsed; if so, wait out the remaining time\n",
" if api_call_count % 150 == 0:\n",
" elapsed = time.time() - batch_start_time\n",
" if elapsed < 60:\n",
" sleep_time = 60 - elapsed\n",
" print(f\"已调用 150 次 API等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
" time.sleep(sleep_time)\n",
" # 重置批次起始时间\n",
" batch_start_time = time.time()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "919023c693d7a47a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.864178Z",
"start_time": "2025-04-09T14:58:16.855084Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 001389.SZ 20260206 99.50 6.1259 6.1259 \n",
"1 600841.SH 20260206 9.55 4.4177 14.0519 \n",
"2 300968.SZ 20260206 13.71 1.0258 2.1909 \n",
"3 300634.SZ 20260206 28.46 4.0862 6.8510 \n",
"4 300295.SZ 20260206 11.99 3.5950 3.8451 \n",
"... ... ... ... ... ... \n",
"27325 603766.SH 20260202 14.89 1.1630 1.9407 \n",
"27326 603408.SH 20260202 12.92 0.7963 3.0049 \n",
"27327 000004.SZ 20260202 11.99 0.2338 0.3161 \n",
"27328 601628.SH 20260202 48.34 0.1384 1.9210 \n",
"27329 301042.SZ 20260202 86.10 3.6207 5.4797 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 0.83 62.6474 46.6770 11.6304 11.3424 8.6639 0.4819 \n",
"1 1.02 NaN NaN 4.1908 2.0493 2.4531 NaN \n",
"2 0.80 106.8945 225.1215 2.9871 4.6434 4.4246 0.3647 \n",
"3 0.68 55.7987 55.5119 4.4068 7.7725 7.2020 0.5590 \n",
"4 0.40 NaN NaN 2.0169 18.5210 23.1598 NaN \n",
"... ... ... ... ... ... ... ... \n",
"27325 1.04 27.2701 16.9932 3.2106 1.8177 1.5958 2.6864 \n",
"27326 0.99 11.9989 13.3289 1.7275 1.1550 1.1674 3.8700 \n",
"27327 0.05 NaN NaN 42.9590 16.0879 21.5107 NaN \n",
"27328 0.95 12.7771 8.0270 2.2069 2.5849 2.1373 1.4233 \n",
"27329 1.92 84.1572 534.3244 5.7108 8.6920 12.7424 NaN \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 0.4819 4.256875e+04 1.511580e+04 15115.7957 4.235591e+06 \n",
"1 NaN 1.387822e+05 1.043024e+05 32790.9410 1.325370e+06 \n",
"2 0.3647 4.133800e+04 4.133800e+04 19355.6537 5.667440e+05 \n",
"3 0.5590 4.512109e+04 4.345735e+04 25919.5274 1.284146e+06 \n",
"4 NaN 1.896137e+04 1.675259e+04 15662.9042 2.273468e+05 \n",
"... ... ... ... ... ... \n",
"27325 4.0296 2.053542e+05 2.053542e+05 123065.9980 3.057724e+06 \n",
"27326 5.3406 4.475730e+04 4.475730e+04 11860.1633 5.782643e+05 \n",
"27327 NaN 1.323803e+04 1.262878e+04 9339.3580 1.587240e+05 \n",
"27328 1.4233 2.826470e+06 2.082353e+06 150000.0000 1.366316e+08 \n",
"27329 NaN 6.972358e+03 6.602083e+03 4362.2827 6.003200e+05 \n",
"\n",
" circ_mv is_st \n",
"0 1.504022e+06 False \n",
"1 9.960884e+05 False \n",
"2 5.667440e+05 False \n",
"3 1.236796e+06 False \n",
"4 2.008635e+05 False \n",
"... ... ... \n",
"27325 3.057724e+06 False \n",
"27326 5.782643e+05 False \n",
"27327 1.514190e+05 True \n",
"27328 1.006609e+08 False \n",
"27329 5.684393e+05 False \n",
"\n",
"[27330 rows x 19 columns]\n"
]
}
],
"source": [
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "28cb78d032671b20",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.881685Z",
"start_time": "2025-04-09T14:58:16.871184Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"52 002713.SZ 20260206 9.52 4.6530 6.2693 \n",
"107 000609.SZ 20260206 8.23 0.2427 0.3213 \n",
"113 300052.SZ 20260206 13.86 3.5983 4.3260 \n",
"116 600624.SH 20260206 5.31 0.6260 0.8827 \n",
"119 300555.SZ 20260206 14.31 1.4520 1.7474 \n",
"... ... ... ... ... ... \n",
"27182 000668.SZ 20260202 13.85 3.5014 5.9253 \n",
"27240 003032.SZ 20260202 6.37 2.2066 3.1223 \n",
"27305 000691.SZ 20260202 8.30 1.9108 2.2317 \n",
"27312 300167.SZ 20260202 4.65 2.1705 2.3552 \n",
"27327 000004.SZ 20260202 11.99 0.2338 0.3161 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio dv_ttm \\\n",
"52 1.02 NaN NaN NaN 6.9890 12.8336 NaN NaN \n",
"107 0.05 NaN NaN NaN 8.0655 15.8232 NaN NaN \n",
"113 1.51 NaN NaN 9.6004 15.9784 18.8191 NaN NaN \n",
"116 0.60 NaN NaN 5.7984 5.5900 5.4551 NaN NaN \n",
"119 1.24 NaN NaN 6.0224 16.1604 18.5602 NaN NaN \n",
"... ... .. ... ... ... ... ... ... \n",
"27182 1.32 NaN NaN 2.8792 15.2387 7.2708 NaN NaN \n",
"27240 1.15 NaN NaN 2.0451 10.4354 7.6772 NaN NaN \n",
"27305 0.69 NaN NaN NaN 9.0850 9.0002 NaN NaN \n",
"27312 0.71 NaN 134.128 63.0032 4.3517 3.8269 NaN NaN \n",
"27327 0.05 NaN NaN 42.9590 16.0879 21.5107 NaN NaN \n",
"\n",
" total_share float_share free_share total_mv circ_mv is_st \n",
"52 95140.5184 52650.0831 39075.6229 905737.7352 501228.7911 True \n",
"107 29926.5522 29105.8272 21991.3472 246295.5246 239540.9579 True \n",
"113 26185.8710 26185.1210 21780.6332 362936.1721 362925.7771 True \n",
"116 67934.6942 67934.6942 48180.1806 360733.2262 360733.2262 True \n",
"119 20000.0000 19798.4863 16450.7792 286200.0000 283316.3390 True \n",
"... ... ... ... ... ... ... \n",
"27182 14684.1890 14684.1890 8677.2104 203376.0177 203376.0177 True \n",
"27240 40244.7500 27621.9885 19520.7308 256359.0575 175952.0667 True \n",
"27305 48490.5000 32327.0000 27679.8405 402471.1500 268314.1000 True \n",
"27312 38937.4000 35889.3250 33073.6637 181058.9100 166885.3613 True \n",
"27327 13238.0282 12628.7768 9339.3580 158723.9581 151419.0338 True \n",
"\n",
"[872 rows x 19 columns]\n"
]
}
],
"source": [
"print(all_daily_data_df[all_daily_data_df['is_st']])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "692b58674b7462c9",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:17.773453Z",
"start_time": "2025-04-09T14:58:16.903459Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有每日基础数据获取并保存完毕!\n"
]
}
],
"source": [
"# Append the data to the HDF5 file (table format)\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d7a773fc20293477",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:24.305403Z",
"start_time": "2025-04-09T14:58:17.816332Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 9629640 entries, 0 to 27329\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
"memory usage: 229.6+ MB\n",
"None\n"
]
}
],
"source": [
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
" print(df.info())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "stock",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}