Files
NewStock/main/data/update/update_daily_basic.ipynb

452 lines
18 KiB
Plaintext
Raw Normal View History

2025-02-12 00:21:33 +08:00
{
"cells": [
{
"cell_type": "code",
2025-05-06 23:42:40 +08:00
"execution_count": 1,
2025-02-12 00:21:33 +08:00
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:57:36.913044Z",
"start_time": "2025-04-09T14:57:36.159612Z"
2025-02-12 00:21:33 +08:00
}
},
2025-05-06 23:42:40 +08:00
"outputs": [],
2025-02-12 00:21:33 +08:00
"source": [
"import os\n",
"\n",
"import tushare as ts\n",
"\n",
"# The tushare API token is a credential, so it is read from the TUSHARE_TOKEN\n",
"# environment variable instead of being stored in the notebook.\n",
"tushare_token = os.environ.get('TUSHARE_TOKEN')\n",
"if not tushare_token:\n",
"    raise RuntimeError('Set the TUSHARE_TOKEN environment variable before running this notebook.')\n",
"ts.set_token(tushare_token)\n",
"pro = ts.pro_api()"
2025-05-06 23:42:40 +08:00
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 2,
2025-02-15 23:33:34 +08:00
"id": "14671a7f72de2564",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:57:39.128278Z",
"start_time": "2025-04-09T14:57:36.918051Z"
2025-02-12 00:21:33 +08:00
}
},
2025-05-06 23:42:40 +08:00
"outputs": [],
2025-02-12 00:21:33 +08:00
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
2025-03-31 23:08:03 +08:00
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"def filter_rows(df):\n",
"    \"\"\"Keep a single record for every (name, start_date) pair in ``df``.\n",
"\n",
"    Within a pair, the first row whose ``end_date`` is not null is kept; when\n",
"    every ``end_date`` in the pair is null, the pair's first row is kept.\n",
"    The result is returned with a fresh 0..n-1 index.\n",
"    \"\"\"\n",
"\n",
"    def pick_record(rows):\n",
"        # Rows with a non-null end_date take priority over rows without one.\n",
"        dated = rows.loc[rows['end_date'].notna()]\n",
"        preferred = rows if dated.empty else dated\n",
"        return preferred.iloc[0]\n",
"\n",
"    per_pair = df.groupby(['name', 'start_date'], group_keys=False)\n",
"    return per_pair.apply(pick_record).reset_index(drop=True)\n",
2025-02-12 00:21:33 +08:00
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
"    \"\"\"Report whether ``stock_code`` has a stored name record covering ``target_date``.\n",
"\n",
"    Args:\n",
"        name_change_dict: Mapping of stock code -> DataFrame of name-change\n",
"            records with ``start_date`` and ``end_date`` datetime columns.\n",
"        stock_code: Code to look up in ``name_change_dict``.\n",
"        target_date: Date string in ``%Y%m%d`` format.\n",
"\n",
"    Returns:\n",
"        bool: True when some record satisfies start_date <= target_date <= end_date\n",
"        at day granularity. A record with a missing end_date has no upper bound.\n",
"        False when the code is unknown or no record covers the date.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``target_date`` does not match ``%Y%m%d``.\n",
"    \"\"\"\n",
"    target = datetime.strptime(target_date, '%Y%m%d')\n",
"    records = name_change_dict.get(stock_code)\n",
"    if records is None:\n",
"        return False\n",
"\n",
"    # Look the date columns up by name; fall back to the positional layout\n",
"    # (columns 2 and 3) only for frames that do not carry those names.\n",
"    if 'start_date' in records.columns and 'end_date' in records.columns:\n",
"        starts, ends = records['start_date'], records['end_date']\n",
"    else:\n",
"        starts, ends = records.iloc[:, 2], records.iloc[:, 3]\n",
"\n",
"    for start, end in zip(starts, ends):\n",
"        if pd.isna(start) or (target - start).days < 0:\n",
"            continue\n",
"        # A missing end_date is treated as open-ended rather than being clamped\n",
"        # to the current time, so the answer does not depend on when this runs.\n",
"        if pd.isna(end) or (target - end).days <= 0:\n",
"            return True\n",
"    return False\n",
"\n",
2025-06-02 22:23:44 +08:00
"name_change_df = pd.read_hdf('/mnt/d/PyProject/NewStock/data/name_change.h5', key='name_change').drop_duplicates(keep='first')\n",
"\n",
"# Parse the YYYYMMDD date columns. errors='coerce' turns end dates that are\n",
"# missing or malformed into NaT instead of raising.\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"\n",
"# Map each ts_code to its de-duplicated name records whose name contains\n",
"# 'ST' or '退'; codes with no such record get no entry.\n",
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
"    name_text = group['name'].str\n",
"    st_data = group[name_text.contains('ST') | name_text.contains('退')]\n",
"    if st_data.empty:\n",
"        continue\n",
"    name_change_dict[ts_code] = filter_rows(st_data)"
2025-05-06 23:42:40 +08:00
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 3,
2025-02-15 23:33:34 +08:00
"id": "e7f8cce2f80e2f20",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:09.296046Z",
"start_time": "2025-04-09T14:57:39.339423Z"
2025-02-12 00:21:33 +08:00
}
},
2025-05-06 23:42:40 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
2026-02-24 13:06:14 +08:00
"Index: 9629640 entries, 0 to 27329\n",
2025-05-06 23:42:40 +08:00
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
2026-02-24 13:06:14 +08:00
"memory usage: 220.4+ MB\n",
2025-05-06 23:42:40 +08:00
"None\n",
2026-02-24 13:06:14 +08:00
"20260206\n",
"20260209\n"
2025-05-06 23:42:40 +08:00
]
}
],
2025-02-12 00:21:33 +08:00
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
2025-06-02 22:23:44 +08:00
"h5_filename = '/mnt/d/PyProject/NewStock/data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"\n",
"# Find the newest trade date already present in the store.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store[key][['ts_code', 'trade_date']]\n",
"df.info()  # info() prints its report itself and returns None\n",
"max_date = df['trade_date'].max()\n",
"print(max_date)\n",
"\n",
"# Request the calendar up to today so the end date never needs editing by hand.\n",
"calendar_end = datetime.now().strftime('%Y%m%d')\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date=calendar_end)\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1]  # keep trading days only\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"\n",
"# min() on an empty list raises ValueError; an empty list only means there is\n",
"# no trading day newer than what is already stored.\n",
"start_date = min(trade_dates, default=None)\n",
"if start_date is None:\n",
"    print(f'No trading days after {max_date}; nothing to update.')\n",
"else:\n",
"    print(start_date)"
2025-05-06 23:42:40 +08:00
]
2025-04-09 22:57:01 +08:00
},
{
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 4,
2025-04-09 22:57:01 +08:00
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:16.817010Z",
"start_time": "2025-04-09T14:58:09.326485Z"
2025-05-06 23:42:40 +08:00
},
"scrolled": true
2025-04-09 22:57:01 +08:00
},
2025-05-06 23:42:40 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20260309 完成\n",
"任务 20260310 完成\n",
"任务 20260306 完成\n",
2026-02-24 13:06:14 +08:00
"任务 20260305 完成\n",
"任务 20260303 完成\n",
"任务 20260304 完成\n",
"任务 20260302 完成\n",
"任务 20260227 完成\n",
"任务 20260226 完成\n",
"任务 20260225 完成\n",
"任务 20260224 完成\n",
"任务 20260213 完成\n",
"任务 20260212 完成\n",
"任务 20260211 完成\n",
"任务 20260210 完成\n",
2026-02-24 13:06:14 +08:00
"任务 20260209 完成\n"
2025-05-06 23:42:40 +08:00
]
}
],
2025-02-12 00:21:33 +08:00
"source": [
"\n",
"\n",
"# Accumulates the per-trade-date result frames; they are concatenated and\n",
"# written to the HDF5 file by later cells.\n",
"all_daily_data = []\n",
"\n",
"# API call counter and batch start timestamp used for rate limiting\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
"    \"\"\"Fetch one trade date's daily_basic rows and tag each with an ``is_st`` flag.\n",
"\n",
"    Returns whatever ``pro.daily_basic`` returned. None or an empty frame is\n",
"    passed through untouched. A non-empty frame gets its ``trade_date`` column\n",
"    set to ``trade_date`` and a new boolean ``is_st`` column, and the call then\n",
"    pauses 0.2 s before returning.\n",
"    \"\"\"\n",
"    frame = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
"    if frame is None or frame.empty:\n",
"        return frame\n",
"\n",
"    # Stamp every row with the requested date, then flag it via is_st\n",
"    frame['trade_date'] = trade_date\n",
"    frame['is_st'] = [is_st(name_change_dict, code, trade_date) for code in frame['ts_code']]\n",
"    time.sleep(0.2)\n",
"    return frame\n",
"\n",
"\n",
"# Fetch every pending trade date on a small thread pool.\n",
"# Throttling is applied when tasks are submitted: once a future is queued the\n",
"# workers run it regardless of what this thread does, so sleeping while\n",
"# collecting results would not slow the API calls down.\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
"    future_to_date = {}\n",
"    for td in trade_dates:\n",
"        # After every 150 submissions, wait out the rest of the 60-second window.\n",
"        if api_call_count and api_call_count % 150 == 0:\n",
"            elapsed = time.time() - batch_start_time\n",
"            if elapsed < 60:\n",
"                sleep_time = 60 - elapsed\n",
"                print(f\"已调用 150 次 API等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
"                time.sleep(sleep_time)\n",
"            # Start timing the next batch\n",
"            batch_start_time = time.time()\n",
"        future_to_date[executor.submit(get_data, td)] = td\n",
"        api_call_count += 1\n",
"\n",
"    for future in as_completed(future_to_date):\n",
"        trade_date = future_to_date[future]  # trade date this future was submitted for\n",
"        try:\n",
"            result = future.result()\n",
"        except Exception as e:\n",
"            print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
"            continue\n",
"        # get_data can hand back None or an empty frame; keep those out of\n",
"        # the list that is concatenated afterwards.\n",
"        if result is not None and not result.empty:\n",
"            all_daily_data.append(result)\n",
"        print(f\"任务 {trade_date} 完成\")\n"
2025-05-06 23:42:40 +08:00
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 5,
2025-02-15 23:33:34 +08:00
"id": "919023c693d7a47a",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:16.864178Z",
"start_time": "2025-04-09T14:58:16.855084Z"
2025-02-12 00:21:33 +08:00
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2026-02-24 13:06:14 +08:00
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 001301.SZ 20260213 78.24 1.0939 1.3757 \n",
"1 301050.SZ 20260213 53.61 2.8931 3.6211 \n",
"2 000829.SZ 20260213 10.58 2.2336 3.5271 \n",
"3 688498.SH 20260213 746.98 3.4302 4.7903 \n",
"4 920510.BJ 20260213 18.30 2.8519 4.1749 \n",
"... ... ... ... ... ... \n",
"27350 000065.SZ 20260209 12.07 1.7167 3.2552 \n",
"27351 920249.BJ 20260209 13.86 3.2235 3.3218 \n",
"27352 300824.SZ 20260209 11.36 1.2161 3.0644 \n",
"27353 000766.SZ 20260209 23.65 1.8680 2.3253 \n",
"27354 000591.SZ 20260209 5.72 5.9341 9.0727 \n",
2025-02-12 00:21:33 +08:00
"\n",
2026-02-24 13:06:14 +08:00
" volume_ratio pe pe_ttm pb ps ps_ttm \\\n",
"0 0.51 24.3404 20.9997 3.0175 3.9021 2.8680 \n",
"1 0.80 38.6753 105.5089 4.2033 11.2338 18.9830 \n",
"2 0.65 347.7936 NaN 3.7189 0.1291 0.1352 \n",
"3 0.66 NaN 640.0343 29.5658 254.5923 140.4090 \n",
"4 1.41 160.0964 NaN 8.4111 14.5063 16.3558 \n",
"... ... ... ... ... ... ... \n",
"27350 0.73 13.3573 18.0503 1.2589 0.7347 0.9425 \n",
"27351 0.85 NaN NaN 8.0159 2.9840 2.5496 \n",
"27352 0.87 53.3349 33.5338 4.9536 4.9192 3.9029 \n",
"27353 0.75 405.0086 397.6822 9.9329 17.2829 18.3415 \n",
"27354 1.27 18.3189 21.1797 0.9314 3.7171 4.1825 \n",
2025-02-12 00:21:33 +08:00
"\n",
2026-02-24 13:06:14 +08:00
" dv_ratio dv_ttm total_share float_share free_share total_mv \\\n",
"0 1.0183 1.0183 26080.2350 18605.2851 14794.9501 2.040518e+06 \n",
"1 0.6398 0.6398 24721.2171 21036.5666 16807.4482 1.325304e+06 \n",
"2 0.0945 0.0945 102510.0438 102412.9669 64854.1551 1.084556e+06 \n",
"3 0.0665 0.0533 8594.7726 8400.0000 6014.9271 6.420123e+06 \n",
"4 NaN NaN 18421.3929 9389.9078 6414.2078 3.371115e+05 \n",
"... ... ... ... ... ... ... \n",
"27350 1.1303 1.1303 116144.2159 97496.2288 51416.3105 1.401861e+06 \n",
"27351 NaN NaN 42163.0000 12466.9576 12097.9576 5.843792e+05 \n",
"27352 1.7937 1.7937 32634.1682 31649.5307 12560.3926 3.707242e+05 \n",
"27353 NaN NaN 96649.4707 96600.7681 77600.7681 2.285760e+06 \n",
"27354 1.9711 2.0779 392444.2493 392354.1671 256620.6136 2.244781e+06 \n",
2025-02-12 00:21:33 +08:00
"\n",
" circ_mv is_st \n",
2026-02-24 13:06:14 +08:00
"0 1.455678e+06 False \n",
"1 1.127770e+06 False \n",
"2 1.083529e+06 False \n",
"3 6.274632e+06 False \n",
"4 1.718353e+05 False \n",
"... ... ... \n",
2026-02-24 13:06:14 +08:00
"27350 1.176779e+06 False \n",
"27351 1.727920e+05 False \n",
"27352 3.595387e+05 False \n",
"27353 2.284608e+06 False \n",
"27354 2.244266e+06 False \n",
2025-02-12 00:21:33 +08:00
"\n",
2026-02-24 13:06:14 +08:00
"[27355 rows x 19 columns]\n"
2025-02-12 00:21:33 +08:00
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"# Stack the collected per-date frames into one frame with a fresh 0..n-1 index.\n",
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 6,
2025-02-15 23:33:34 +08:00
"id": "28cb78d032671b20",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:16.881685Z",
"start_time": "2025-04-09T14:58:16.871184Z"
2025-02-12 00:21:33 +08:00
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
2026-02-24 13:06:14 +08:00
"5 603261.SH 20260213 28.95 0.4664 1.3719 \n",
"14 002700.SZ 20260213 7.28 1.5973 1.6181 \n",
"34 300344.SZ 20260213 1.87 14.3407 25.9753 \n",
"67 000430.SZ 20260213 7.26 0.8541 1.3705 \n",
"81 000752.SZ 20260213 10.86 1.0429 1.2849 \n",
"... ... ... ... ... ... \n",
2026-02-24 13:06:14 +08:00
"27165 300301.SZ 20260209 2.49 1.9012 2.0554 \n",
"27244 002822.SZ 20260209 3.53 1.0762 1.0762 \n",
"27270 300147.SZ 20260209 9.44 1.7053 2.2069 \n",
"27281 002501.SZ 20260209 2.16 3.6044 4.6543 \n",
"27297 002620.SZ 20260209 5.98 2.2406 3.1755 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"5 0.49 NaN NaN 3.3744 7.7410 7.8754 NaN \n",
"14 0.77 32.3153 40.4077 2.2014 4.2912 4.3540 0.9066 \n",
"34 0.60 NaN NaN 5.3118 4.6416 4.9881 NaN \n",
"67 0.49 NaN NaN 21.9522 13.6188 12.8340 NaN \n",
"81 1.21 109.3444 19.4711 4.6597 6.7963 6.4062 NaN \n",
"... ... ... ... ... ... ... ... \n",
"27165 0.97 NaN NaN 12.5588 4.0907 4.7109 NaN \n",
"27244 0.95 NaN NaN 4.9828 2.9693 6.5620 NaN \n",
"27270 1.21 NaN NaN 9.0126 3.3586 4.2436 NaN \n",
"27281 0.87 NaN NaN 29.0833 22.9884 26.1834 NaN \n",
"27297 1.30 NaN NaN NaN 2.8840 4.2972 NaN \n",
2025-02-12 00:21:33 +08:00
"\n",
2026-02-24 13:06:14 +08:00
" dv_ttm total_share float_share free_share total_mv \\\n",
"5 NaN 7750.5022 7750.5022 2634.9858 224377.0387 \n",
"14 0.9066 41362.8185 26346.4874 26007.0037 301121.3187 \n",
"34 NaN 64170.6416 63999.4166 35333.5001 119999.0998 \n",
"67 NaN 80963.5372 37055.6486 23092.8156 587795.2801 \n",
"81 NaN 26375.8491 26375.8491 21407.3042 286441.7212 \n",
"... ... ... ... ... ... \n",
"27165 NaN 82986.8769 78987.6719 73061.8561 206637.3235 \n",
"27244 NaN 195094.2200 107059.0368 107059.0368 688682.5966 \n",
"27270 NaN 66127.9045 65739.8353 50798.8432 624247.4185 \n",
"27281 NaN 355000.0000 354646.9206 274646.9206 766800.0000 \n",
"27297 NaN 37749.4000 31536.0303 22251.3747 225741.4120 \n",
2025-06-02 22:23:44 +08:00
"\n",
2026-02-24 13:06:14 +08:00
" circ_mv is_st \n",
"5 224377.0387 True \n",
"14 191802.4283 True \n",
"34 119678.9090 True \n",
"67 269024.0088 True \n",
"81 286441.7212 True \n",
"... ... ... \n",
"27165 196679.3030 True \n",
"27244 377918.3999 True \n",
"27270 620584.0452 True \n",
"27281 766037.3485 True \n",
"27297 188585.4612 True \n",
2025-11-29 00:23:12 +08:00
"\n",
2026-02-24 13:06:14 +08:00
"[870 rows x 19 columns]\n"
2025-02-12 00:21:33 +08:00
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"# Show only the rows whose is_st flag is True.\n",
"print(all_daily_data_df[all_daily_data_df['is_st']])"
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 7,
2025-02-15 23:33:34 +08:00
"id": "692b58674b7462c9",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:17.773453Z",
"start_time": "2025-04-09T14:58:16.903459Z"
2025-02-12 00:21:33 +08:00
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有每日基础数据获取并保存完毕!\n"
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"# Append the new rows to the HDF5 file (table format).\n",
"# NOTE(review): append=True adds rows on every call, so re-running this cell\n",
"# with the same frame would store the rows again — confirm it runs once per fetch.\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 8,
2025-02-15 23:33:34 +08:00
"id": "d7a773fc20293477",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:24.305403Z",
"start_time": "2025-04-09T14:58:17.816332Z"
2025-02-12 00:21:33 +08:00
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
2026-02-24 13:06:14 +08:00
"Index: 9656995 entries, 0 to 27354\n",
2025-02-12 00:21:33 +08:00
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
2026-02-24 13:06:14 +08:00
"memory usage: 230.2+ MB\n",
2025-02-12 00:21:33 +08:00
"None\n"
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"# Re-open the store read-only and summarise the key columns after the append.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
"# info() prints its report itself and returns None, so it is not wrapped in print().\n",
"df.info()"
]
2025-02-12 00:21:33 +08:00
}
],
"metadata": {
"kernelspec": {
2025-06-02 22:23:44 +08:00
"display_name": "stock",
2025-02-12 00:21:33 +08:00
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2025-11-29 00:23:12 +08:00
"version": "3.12.11"
2025-02-12 00:21:33 +08:00
}
},
"nbformat": 4,
"nbformat_minor": 5
}