Files
NewStock/main/data/update/update_daily_basic.ipynb

452 lines
18 KiB
Plaintext
Raw Normal View History

2025-02-12 00:21:33 +08:00
{
"cells": [
{
"cell_type": "code",
2025-05-06 23:42:40 +08:00
"execution_count": 1,
2025-02-12 00:21:33 +08:00
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:57:36.913044Z",
"start_time": "2025-04-09T14:57:36.159612Z"
2025-02-12 00:21:33 +08:00
}
},
2025-05-06 23:42:40 +08:00
"outputs": [],
2025-02-12 00:21:33 +08:00
"source": [
"import os\n",
"\n",
"import tushare as ts\n",
"\n",
"# Read the Tushare API token from the environment so the credential is never\n",
"# stored in the notebook file. Any token that was previously committed in this\n",
"# cell should be treated as leaked and rotated.\n",
"tushare_token = os.environ.get('TUSHARE_TOKEN')\n",
"if not tushare_token:\n",
"    raise RuntimeError('Set the TUSHARE_TOKEN environment variable before running this notebook.')\n",
"ts.set_token(tushare_token)\n",
"pro = ts.pro_api()"
2025-05-06 23:42:40 +08:00
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 2,
2025-02-15 23:33:34 +08:00
"id": "14671a7f72de2564",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:57:39.128278Z",
"start_time": "2025-04-09T14:57:36.918051Z"
2025-02-12 00:21:33 +08:00
}
},
2025-05-06 23:42:40 +08:00
"outputs": [],
2025-02-12 00:21:33 +08:00
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
2025-03-31 23:08:03 +08:00
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"def filter_rows(df):\n",
"    \"\"\"Keep one row per (name, start_date) pair, preferring a row with an end_date.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Records with at least the columns ``name``, ``start_date`` and ``end_date``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        For every distinct (name, start_date) pair, the first row whose\n",
"        ``end_date`` is not missing, or the first row of the pair when every\n",
"        ``end_date`` is missing. Rows are ordered by (name, start_date), all\n",
"        input columns are kept and the index is reset.\n",
"    \"\"\"\n",
"    keys = ['name', 'start_date']\n",
"    # Rows with a missing key are dropped, as a groupby on the keys would do.\n",
"    candidates = df.dropna(subset=keys)\n",
"    # Sort so that, inside each key pair, rows that have an end_date come first\n",
"    # (False sorts before True); the original row position breaks ties so the\n",
"    # earliest matching row wins. This avoids groupby().apply() on the grouping\n",
"    # columns, which pandas has deprecated.\n",
"    ordered = candidates.assign(\n",
"        _missing_end=candidates['end_date'].isna(),\n",
"        _row_pos=range(len(candidates)),\n",
"    ).sort_values(keys + ['_missing_end', '_row_pos'])\n",
"    filtered_df = ordered.drop_duplicates(subset=keys, keep='first')\n",
"    filtered_df = filtered_df.drop(columns=['_missing_end', '_row_pos'])\n",
"    return filtered_df.reset_index(drop=True)\n",
2025-02-12 00:21:33 +08:00
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
"    \"\"\"Return True when target_date falls inside any period recorded for stock_code.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    name_change_dict : dict\n",
"        Maps a stock code to a DataFrame of periods that has ``start_date``\n",
"        and ``end_date`` datetime columns.\n",
"    stock_code : str\n",
"        Code to look up; codes that are not in the dict yield False.\n",
"    target_date : str\n",
"        Date in ``YYYYMMDD`` format.\n",
"\n",
"    Returns\n",
"    -------\n",
"    bool\n",
"    \"\"\"\n",
"    target_date = datetime.strptime(target_date, '%Y%m%d')\n",
"    periods = name_change_dict.get(stock_code)\n",
"    if periods is None:\n",
"        return False\n",
"    # Address the columns by label instead of by position so the check does not\n",
"    # depend on the column order of the stored frame.\n",
"    for sds, eds in zip(periods['start_date'], periods['end_date']):\n",
"        # A missing end date means the period is still open: treat it as ending now.\n",
"        if pd.isna(eds):\n",
"            eds = datetime.now()\n",
"        if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
"            return True\n",
"    return False\n",
"\n",
2025-06-02 22:23:44 +08:00
"name_change_df = pd.read_hdf('/mnt/d/PyProject/NewStock/data/name_change.h5', key='name_change')\n",
2025-02-12 00:21:33 +08:00
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# Parse start_date / end_date into datetimes; end_date values that cannot be parsed become NaT.\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
2025-05-08 15:42:17 +08:00
"# name_change_df = name_change_df[name_change_df.name.str.contains('ST') ]\n",
2025-02-12 00:21:33 +08:00
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
"    # Keep only the records whose name contains 'ST' (which also matches '*ST') or '退'.\n",
2025-05-06 23:42:40 +08:00
" # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
2025-05-08 15:42:17 +08:00
" st_data = group[(group['name'].str.contains('ST')) | (group['name'].str.contains('退'))]\n",
2025-02-12 00:21:33 +08:00
" if not st_data.empty:\n",
2025-03-31 23:08:03 +08:00
" name_change_dict[ts_code] = filter_rows(st_data)"
2025-05-06 23:42:40 +08:00
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 3,
2025-02-15 23:33:34 +08:00
"id": "e7f8cce2f80e2f20",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:09.296046Z",
"start_time": "2025-04-09T14:57:39.339423Z"
2025-02-12 00:21:33 +08:00
}
},
2025-05-06 23:42:40 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 9547667 entries, 0 to 27281\n",
2025-05-06 23:42:40 +08:00
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 218.5+ MB\n",
2025-05-06 23:42:40 +08:00
"None\n",
"20260116\n",
"20260119\n"
2025-05-06 23:42:40 +08:00
]
}
],
2025-02-12 00:21:33 +08:00
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
2025-06-02 22:23:44 +08:00
"h5_filename = '/mnt/d/PyProject/NewStock/data/daily_basic.h5'\n",
2025-02-12 00:21:33 +08:00
"key = '/daily_basic'\n",
"max_date = None\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store[key][['ts_code', 'trade_date']]\n",
"    # DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().\n",
"    df.info()\n",
"    max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"# Query the trading calendar up to today (Shanghai time) instead of a fixed end\n",
"# date, so the notebook keeps finding new dates and never schedules future ones.\n",
"calendar_end = pd.Timestamp.now(tz='Asia/Shanghai').strftime('%Y%m%d')\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date=calendar_end)\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1]  # keep open trading days only\n",
"# Dates are YYYYMMDD strings, so string comparison orders them chronologically.\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"if not trade_dates:\n",
"    # min() on an empty list would fail with an unhelpful message.\n",
"    raise ValueError(f'No trading days after {max_date}: the stored data is already up to date.')\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
2025-05-06 23:42:40 +08:00
]
2025-04-09 22:57:01 +08:00
},
{
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 4,
2025-04-09 22:57:01 +08:00
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:16.817010Z",
"start_time": "2025-04-09T14:58:09.326485Z"
2025-05-06 23:42:40 +08:00
},
"scrolled": true
2025-04-09 22:57:01 +08:00
},
2025-05-06 23:42:40 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20260130 完成\n",
"任务 20260129 完成\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20260127 完成\n",
"任务 20260128 完成\n",
"任务 20260126 完成\n",
"任务 20260123 完成\n",
"任务 20260122 完成\n",
"任务 20260121 完成\n",
"任务 20260120 完成\n",
"任务 20260119 完成\n"
2025-05-06 23:42:40 +08:00
]
}
],
2025-02-12 00:21:33 +08:00
"source": [
"\n",
"\n",
"# Collects the DataFrame returned for each trading day.\n",
"all_daily_data = []\n",
"\n",
"# API call counter and batch start time used for rate limiting.\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
"    \"\"\"Fetch the daily-basic data of one trading day and add an ``is_st`` column.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    trade_date : str\n",
"        Trading day in ``YYYYMMDD`` format.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The object returned by ``pro.daily_basic``. When it is a non-empty\n",
"    DataFrame, ``trade_date`` is overwritten with the requested date and\n",
"    ``is_st`` is added; a ``None`` or empty result is returned unchanged.\n",
"    \"\"\"\n",
"    snapshot = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
"    if snapshot is None or snapshot.empty:\n",
"        return snapshot\n",
"\n",
"    def row_is_st(row):\n",
"        return is_st(name_change_dict, row['ts_code'], row['trade_date'])\n",
"\n",
"    # Tag every row with the date that was requested.\n",
"    snapshot['trade_date'] = trade_date\n",
"    snapshot['is_st'] = snapshot.apply(row_is_st, axis=1)\n",
"    # Short pause after each successful download.\n",
"    time.sleep(0.2)\n",
"    return snapshot\n",
"\n",
"\n",
"# Fetch the data for every pending trading day.\n",
"# Dates are submitted in batches of at most 150, and the next batch is only\n",
"# submitted once 60 seconds have passed since the current one started.\n",
"# Sleeping in the result loop alone would not throttle anything, because tasks\n",
"# that were already submitted keep running in the worker threads.\n",
"calls_per_window = 150\n",
"window_seconds = 60\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
"    for offset in range(0, len(trade_dates), calls_per_window):\n",
"        batch_dates = trade_dates[offset:offset + calls_per_window]\n",
"        batch_start_time = time.time()\n",
"        future_to_date = {executor.submit(get_data, td): td for td in batch_dates}\n",
"\n",
"        for future in as_completed(future_to_date):\n",
"            trade_date = future_to_date[future]  # date this task was submitted for\n",
"            try:\n",
"                result = future.result()  # re-raises any exception from the worker\n",
"                all_daily_data.append(result)\n",
"                print(f\"任务 {trade_date} 完成\")\n",
"            except Exception as e:\n",
"                print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
"            # Count one API call per finished task, successful or not.\n",
"            api_call_count += 1\n",
"\n",
"        # Wait out the rest of the window only when another batch follows.\n",
"        more_batches = offset + calls_per_window < len(trade_dates)\n",
"        elapsed = time.time() - batch_start_time\n",
"        if more_batches and elapsed < window_seconds:\n",
"            sleep_time = window_seconds - elapsed\n",
"            print(f\"已调用 150 次 API等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
"            time.sleep(sleep_time)\n",
"\n"
2025-05-06 23:42:40 +08:00
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 5,
2025-02-15 23:33:34 +08:00
"id": "919023c693d7a47a",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:16.864178Z",
"start_time": "2025-04-09T14:58:16.855084Z"
2025-02-12 00:21:33 +08:00
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 301586.SZ 20260123 52.80 4.4195 6.0484 \n",
"1 600871.SH 20260123 2.63 3.5599 17.1067 \n",
"2 002067.SZ 20260123 5.91 9.5542 9.8833 \n",
"3 601225.SH 20260123 21.41 0.4692 1.3502 \n",
"4 688800.SH 20260123 90.22 4.3421 7.2546 \n",
"... ... ... ... ... ... \n",
"27321 688659.SH 20260119 10.42 1.7256 3.1386 \n",
"27322 301021.SZ 20260119 55.92 5.8451 10.3979 \n",
"27323 300102.SZ 20260119 34.23 14.1090 22.0304 \n",
"27324 300088.SZ 20260119 6.36 1.9148 2.3308 \n",
"27325 002261.SZ 20260119 33.77 4.6567 5.1897 \n",
2025-02-12 00:21:33 +08:00
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 0.98 43.6343 68.1615 3.4308 6.9893 5.8003 0.9091 \n",
"1 1.91 78.9369 79.8327 5.2226 0.6148 0.6127 NaN \n",
"2 0.80 122.3105 140.3743 1.3170 1.5908 1.5491 NaN \n",
"3 1.02 9.2832 10.8507 2.1981 1.1272 1.1740 5.9699 \n",
"4 0.79 105.8709 61.3816 8.0916 7.6847 5.8974 0.2971 \n",
"... ... ... ... ... ... ... ... \n",
"27321 1.01 NaN NaN 3.1071 2.6478 2.4387 NaN \n",
"27322 0.72 391.5181 198.4990 8.4757 19.1357 17.1491 0.1772 \n",
"27323 0.69 327.8681 232.9793 7.4375 12.9485 9.5346 0.1313 \n",
"27324 0.52 44.5231 45.4726 1.8179 1.4366 1.3961 0.9198 \n",
"27325 0.38 NaN NaN 16.3185 10.3586 13.1287 NaN \n",
2025-02-12 00:21:33 +08:00
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 0.3788 8.297550e+03 5.291107e+03 3866.1069 4.381107e+05 \n",
"1 NaN 1.895705e+06 1.354701e+06 281911.5987 4.985703e+06 \n",
"2 NaN 1.474854e+05 1.337604e+05 129305.3853 8.716386e+05 \n",
"3 5.4881 9.695000e+05 9.695000e+05 336903.9335 2.075700e+07 \n",
"4 0.2971 2.056743e+04 2.056743e+04 12310.0935 1.855594e+06 \n",
"... ... ... ... ... ... \n",
"27321 NaN 1.600000e+04 1.600000e+04 8796.6880 1.667200e+05 \n",
"27322 0.1772 1.528528e+04 1.527393e+04 8586.0802 8.547528e+05 \n",
"27323 0.1313 9.203339e+04 9.163399e+04 58685.2206 3.150303e+06 \n",
"27324 0.9198 2.497734e+05 2.485504e+05 204186.7350 1.588559e+06 \n",
"27325 NaN 1.259831e+05 1.145652e+05 102798.2760 4.254451e+06 \n",
2025-02-12 00:21:33 +08:00
"\n",
" circ_mv is_st \n",
"0 2.793704e+05 False \n",
"1 3.562864e+06 False \n",
"2 7.905239e+05 False \n",
"3 2.075700e+07 False \n",
"4 1.855594e+06 False \n",
"... ... ... \n",
"27321 1.667200e+05 False \n",
"27322 8.541180e+05 False \n",
"27323 3.136631e+06 False \n",
"27324 1.580780e+06 False \n",
"27325 3.868868e+06 False \n",
2025-02-12 00:21:33 +08:00
"\n",
"[27326 rows x 19 columns]\n"
2025-02-12 00:21:33 +08:00
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 6,
2025-02-15 23:33:34 +08:00
"id": "28cb78d032671b20",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:16.881685Z",
"start_time": "2025-04-09T14:58:16.871184Z"
2025-02-12 00:21:33 +08:00
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"54 000615.SZ 20260123 3.32 0.7420 0.9114 \n",
"60 600228.SH 20260123 5.63 3.4275 4.8943 \n",
"88 000430.SZ 20260123 7.30 1.4748 2.3665 \n",
"96 603389.SH 20260123 48.20 0.8319 2.1042 \n",
"110 000752.SZ 20260123 10.96 1.4753 1.8177 \n",
"... ... ... ... ... ... \n",
"27130 002586.SZ 20260119 4.68 1.7164 2.9874 \n",
"27154 600265.SH 20260119 19.29 0.2879 0.8066 \n",
"27193 688287.SH 20260119 6.44 0.9802 1.9881 \n",
"27195 300338.SZ 20260119 3.61 1.4127 1.5675 \n",
"27223 000669.SZ 20260119 2.79 1.1437 1.4424 \n",
2025-02-12 00:21:33 +08:00
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"54 0.72 NaN NaN NaN 5.4217 5.8063 NaN \n",
"60 1.60 NaN NaN 5.6813 9.6204 6.3997 NaN \n",
"88 1.11 NaN NaN 22.0731 13.6938 12.9047 NaN \n",
"96 0.95 NaN NaN 54.1434 62.5718 60.6578 NaN \n",
"110 0.96 110.3513 19.6504 4.7026 6.8589 6.4652 NaN \n",
"... ... ... ... ... ... ... ... \n",
"27130 1.55 NaN NaN 1.7970 2.1568 2.2158 NaN \n",
"27154 0.79 NaN NaN 288.1848 5.6010 10.1170 NaN \n",
"27193 0.59 NaN NaN 3.1120 26.5172 26.3673 NaN \n",
"27195 0.58 NaN NaN NaN 8.5110 10.1280 NaN \n",
"27223 0.84 NaN NaN NaN 1.4522 1.5001 NaN \n",
2025-02-12 00:21:33 +08:00
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"54 NaN 178749.2693 92531.6398 75331.7788 5.934476e+05 \n",
"60 NaN 41667.2427 41603.7177 29135.5053 2.345866e+05 \n",
"88 NaN 80963.5372 37055.6486 23092.8156 5.910338e+05 \n",
"96 NaN 26275.2000 26275.2000 10387.7487 1.266465e+06 \n",
"110 NaN 26375.8491 26375.8491 21407.3042 2.890793e+05 \n",
"... ... ... ... ... ... \n",
"27130 NaN 114422.3714 108751.8003 62484.0799 5.354967e+05 \n",
"27154 NaN 12980.0000 12980.0000 4633.1947 2.503842e+05 \n",
"27193 NaN 37051.5600 37051.5600 18267.2898 2.386120e+05 \n",
"27195 NaN 40262.4692 34936.1242 31485.3582 1.453475e+05 \n",
"27223 NaN 68040.8797 68040.8797 53950.9653 1.898341e+05 \n",
2025-06-02 22:23:44 +08:00
"\n",
" circ_mv is_st \n",
"54 3.072050e+05 True \n",
"60 2.342289e+05 True \n",
"88 2.705062e+05 True \n",
"96 1.266465e+06 True \n",
"110 2.890793e+05 True \n",
"... ... ... \n",
"27130 5.089584e+05 True \n",
"27154 2.503842e+05 True \n",
"27193 2.386120e+05 True \n",
"27195 1.261194e+05 True \n",
"27223 1.898341e+05 True \n",
2025-11-29 00:23:12 +08:00
"\n",
"[886 rows x 19 columns]\n"
2025-02-12 00:21:33 +08:00
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"print(all_daily_data_df[all_daily_data_df['is_st']])"
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 7,
2025-02-15 23:33:34 +08:00
"id": "692b58674b7462c9",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:17.773453Z",
"start_time": "2025-04-09T14:58:16.903459Z"
2025-02-12 00:21:33 +08:00
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有每日基础数据获取并保存完毕!\n"
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"# Append the new rows to the HDF5 file (table format).\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
]
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
2025-05-13 15:30:06 +08:00
"execution_count": 8,
2025-02-15 23:33:34 +08:00
"id": "d7a773fc20293477",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-04-10 23:17:22 +08:00
"end_time": "2025-04-09T14:58:24.305403Z",
"start_time": "2025-04-09T14:58:17.816332Z"
2025-02-12 00:21:33 +08:00
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 9574993 entries, 0 to 27325\n",
2025-02-12 00:21:33 +08:00
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
"memory usage: 228.3+ MB\n",
2025-02-12 00:21:33 +08:00
"None\n"
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"# Read the stored key back and show a summary of the saved columns.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
"    # DataFrame.info() prints the summary itself and returns None, so it is not wrapped in print().\n",
"    df.info()"
]
2025-02-12 00:21:33 +08:00
}
],
"metadata": {
"kernelspec": {
2025-06-02 22:23:44 +08:00
"display_name": "stock",
2025-02-12 00:21:33 +08:00
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2025-11-29 00:23:12 +08:00
"version": "3.12.11"
2025-02-12 00:21:33 +08:00
}
},
"nbformat": 4,
"nbformat_minor": 5
}