Files
NewStock/code/data/update/update_daily_basic.ipynb

454 lines
17 KiB
Plaintext
Raw Normal View History

2025-02-12 00:21:33 +08:00
{
"cells": [
{
"cell_type": "code",
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
2025-03-31 23:08:03 +08:00
"end_time": "2025-03-30T16:42:34.194992Z",
"start_time": "2025-03-30T16:42:33.440178Z"
2025-02-12 00:21:33 +08:00
}
},
"source": [
"import os\n",
"\n",
"import tushare as ts\n",
"\n",
"# Read the Tushare API token from the environment so the credential is never\n",
"# stored in the notebook. Export TUSHARE_TOKEN before starting the kernel.\n",
"tushare_token = os.environ.get('TUSHARE_TOKEN')\n",
"if not tushare_token:\n",
"    raise RuntimeError('TUSHARE_TOKEN environment variable is not set')\n",
"ts.set_token(tushare_token)\n",
"pro = ts.pro_api()"
2025-03-31 23:08:03 +08:00
],
"outputs": [],
"execution_count": 1
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
"id": "14671a7f72de2564",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-03-31 23:08:03 +08:00
"end_time": "2025-03-30T16:42:36.432691Z",
"start_time": "2025-03-30T16:42:34.197998Z"
2025-02-12 00:21:33 +08:00
}
},
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
2025-03-31 23:08:03 +08:00
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"def filter_rows(df):\n",
"    \"\"\"Keep one row per (name, start_date) pair, preferring rows that have an end_date.\n",
"\n",
"    Args:\n",
"        df: DataFrame with at least 'name', 'start_date' and 'end_date' columns.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with a fresh 0..n-1 index, ordered by (name, start_date).\n",
"        For each pair it holds the first row whose end_date is set, or the pair's\n",
"        first row when none of its rows has an end_date.\n",
"    \"\"\"\n",
"    # Stable sort on the missing-end_date flag: rows with an end_date move to the\n",
"    # front while the original order is kept inside each half, so keep='first'\n",
"    # below picks the preferred row of every pair.\n",
"    priority_order = df['end_date'].isna().to_numpy().argsort(kind='stable')\n",
"    prioritized = df.iloc[priority_order]\n",
"    picked = prioritized.drop_duplicates(subset=['name', 'start_date'], keep='first')\n",
"    # groupby(...).apply(...) is avoided on purpose: returning a row from apply relies\n",
"    # on the grouping columns being passed to the function, which pandas has deprecated.\n",
"    picked = picked.sort_values(['name', 'start_date'], kind='stable')\n",
"    return picked.reset_index(drop=True)\n",
2025-02-12 00:21:33 +08:00
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
"    \"\"\"Tell whether `target_date` falls inside any recorded period of `stock_code`.\n",
"\n",
"    The dict built in this notebook holds only ST / *ST name-change records, so a\n",
"    True result means the stock carried an ST name on that day.\n",
"\n",
"    Args:\n",
"        name_change_dict: maps ts_code -> DataFrame with datetime-like\n",
"            'start_date' and 'end_date' columns.\n",
"        stock_code: code to look up.\n",
"        target_date: date string in YYYYMMDD format.\n",
"\n",
"    Returns:\n",
"        bool: True if some record's start_date..end_date range contains the date.\n",
"        A missing end_date is replaced by the current time.\n",
"\n",
"    Raises:\n",
"        ValueError: if `target_date` does not match '%Y%m%d'.\n",
"    \"\"\"\n",
"    target_date = datetime.strptime(target_date, '%Y%m%d')\n",
"    records = name_change_dict.get(stock_code)\n",
"    if records is None:\n",
"        return False\n",
"    # Read the dates by column name: positional access (iloc[i, 2] / iloc[i, 3])\n",
"    # silently picks the wrong columns when the frame's column layout changes.\n",
"    for sds, eds in zip(records['start_date'], records['end_date']):\n",
"        if pd.isna(eds):\n",
"            eds = datetime.now()\n",
"        if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
"            return True\n",
"    return False\n",
"\n",
"# Load the name-change history and drop exact duplicate rows\n",
"name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# Make sure the date columns of name_change_df are datetimes; an end_date that cannot be parsed becomes NaT\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"# Keep only the records whose name contains 'ST'\n",
"name_change_df = name_change_df[name_change_df.name.str.contains('ST')]\n",
"# Maps ts_code -> filtered ST records of that stock; this is the lookup is_st() reads\n",
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
" # Keep only the records whose change_reason is 'ST' or '*ST'\n",
" st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
" if not st_data.empty:\n",
2025-03-31 23:08:03 +08:00
" name_change_dict[ts_code] = filter_rows(st_data)"
],
"outputs": [],
"execution_count": 2
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
"id": "e7f8cce2f80e2f20",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-03-31 23:08:03 +08:00
"end_time": "2025-03-30T16:43:03.790361Z",
"start_time": "2025-03-30T16:42:36.633554Z"
2025-02-12 00:21:33 +08:00
}
},
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"# Path of the daily_basic HDF5 store and the key of the table inside it\n",
"h5_filename = '../../../data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"# Latest trade_date already stored; used below to select the trading days still to be fetched\n",
"max_date = None\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" # NOTE(review): store[key] presumably loads the whole table before the two columns are picked - confirm the memory cost is acceptable\n",
" df = store[key][['ts_code', 'trade_date']]\n",
" # df.info() prints the summary itself and returns None, so this also prints 'None'\n",
" print(df.info())\n",
" max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
2025-03-31 23:08:03 +08:00
"# End the calendar at today's date instead of a hard-coded day, so the update keeps\n",
"# finding new trading days after that day has passed (datetime is imported in an earlier cell)\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date=datetime.now().strftime('%Y%m%d'))\n",
2025-02-12 00:21:33 +08:00
"trade_cal = trade_cal[trade_cal['is_open'] == 1]  # keep trading days only\n",
"# Trading days that come after the latest date already stored\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"if not trade_dates:\n",
"    # min() on an empty list raises an opaque ValueError; say what actually happened\n",
"    raise ValueError(f'daily_basic is already up to date (latest stored trade_date: {max_date})')\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
2025-03-31 23:08:03 +08:00
],
2025-02-15 23:33:34 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2025-03-31 23:08:03 +08:00
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8453605 entries, 0 to 32308\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 193.5+ MB\n",
"None\n",
"20250321\n",
"20250324\n"
2025-02-15 23:33:34 +08:00
]
}
],
2025-03-31 23:08:03 +08:00
"execution_count": 3
},
{
"cell_type": "code",
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"scrolled": true,
"ExecuteTime": {
"end_time": "2025-03-30T16:43:07.947442Z",
"start_time": "2025-03-30T16:43:03.827519Z"
}
},
2025-02-12 00:21:33 +08:00
"source": [
"\n",
"\n",
"# Collects the DataFrames returned by get_data; they are concatenated in a later cell\n",
"all_daily_data = []\n",
"\n",
"# API call counter and batch start time used for rate limiting\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
"    \"\"\"Fetch the daily_basic rows of one trade date and flag ST stocks.\n",
"\n",
"    Args:\n",
"        trade_date: trading day as a YYYYMMDD string.\n",
"\n",
"    Returns:\n",
"        Whatever pro.daily_basic returned (possibly None or an empty frame). When it\n",
"        has rows, 'trade_date' is overwritten with the requested date and an\n",
"        'is_st' column holding the result of is_st() per row is added.\n",
"    \"\"\"\n",
"    daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
"    if daily_basic_data is not None and not daily_basic_data.empty:\n",
"        # Tag every row with the requested trade date\n",
"        daily_basic_data['trade_date'] = trade_date\n",
"        # All rows share trade_date, so map over ts_code only; this avoids the much\n",
"        # slower row-wise DataFrame.apply(axis=1).\n",
"        daily_basic_data['is_st'] = daily_basic_data['ts_code'].map(\n",
"            lambda code: is_st(name_change_dict, code, trade_date)\n",
"        )\n",
"    # Short pause after every API call, whether or not it returned rows\n",
"    time.sleep(0.2)\n",
"    return daily_basic_data\n",
"\n",
"\n",
"# Fetch every pending trade date with at most 150 API calls per 60 seconds.\n",
"# Dates are submitted in batches and the pause sits between batches, so it really\n",
"# throttles the worker threads (pausing while collecting results of tasks that are\n",
"# already submitted would not).\n",
"missing_dates = []  # dates whose request failed or returned no rows\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
"    for batch_offset in range(0, len(trade_dates), 150):\n",
"        batch_dates = trade_dates[batch_offset:batch_offset + 150]\n",
"        batch_start_time = time.time()\n",
"        future_to_date = {executor.submit(get_data, td): td for td in batch_dates}\n",
"\n",
"        for future in as_completed(future_to_date):\n",
"            trade_date = future_to_date[future]  # date this future was submitted for\n",
"            api_call_count += 1\n",
"            try:\n",
"                result = future.result()\n",
"            except Exception as e:\n",
"                missing_dates.append(trade_date)\n",
"                print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
"                continue\n",
"            if result is None or result.empty:\n",
"                missing_dates.append(trade_date)\n",
"                print(f\"{trade_date} 没有返回数据\")\n",
"                continue\n",
"            all_daily_data.append(result)\n",
"            print(f\"任务 {trade_date} 完成\")\n",
"\n",
"        # Wait out the rest of the minute only when another batch follows\n",
"        if batch_offset + 150 < len(trade_dates):\n",
"            elapsed = time.time() - batch_start_time\n",
"            if elapsed < 60:\n",
"                sleep_time = 60 - elapsed\n",
"                print(f\"已调用 150 次 API等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
"                time.sleep(sleep_time)\n",
"\n",
"# The next run resumes after the newest stored trade_date, so a hole in the middle\n",
"# would never be fetched again. Keep only the dates before the first missing one.\n",
"if missing_dates:\n",
"    first_missing = min(missing_dates)\n",
"    all_daily_data[:] = [frame for frame in all_daily_data if frame['trade_date'].iloc[0] < first_missing]\n",
"    print(f\"{first_missing} 起的数据未保存,下次运行时会重新获取\")\n",
"\n"
2025-03-31 23:08:03 +08:00
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20250418 完成\n",
"任务 20250417 完成\n",
"任务 20250416 完成\n",
"任务 20250415 完成\n",
"任务 20250411 完成\n",
"任务 20250414 完成\n",
"任务 20250410 完成\n",
"任务 20250409 完成\n",
"任务 20250408 完成\n",
"任务 20250407 完成\n",
"任务 20250403 完成\n",
"任务 20250402 完成\n",
"任务 20250331 完成\n",
"任务 20250401 完成\n",
"任务 20250327 完成\n",
"任务 20250328 完成\n",
"任务 20250326 完成\n",
"任务 20250324 完成\n",
"任务 20250325 完成\n"
]
}
],
"execution_count": 4
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
"id": "919023c693d7a47a",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-03-31 23:08:03 +08:00
"end_time": "2025-03-30T16:43:07.962318Z",
"start_time": "2025-03-30T16:43:07.951757Z"
2025-02-12 00:21:33 +08:00
}
},
2025-03-31 23:08:03 +08:00
"source": [
"# Drop None / empty results first: they add no rows, and empty frames may affect\n",
"# the dtypes pd.concat infers for the result.\n",
"fetched_frames = [frame for frame in all_daily_data if frame is not None and not frame.empty]\n",
"if not fetched_frames:\n",
"    raise ValueError('no daily_basic rows were fetched; nothing to concatenate')\n",
"all_daily_data_df = pd.concat(fetched_frames, ignore_index=True)\n",
"print(all_daily_data_df)"
],
2025-02-12 00:21:33 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2025-03-31 23:08:03 +08:00
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 603328.SH 20250327 10.44 1.0910 2.6596 \n",
"1 603989.SH 20250327 15.66 0.9036 2.6145 \n",
"2 603194.SH 20250327 38.03 14.0348 14.0348 \n",
"3 600884.SH 20250327 7.13 1.9769 2.1153 \n",
"4 688325.SH 20250327 47.26 1.5250 1.8078 \n",
"... ... ... ... ... ... \n",
"26946 688539.SH 20250325 26.70 1.0257 1.3011 \n",
"26947 688479.SH 20250325 18.73 0.9840 1.2588 \n",
"26948 000552.SZ 20250325 2.63 1.8147 3.0665 \n",
"26949 688719.SH 20250325 31.64 4.2998 5.1737 \n",
"26950 002709.SZ 20250325 19.50 1.2468 1.4268 \n",
2025-02-12 00:21:33 +08:00
"\n",
2025-03-31 23:08:03 +08:00
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 0.79 29.3625 23.3887 2.5786 3.2807 2.9727 1.8582 \n",
"1 0.79 17.8968 27.7940 1.7060 1.8591 1.6666 1.6823 \n",
"2 1.87 18.9266 18.3213 3.2891 2.5755 2.4322 NaN \n",
"3 0.52 20.9930 NaN 0.7305 0.8425 0.9106 2.7224 \n",
"4 0.93 67.1638 50.1073 2.3433 16.1029 10.2149 NaN \n",
"... ... ... ... ... ... ... ... \n",
"26946 0.56 51.5254 83.3548 2.8475 14.5500 13.9718 NaN \n",
"26947 0.61 23.5448 33.4921 1.4043 3.6736 4.5444 NaN \n",
"26948 1.42 8.0989 11.6324 0.8431 1.2501 1.3463 3.8023 \n",
"26949 1.64 26.3323 49.9921 2.0474 4.4195 3.6954 NaN \n",
"26950 0.76 19.7447 78.2248 2.9106 2.4233 3.0741 1.5444 \n",
2025-02-12 00:21:33 +08:00
"\n",
2025-03-31 23:08:03 +08:00
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 1.8582 99844.2611 99844.2611 40955.5563 1.042374e+06 \n",
"1 1.6823 40113.0603 40113.0603 13863.2102 6.281705e+05 \n",
"2 NaN 40100.0000 4982.8436 4982.8436 1.525003e+06 \n",
"3 2.7224 225339.6168 175723.6492 164220.4548 1.606671e+06 \n",
"4 NaN 8494.7740 3830.4117 3231.0886 4.014630e+05 \n",
"... ... ... ... ... ... \n",
"26946 NaN 18592.0000 10286.0800 8109.0800 4.964064e+05 \n",
"26947 NaN 14431.7400 6087.4224 4758.2224 2.703065e+05 \n",
"26948 3.8023 535180.1936 372577.7383 220477.9354 1.407524e+06 \n",
"26949 NaN 11538.5418 7349.9938 6108.5305 3.650795e+05 \n",
"26950 1.5444 191434.3762 138501.6891 121034.9868 3.732970e+06 \n",
2025-02-12 00:21:33 +08:00
"\n",
2025-03-31 23:08:03 +08:00
" circ_mv is_st \n",
"0 1.042374e+06 False \n",
"1 6.281705e+05 False \n",
"2 1.894975e+05 False \n",
"3 1.252910e+06 False \n",
"4 1.810253e+05 False \n",
"... ... ... \n",
"26946 2.746383e+05 False \n",
"26947 1.140174e+05 False \n",
"26948 9.798795e+05 False \n",
"26949 2.325538e+05 False \n",
"26950 2.700783e+06 False \n",
2025-02-12 00:21:33 +08:00
"\n",
2025-03-31 23:08:03 +08:00
"[26951 rows x 19 columns]\n"
2025-02-12 00:21:33 +08:00
]
}
],
2025-03-31 23:08:03 +08:00
"execution_count": 5
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
"id": "28cb78d032671b20",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-03-31 23:08:03 +08:00
"end_time": "2025-03-30T16:43:08.000073Z",
"start_time": "2025-03-30T16:43:07.984082Z"
2025-02-12 00:21:33 +08:00
}
},
2025-03-31 23:08:03 +08:00
"source": [
"# Show only the rows whose is_st flag is True\n",
"print(all_daily_data_df[all_daily_data_df['is_st']])"
],
2025-02-12 00:21:33 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2025-03-31 23:08:03 +08:00
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"100 002528.SZ 20250327 2.53 0.6855 1.4642 \n",
"128 300163.SZ 20250327 3.15 3.0563 3.2999 \n",
"129 300205.SZ 20250327 4.34 0.9211 1.5246 \n",
"147 000851.SZ 20250327 2.53 2.2990 2.6472 \n",
"299 300097.SZ 20250327 4.88 3.1648 3.6912 \n",
"... ... ... ... ... ... \n",
"26750 000506.SZ 20250325 5.21 1.2689 1.8939 \n",
"26770 002592.SZ 20250325 5.22 1.0547 1.6712 \n",
"26786 600603.SH 20250325 7.63 0.4610 1.0776 \n",
"26828 002528.SZ 20250325 2.51 0.9799 2.0928 \n",
"26906 300097.SZ 20250325 4.92 3.2717 3.8159 \n",
2025-02-12 00:21:33 +08:00
"\n",
2025-03-31 23:08:03 +08:00
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"100 0.43 NaN NaN 7.3528 2.1714 2.7257 0.0000 \n",
"128 0.87 NaN NaN 3.0547 5.9187 5.8999 0.0000 \n",
"129 0.63 94.7108 NaN 1.3743 1.0976 1.5538 0.4608 \n",
"147 0.64 NaN NaN 1.0360 0.4939 0.8666 0.0000 \n",
"299 0.70 10.0614 NaN 2.2055 2.9549 3.1999 0.0000 \n",
"... ... ... ... ... ... ... ... \n",
"26750 0.37 725.4828 NaN 8.2869 17.0204 21.9262 0.0000 \n",
"26770 0.94 14.0192 61.1217 1.6387 2.7253 2.3121 0.0000 \n",
"26786 0.56 15.6086 24.2223 1.3160 1.8461 2.4398 0.0000 \n",
"26828 0.58 NaN NaN 7.2947 2.1542 2.7042 0.0000 \n",
"26906 0.53 10.1438 NaN 2.2236 2.9791 3.2261 0.0000 \n",
2025-02-12 00:21:33 +08:00
"\n",
2025-03-31 23:08:03 +08:00
" dv_ttm total_share float_share free_share total_mv circ_mv \\\n",
"100 NaN 119867.5082 105021.9577 49171.2582 303264.7957 265705.5530 \n",
"128 NaN 47400.0000 41596.4553 38525.5904 149310.0000 131028.8342 \n",
"129 0.4608 43005.6000 42599.1218 25737.4813 186644.3040 184880.1886 \n",
"147 NaN 115786.0020 113197.7266 98311.5254 292938.5851 286390.2483 \n",
"299 NaN 28854.9669 27000.9948 23150.5534 140812.2385 131764.8546 \n",
"... ... ... ... ... ... ... \n",
"26750 NaN 92901.7761 92867.0961 62218.8027 484018.2535 483837.5707 \n",
"26770 NaN 28333.1157 26271.6370 16580.1814 147898.8640 137137.9451 \n",
"26786 NaN 119332.9151 119332.9151 51048.6002 910510.1422 910510.1422 \n",
"26828 NaN 119867.5082 105021.9577 49171.2582 300867.4456 263605.1138 \n",
"26906 NaN 28854.9669 27000.9948 23150.5534 141966.4371 132844.8944 \n",
2025-02-12 00:21:33 +08:00
"\n",
2025-03-31 23:08:03 +08:00
" is_st \n",
"100 True \n",
"128 True \n",
"129 True \n",
"147 True \n",
"299 True \n",
"... ... \n",
"26750 True \n",
"26770 True \n",
"26786 True \n",
"26828 True \n",
"26906 True \n",
2025-02-12 00:21:33 +08:00
"\n",
2025-03-31 23:08:03 +08:00
"[540 rows x 19 columns]\n"
2025-02-12 00:21:33 +08:00
]
}
],
2025-03-31 23:08:03 +08:00
"execution_count": 6
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
"id": "692b58674b7462c9",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-03-31 23:08:03 +08:00
"end_time": "2025-03-30T16:43:08.703938Z",
"start_time": "2025-03-30T16:43:08.021067Z"
2025-02-12 00:21:33 +08:00
}
},
2025-03-31 23:08:03 +08:00
"source": [
"# Append only rows newer than what is already stored, then advance max_date, so\n",
"# re-running this cell cannot write the same trade dates twice.\n",
"rows_to_append = all_daily_data_df[all_daily_data_df['trade_date'] > max_date]\n",
"if not rows_to_append.empty:\n",
"    # Saved in HDF5 table format, appended to the existing daily_basic node\n",
"    rows_to_append.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"    max_date = rows_to_append['trade_date'].max()\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
],
2025-02-12 00:21:33 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有每日基础数据获取并保存完毕!\n"
]
}
],
2025-03-31 23:08:03 +08:00
"execution_count": 7
2025-02-12 00:21:33 +08:00
},
{
2025-02-15 23:33:34 +08:00
"cell_type": "code",
"id": "d7a773fc20293477",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
2025-03-31 23:08:03 +08:00
"end_time": "2025-03-30T16:43:15.188800Z",
"start_time": "2025-03-30T16:43:08.725449Z"
2025-02-12 00:21:33 +08:00
}
},
2025-03-31 23:08:03 +08:00
"source": [
"# Read the table back after the append and print a summary of three of its columns\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
" print(df.info())"
],
2025-02-12 00:21:33 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
2025-03-31 23:08:03 +08:00
"Index: 8480556 entries, 0 to 26950\n",
2025-02-12 00:21:33 +08:00
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
2025-03-31 23:08:03 +08:00
"memory usage: 202.2+ MB\n",
2025-02-12 00:21:33 +08:00
"None\n"
]
}
],
2025-03-31 23:08:03 +08:00
"execution_count": 8
2025-02-12 00:21:33 +08:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2025-03-31 23:08:03 +08:00
"version": "3.11.11"
2025-02-12 00:21:33 +08:00
}
},
"nbformat": 4,
"nbformat_minor": 5
}