Files
NewStock/main/data/update/update_daily_basic.ipynb
2025-06-02 22:23:44 +08:00

475 lines
18 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:57:36.913044Z",
"start_time": "2025-04-09T14:57:36.159612Z"
}
},
"outputs": [],
"source": [
"import tushare as ts\n",
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
"pro = ts.pro_api()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "14671a7f72de2564",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:57:39.128278Z",
"start_time": "2025-04-09T14:57:36.918051Z"
}
},
"outputs": [],
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"def filter_rows(df):\n",
" # 按照 name 和 start_date 分组\n",
" def select_row(group):\n",
" # 如果有 end_date 不为 NaT 的行,优先保留这些行\n",
" valid_rows = group[group['end_date'].notna()]\n",
" if not valid_rows.empty:\n",
" return valid_rows.iloc[0] # 返回第一个有效行\n",
" else:\n",
" return group.iloc[0] # 如果没有有效行,返回第一行\n",
"\n",
" filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n",
" filtered_df = filtered_df.reset_index(drop=True)\n",
" return filtered_df\n",
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
" target_date = datetime.strptime(target_date, '%Y%m%d')\n",
" if stock_code not in name_change_dict.keys():\n",
" return False\n",
" df = name_change_dict[stock_code]\n",
" for i in range(len(df)):\n",
" sds = df.iloc[i, 2]\n",
" eds = df.iloc[i, 3]\n",
" if eds is None or eds is pd.NaT:\n",
" eds = datetime.now()\n",
" if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
" return True\n",
" return False\n",
"\n",
"name_change_df = pd.read_hdf('/mnt/d/PyProject/NewStock/data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# 确保 name_change_df 的日期格式正确\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"# name_change_df = name_change_df[name_change_df.name.str.contains('ST') ]\n",
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
" # 只保留 'ST' 和 '*ST' 的记录\n",
" # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
" st_data = group[(group['name'].str.contains('ST')) | (group['name'].str.contains('退'))]\n",
" if not st_data.empty:\n",
" name_change_dict[ts_code] = filter_rows(st_data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e7f8cce2f80e2f20",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:09.296046Z",
"start_time": "2025-04-09T14:57:39.339423Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8674588 entries, 0 to 26945\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 198.5+ MB\n",
"None\n",
"20250523\n",
"20250526\n"
]
}
],
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"h5_filename = '/mnt/d/PyProject/NewStock/data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"max_date = None\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date']]\n",
" print(df.info())\n",
" max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250720')\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.817010Z",
"start_time": "2025-04-09T14:58:09.326485Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20250718 完成\n",
"任务 20250717 完成\n",
"任务 20250716 完成\n",
"任务 20250715 完成\n",
"任务 20250714 完成\n",
"任务 20250711 完成\n",
"任务 20250709 完成\n",
"任务 20250710 完成\n",
"任务 20250708 完成\n",
"任务 20250707 完成\n",
"任务 20250704 完成\n",
"任务 20250703 完成\n",
"任务 20250702 完成\n",
"任务 20250701 完成\n",
"任务 20250630 完成\n",
"任务 20250627 完成\n",
"任务 20250626 完成\n",
"任务 20250625 完成\n",
"任务 20250624 完成\n",
"任务 20250623 完成\n",
"任务 20250620 完成\n",
"任务 20250619 完成\n",
"任务 20250618 完成\n",
"任务 20250617 完成\n",
"任务 20250616 完成\n",
"任务 20250613 完成\n",
"任务 20250612 完成\n",
"任务 20250611 完成\n",
"任务 20250610 完成\n",
"任务 20250609 完成\n",
"任务 20250606 完成\n",
"任务 20250605 完成\n",
"任务 20250603 完成\n",
"任务 20250604 完成\n",
"任务 20250530 完成\n",
"任务 20250529 完成\n",
"任务 20250528 完成\n",
"任务 20250527 完成\n",
"任务 20250526 完成\n"
]
}
],
"source": [
"\n",
"\n",
"# 使用 HDFStore 存储数据\n",
"all_daily_data = []\n",
"\n",
"# API 调用计数和时间控制变量\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
" daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
" if daily_basic_data is not None and not daily_basic_data.empty:\n",
" # 添加交易日期列标识\n",
" daily_basic_data['trade_date'] = trade_date\n",
" daily_basic_data['is_st'] = daily_basic_data.apply(\n",
" lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n",
" )\n",
" time.sleep(0.2)\n",
" # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
" return daily_basic_data\n",
"\n",
"\n",
"# 遍历每个交易日期并获取数据\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
"\n",
" for future in as_completed(future_to_date):\n",
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
" try:\n",
" result = future.result() # 获取任务执行的结果\n",
" all_daily_data.append(result)\n",
" print(f\"任务 {trade_date} 完成\")\n",
" except Exception as e:\n",
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
" # 计数一次 API 调用\n",
" api_call_count += 1\n",
"\n",
" # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n",
" if api_call_count % 150 == 0:\n",
" elapsed = time.time() - batch_start_time\n",
" if elapsed < 60:\n",
" sleep_time = 60 - elapsed\n",
" print(f\"已调用 150 次 API等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
" time.sleep(sleep_time)\n",
" # 重置批次起始时间\n",
" batch_start_time = time.time()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "919023c693d7a47a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.864178Z",
"start_time": "2025-04-09T14:58:16.855084Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 603990.SH 20250530 14.96 3.7919 4.9168 \n",
"1 603666.SH 20250530 33.72 2.4954 4.7137 \n",
"2 001339.SZ 20250530 45.78 7.0710 7.0710 \n",
"3 002006.SZ 20250530 16.67 2.4368 3.4806 \n",
"4 603353.SH 20250530 15.21 1.3567 4.1316 \n",
"... ... ... ... ... ... \n",
"26918 002670.SZ 20250526 11.86 0.7662 2.3092 \n",
"26919 839946.BJ 20250526 9.67 4.8520 6.8863 \n",
"26920 688076.SH 20250526 49.59 5.9483 9.5054 \n",
"26921 300519.SZ 20250526 14.44 2.4601 3.8976 \n",
"26922 300468.SZ 20250526 18.15 6.8275 8.8410 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm \\\n",
"0 0.65 NaN NaN 5.5665 9.8735 11.0137 \n",
"1 1.15 NaN NaN 3.2133 11.8990 10.3525 \n",
"2 1.22 91.7742 74.3709 5.3909 2.8419 2.7478 \n",
"3 0.81 58.9666 65.5384 3.6508 5.0124 5.4591 \n",
"4 1.10 90.1163 80.8019 1.5917 0.9380 0.9517 \n",
"... ... ... ... ... ... ... \n",
"26918 0.75 137.0866 106.8454 2.0610 15093.0115 14821.3328 \n",
"26919 0.55 NaN NaN 5.7695 2.5489 2.4978 \n",
"26920 3.15 27.5757 22.7263 3.7628 6.8632 6.0784 \n",
"26921 1.14 45.8504 44.3443 2.7022 8.6318 8.8737 \n",
"26922 1.08 142.9746 150.8960 5.8350 13.0086 13.6702 \n",
"\n",
" dv_ratio dv_ttm total_share float_share free_share total_mv \\\n",
"0 0.0000 NaN 30628.2731 30628.2731 23620.5583 4.581990e+05 \n",
"1 0.0000 NaN 20649.0816 20649.0816 10931.3716 6.962870e+05 \n",
"2 0.2622 0.3498 25042.9670 7313.0995 7313.0995 1.146467e+06 \n",
"3 0.7749 0.7749 51979.3440 45516.0000 31865.7600 8.664957e+05 \n",
"4 0.6462 1.3036 17339.4000 17041.8000 5596.0000 2.637323e+05 \n",
"... ... ... ... ... ... ... \n",
"26918 0.0000 NaN 193508.4653 162335.0634 53860.6790 2.295010e+06 \n",
"26919 NaN NaN 13499.0443 9702.8595 6836.5574 1.305358e+05 \n",
"26920 NaN NaN 22487.0915 22487.0915 14071.9565 1.115135e+06 \n",
"26921 2.7701 2.7701 16000.0000 11410.0000 7201.9100 2.310400e+05 \n",
"26922 0.3306 0.3306 53064.9275 52979.4065 40913.5262 9.631284e+05 \n",
"\n",
" circ_mv is_st \n",
"0 4.581990e+05 False \n",
"1 6.962870e+05 False \n",
"2 3.347937e+05 False \n",
"3 7.587517e+05 False \n",
"4 2.592058e+05 False \n",
"... ... ... \n",
"26918 1.925294e+06 False \n",
"26919 9.382665e+04 False \n",
"26920 1.115135e+06 False \n",
"26921 1.647604e+05 False \n",
"26922 9.615762e+05 False \n",
"\n",
"[26923 rows x 19 columns]\n"
]
}
],
"source": [
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "28cb78d032671b20",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.881685Z",
"start_time": "2025-04-09T14:58:16.871184Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"16 300536.SZ 20250530 8.67 2.8854 3.5632 \n",
"78 000668.SZ 20250530 7.94 4.1498 7.0226 \n",
"112 002231.SZ 20250530 3.28 8.9944 10.0552 \n",
"147 300313.SZ 20250530 6.28 6.0110 12.4720 \n",
"158 603838.SH 20250530 5.73 0.9777 2.6542 \n",
"... ... ... ... ... ... \n",
"26733 603828.SH 20250526 4.98 0.9734 1.9562 \n",
"26751 600599.SH 20250526 7.46 2.5125 6.3118 \n",
"26785 000820.SZ 20250526 3.02 13.6997 14.0750 \n",
"26885 002005.SZ 20250526 1.77 0.3214 0.5145 \n",
"26905 603869.SH 20250526 6.15 0.3000 0.7946 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"16 0.55 NaN NaN 4.9112 10.9775 12.1174 0.0 \n",
"78 1.07 NaN NaN 1.6212 8.7361 5.6924 0.0 \n",
"112 0.74 NaN NaN 4.3227 3.9056 5.3690 0.0 \n",
"147 0.92 NaN NaN NaN 14.2840 13.5826 0.0 \n",
"158 1.06 NaN NaN 1.9039 6.4291 5.8279 0.0 \n",
"... ... ... ... ... ... ... ... \n",
"26733 0.56 345.783 1670.8958 3.9261 1.2065 1.3013 0.0 \n",
"26751 0.68 NaN NaN 11.2319 3.8238 3.9211 0.0 \n",
"26785 2.40 NaN NaN 12.4588 15.8309 20.1399 0.0 \n",
"26885 0.48 NaN NaN 15.9120 4.2066 4.2221 0.0 \n",
"26905 1.00 149.594 167.2545 0.8344 4.6640 5.0668 0.0 \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"16 NaN 29328.8133 29325.3240 23747.3240 254280.8113 \n",
"78 NaN 14684.1890 14684.1890 8677.2104 116592.4607 \n",
"112 NaN 34685.0017 29481.8767 26371.6067 113766.8056 \n",
"147 NaN 31297.7396 19735.2789 9511.5479 196549.8047 \n",
"158 NaN 32001.6000 32001.6000 11788.1468 183369.1680 \n",
"... ... ... ... ... ... \n",
"26733 NaN 59596.0158 59593.9625 29654.2988 296788.1587 \n",
"26751 NaN 16600.0000 16600.0000 6607.7948 123836.0000 \n",
"26785 NaN 64655.5179 29696.6877 28904.9696 195259.6641 \n",
"26885 NaN 175242.4858 175199.3158 109452.0915 310179.1999 \n",
"26905 NaN 50450.0508 50450.0508 19045.9689 310267.8124 \n",
"\n",
" circ_mv is_st \n",
"16 254250.5591 True \n",
"78 116592.4607 True \n",
"112 96700.5556 True \n",
"147 123937.5515 True \n",
"158 183369.1680 True \n",
"... ... ... \n",
"26733 296777.9333 True \n",
"26751 123836.0000 True \n",
"26785 89683.9969 True \n",
"26885 310102.7890 True \n",
"26905 310267.8124 True \n",
"\n",
"[944 rows x 19 columns]\n"
]
}
],
"source": [
"print(all_daily_data_df[all_daily_data_df['is_st']])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "692b58674b7462c9",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:17.773453Z",
"start_time": "2025-04-09T14:58:16.903459Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有每日基础数据获取并保存完毕!\n"
]
}
],
"source": [
"# 将数据保存为 HDF5 文件table 格式)\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d7a773fc20293477",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:24.305403Z",
"start_time": "2025-04-09T14:58:17.816332Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8701511 entries, 0 to 26922\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
"memory usage: 207.5+ MB\n",
"None\n"
]
}
],
"source": [
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
" print(df.info())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "stock",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}