Files
NewStock/code/data/update/update_daily_basic.ipynb

425 lines
16 KiB
Plaintext
Raw Normal View History

2025-02-12 00:21:33 +08:00
{
"cells": [
{
"cell_type": "code",
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-11T15:43:54.745322Z",
"start_time": "2025-02-11T15:43:53.837662Z"
}
},
"source": [
"import tushare as ts\n",
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
"pro = ts.pro_api()"
],
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-11T15:53:08.235573Z",
"start_time": "2025-02-11T15:53:07.753701Z"
}
},
"cell_type": "code",
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
" target_date = datetime.strptime(target_date, '%Y%m%d')\n",
" if stock_code not in name_change_dict.keys():\n",
" return False\n",
" df = name_change_dict[stock_code]\n",
" for i in range(len(df)):\n",
" sds = df.iloc[i, 2]\n",
" eds = df.iloc[i, 3]\n",
" if eds is None or eds is pd.NaT:\n",
" eds = datetime.now()\n",
" if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
" return True\n",
" return False\n",
"\n",
"name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# 确保 name_change_df 的日期格式正确\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"name_change_df = name_change_df[name_change_df.name.str.contains('ST')]\n",
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
" # 只保留 'ST' 和 '*ST' 的记录\n",
" st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
" if not st_data.empty:\n",
" name_change_dict[ts_code] = st_data"
],
"id": "14671a7f72de2564",
"outputs": [],
"execution_count": 31
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-11T15:53:19.812860Z",
"start_time": "2025-02-11T15:53:09.614377Z"
}
},
"cell_type": "code",
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"h5_filename = '../../../data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"max_date = None\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date']]\n",
" print(df.info())\n",
" max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250220')\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
],
"id": "e7f8cce2f80e2f20",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8295494 entries, 0 to 8295493\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 189.9+ MB\n",
"None\n",
"20250210\n",
"20250211\n"
]
}
],
"execution_count": 32
},
{
"cell_type": "code",
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"scrolled": true,
"ExecuteTime": {
"end_time": "2025-02-11T15:53:24.100612Z",
"start_time": "2025-02-11T15:53:22.361257Z"
}
},
"source": [
"\n",
"\n",
"# 使用 HDFStore 存储数据\n",
"all_daily_data = []\n",
"\n",
"# API 调用计数和时间控制变量\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
" daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
" if daily_basic_data is not None and not daily_basic_data.empty:\n",
" # 添加交易日期列标识\n",
" daily_basic_data['trade_date'] = trade_date\n",
" daily_basic_data['is_st'] = daily_basic_data.apply(\n",
" lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n",
" )\n",
" time.sleep(0.2)\n",
" # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
" return daily_basic_data\n",
"\n",
"\n",
"# 遍历每个交易日期并获取数据\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
"\n",
" for future in as_completed(future_to_date):\n",
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
" try:\n",
" result = future.result() # 获取任务执行的结果\n",
" all_daily_data.append(result)\n",
" print(f\"任务 {trade_date} 完成\")\n",
" except Exception as e:\n",
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
" # 计数一次 API 调用\n",
" api_call_count += 1\n",
"\n",
" # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n",
" if api_call_count % 150 == 0:\n",
" elapsed = time.time() - batch_start_time\n",
" if elapsed < 60:\n",
" sleep_time = 60 - elapsed\n",
" print(f\"已调用 150 次 API等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
" time.sleep(sleep_time)\n",
" # 重置批次起始时间\n",
" batch_start_time = time.time()\n",
"\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20250220 完成\n",
"任务 20250219 完成\n",
"任务 20250218 完成\n",
"任务 20250217 完成\n",
"任务 20250214 完成\n",
"任务 20250213 完成\n",
"任务 20250212 完成\n",
"任务 20250211 完成\n"
]
}
],
"execution_count": 33
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-11T15:53:25.913933Z",
"start_time": "2025-02-11T15:53:25.902629Z"
}
},
"cell_type": "code",
"source": [
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
],
"id": "919023c693d7a47a",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 002512.SZ 20250211 5.03 5.9759 7.8713 \n",
"1 600966.SH 20250211 4.83 0.6904 1.3494 \n",
"2 600358.SH 20250211 3.68 8.5826 11.3780 \n",
"3 002893.SZ 20250211 9.73 1.9217 2.6415 \n",
"4 300648.SZ 20250211 22.90 1.7775 2.3188 \n",
"... ... ... ... ... ... \n",
"5380 300886.SZ 20250211 21.80 8.9341 13.4176 \n",
"5381 600050.SH 20250211 5.48 2.3899 5.6722 \n",
"5382 300149.SZ 20250211 6.73 3.5271 5.3077 \n",
"5383 002197.SZ 20250211 4.42 4.0058 4.6595 \n",
"5384 688270.SH 20250211 37.34 2.9212 2.9212 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 0.87 NaN NaN 12.8888 2.9340 3.0625 0.0000 \n",
"1 1.16 35.5101 15.2315 0.9534 0.3454 0.3402 0.5633 \n",
"2 1.38 NaN NaN 15.2661 3.4220 4.2041 0.0000 \n",
"3 0.85 48.9883 41.5405 2.2074 2.3641 2.3637 0.8222 \n",
"4 0.69 NaN NaN 4.1442 3.7325 3.3186 0.0000 \n",
"... ... ... ... ... ... ... ... \n",
"5380 3.00 NaN 111.0678 2.9043 6.0326 4.9204 0.0000 \n",
"5381 1.15 21.3231 19.5079 1.0668 0.4677 0.4574 2.6625 \n",
"5382 1.34 NaN NaN 2.5009 2.9440 3.3158 0.0000 \n",
"5383 1.41 NaN NaN 1.1195 2.0851 2.5837 0.0000 \n",
"5384 0.75 110.2738 170.0477 3.7594 28.4642 27.3030 NaN \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 NaN 1.147095e+05 1.048455e+05 7.959795e+04 5.769885e+05 \n",
"1 0.5633 1.336844e+05 1.336844e+05 6.839785e+04 6.456958e+05 \n",
"2 NaN 5.049367e+04 5.049367e+04 3.808829e+04 1.858167e+05 \n",
"3 0.8222 2.636400e+04 2.027786e+04 1.475173e+04 2.565217e+05 \n",
"4 NaN 1.477839e+04 1.061894e+04 8.140048e+03 3.384251e+05 \n",
"... ... ... ... ... ... \n",
"5380 NaN 7.455500e+03 4.346405e+03 2.894040e+03 1.625299e+05 \n",
"5381 2.6625 3.180058e+06 3.128014e+06 1.317969e+06 1.742672e+07 \n",
"5382 NaN 4.979640e+04 4.970844e+04 3.303210e+04 3.351298e+05 \n",
"5383 NaN 6.143629e+04 5.340007e+04 4.590857e+04 2.715484e+05 \n",
"5384 NaN 2.140516e+04 1.442317e+04 1.442317e+04 7.992687e+05 \n",
"\n",
" circ_mv is_st \n",
"0 5.273728e+05 False \n",
"1 6.456958e+05 False \n",
"2 1.858167e+05 True \n",
"3 1.973036e+05 False \n",
"4 2.431738e+05 False \n",
"... ... ... \n",
"5380 9.475163e+04 False \n",
"5381 1.714152e+07 False \n",
"5382 3.345378e+05 False \n",
"5383 2.360283e+05 True \n",
"5384 5.385612e+05 False \n",
"\n",
"[5385 rows x 19 columns]\n"
]
}
],
"execution_count": 34
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-11T15:53:42.062142Z",
"start_time": "2025-02-11T15:53:42.044324Z"
}
},
"cell_type": "code",
"source": "print(all_daily_data_df[all_daily_data_df['is_st']])",
"id": "28cb78d032671b20",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"2 600358.SH 20250211 3.68 8.5826 11.3780 \n",
"20 000889.SZ 20250211 2.48 2.0846 2.8167 \n",
"50 603879.SH 20250211 3.58 1.7126 2.7285 \n",
"62 002024.SZ 20250211 1.99 0.2997 0.8575 \n",
"65 600078.SH 20250211 5.77 1.0536 1.8102 \n",
"... ... ... ... ... ... \n",
"5327 688309.SH 20250211 13.80 0.5594 1.0928 \n",
"5328 002800.SZ 20250211 10.57 2.0449 3.9025 \n",
"5342 300368.SZ 20250211 4.50 1.5755 2.2505 \n",
"5375 600515.SH 20250211 3.64 0.4111 0.6804 \n",
"5383 002197.SZ 20250211 4.42 4.0058 4.6595 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"2 1.38 NaN NaN 15.2661 3.4220 4.2041 0.0000 \n",
"20 1.08 NaN NaN 20.6126 1.6250 1.6047 0.0000 \n",
"50 1.08 NaN NaN 3.4116 3.8093 3.5391 0.0000 \n",
"62 1.01 NaN NaN 1.5246 0.2944 0.3546 0.0000 \n",
"65 0.97 NaN NaN 2.1866 1.2329 1.2311 0.5373 \n",
"... ... ... ... ... ... ... ... \n",
"5327 0.73 60.8452 186.0174 1.5353 6.7361 13.4432 NaN \n",
"5328 0.72 NaN NaN 3.0468 1.6938 1.3629 0.0000 \n",
"5342 0.99 NaN NaN 7.1301 6.7544 11.8519 0.0000 \n",
"5375 0.91 43.6494 110.6536 1.7765 6.1506 7.8214 0.0000 \n",
"5383 1.41 NaN NaN 1.1195 2.0851 2.5837 0.0000 \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"2 NaN 5.049367e+04 50493.6660 38088.2934 1.858167e+05 \n",
"20 NaN 9.362911e+04 86984.9676 64375.7658 2.322002e+05 \n",
"50 NaN 3.593444e+04 35934.4440 22555.6496 1.286453e+05 \n",
"62 NaN 9.264768e+05 919834.5068 321453.1001 1.843689e+06 \n",
"65 0.5373 6.625729e+04 66257.2861 38563.8247 3.823045e+05 \n",
"... ... ... ... ... ... \n",
"5327 NaN 8.001073e+03 8001.0733 4095.6641 1.104148e+05 \n",
"5328 NaN 1.522531e+04 14165.4100 7422.5200 1.609315e+05 \n",
"5342 NaN 5.289435e+04 52894.3475 37030.2475 2.380246e+05 \n",
"5375 NaN 1.142531e+06 917601.2508 554411.0843 4.158813e+06 \n",
"5383 NaN 6.143629e+04 53400.0687 45908.5733 2.715484e+05 \n",
"\n",
" circ_mv is_st \n",
"2 1.858167e+05 True \n",
"20 2.157227e+05 True \n",
"50 1.286453e+05 True \n",
"62 1.830471e+06 True \n",
"65 3.823045e+05 True \n",
"... ... ... \n",
"5327 1.104148e+05 True \n",
"5328 1.497284e+05 True \n",
"5342 2.380246e+05 True \n",
"5375 3.340069e+06 True \n",
"5383 2.360283e+05 True \n",
"\n",
"[318 rows x 19 columns]\n"
]
}
],
"execution_count": 37
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-11T15:53:33.693894Z",
"start_time": "2025-02-11T15:53:33.609884Z"
}
},
"cell_type": "code",
"source": [
"# 将数据保存为 HDF5 文件table 格式)\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
],
"id": "692b58674b7462c9",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有每日基础数据获取并保存完毕!\n"
]
}
],
"execution_count": 36
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-11T15:54:27.868021Z",
"start_time": "2025-02-11T15:54:18.853803Z"
}
},
"cell_type": "code",
"source": [
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
" print(df.info())"
],
"id": "d7a773fc20293477",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8300879 entries, 0 to 5384\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
"memory usage: 197.9+ MB\n",
"None\n"
]
}
],
"execution_count": 39
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}