# %% [cell 1] Imports and Tushare client setup
import os
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import pandas as pd
import tushare as ts

# The API token must not live in the notebook. Export TUSHARE_TOKEN before
# starting the kernel. The token that used to be hard-coded here should be
# treated as leaked and rotated.
# NOTE(review): when the variable is unset we rely on ts.pro_api() picking up a
# token saved by an earlier ts.set_token() call on this machine - confirm
# against the tushare documentation.
_tushare_token = os.environ.get('TUSHARE_TOKEN')
if _tushare_token:
    ts.set_token(_tushare_token)
pro = ts.pro_api()

# %% [cell 2] ST-period lookup built from the name-change history
# Blanket suppression kept from the original notebook so cell output stays quiet.
warnings.filterwarnings("ignore")


def filter_rows(df):
    """Keep one row per (name, start_date) pair, preferring rows with an end date.

    Within each pair the first row whose ``end_date`` is set wins; if no row
    has an end date, the first row of the pair is kept. Rows with a missing
    ``name`` or ``start_date`` are dropped. The result is ordered by
    (name, start_date) and carries a fresh 0..n-1 index.

    Implemented with ``drop_duplicates`` rather than ``groupby.apply`` so the
    output always keeps every input column with its original dtype.

    Args:
        df: DataFrame with at least ``name``, ``start_date`` and ``end_date``.

    Returns:
        A new, de-duplicated DataFrame; ``df`` itself is not modified.
    """
    keys = ['name', 'start_date']
    has_end = df['end_date'].notna()
    # Put rows with an end date first (original order kept inside each half)
    # so that keep='first' below selects them whenever they exist.
    closed_first = pd.concat([df[has_end], df[~has_end]])
    return (
        closed_first
        .dropna(subset=keys)
        .drop_duplicates(subset=keys, keep='first')
        .sort_values(keys)
        .reset_index(drop=True)
    )


def is_st(name_change_dict, stock_code, target_date):
    """Return True if ``stock_code`` is inside a recorded ST period on ``target_date``.

    Args:
        name_change_dict: Mapping of stock code to a DataFrame of ST periods
            with ``start_date`` and ``end_date`` columns.
        stock_code: Code to look up; unknown codes yield ``False``.
        target_date: Date string in ``YYYYMMDD`` format.

    Returns:
        ``True`` when some period satisfies start <= target <= end (whole-day
        comparison). A missing end date means the period is still open, so
        every date on or after its start matches.

    Raises:
        ValueError: If ``target_date`` is not a valid ``YYYYMMDD`` string.
    """
    target = datetime.strptime(target_date, '%Y%m%d')
    periods = name_change_dict.get(stock_code)
    if periods is None:
        return False
    # Columns are addressed by name so the lookup does not depend on the
    # column order of the stored name-change table.
    for period_start, period_end in zip(periods['start_date'], periods['end_date']):
        if pd.isna(period_start) or target < period_start:
            continue
        if pd.isna(period_end) or (target - period_end).days <= 0:
            return True
    return False


name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')
name_change_df = name_change_df.drop_duplicates(keep='first')

# Parse the date columns; unparseable or missing end dates become NaT.
name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')
name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')
name_change_df = name_change_df[name_change_df['name'].str.contains('ST', na=False)]

# Only changes whose reason is 'ST' or '*ST' describe an ST period.
_st_changes = name_change_df[name_change_df['change_reason'].isin(['ST', '*ST'])]
name_change_dict = {
    ts_code: filter_rows(group)
    for ts_code, group in _st_changes.groupby('ts_code')
}

# %% [cell 3] Find the trading days that are missing from the local store
h5_filename = '../../../data/daily_basic.h5'
key = '/daily_basic'
max_date = None
with pd.HDFStore(h5_filename, mode='r') as store:
    df = store[key][['ts_code', 'trade_date']]
    df.info()  # info() prints by itself and returns None, so it is not wrapped in print()
    max_date = df['trade_date'].max()

print(max_date)
# Ask for the calendar up to today instead of a fixed date, so the notebook
# keeps working over time and never requests days that have not happened yet.
_today = datetime.now().strftime('%Y%m%d')
trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date=_today)
trade_cal = trade_cal[trade_cal['is_open'] == 1]  # keep trading days only
trade_dates = sorted(trade_cal.loc[trade_cal['cal_date'] > max_date, 'cal_date'])
# min() on an empty list raises, so an up-to-date store is handled explicitly.
start_date = trade_dates[0] if trade_dates else None
print(start_date)

# %% [cell 4] Download daily_basic for every missing trading day
# Per-day frames collected by the download loop that follows.
all_daily_data = []
import threading

# API call bookkeeping shared by the worker threads.
MAX_CALLS_PER_BATCH = 150      # calls allowed before the window check
BATCH_WINDOW_SECONDS = 60      # minimum duration of one batch of calls
api_call_count = 0
batch_start_time = time.time()
_rate_limit_lock = threading.Lock()


def _reserve_api_call():
    """Block until the calling worker is allowed to issue one more API request.

    After every ``MAX_CALLS_PER_BATCH`` calls, waits out the remainder of
    ``BATCH_WINDOW_SECONDS`` before the next call may start. The wait happens
    while the lock is held, so all worker threads pause together.
    """
    global api_call_count, batch_start_time
    with _rate_limit_lock:
        if api_call_count and api_call_count % MAX_CALLS_PER_BATCH == 0:
            elapsed = time.time() - batch_start_time
            if elapsed < BATCH_WINDOW_SECONDS:
                sleep_time = BATCH_WINDOW_SECONDS - elapsed
                print(f"已调用 {MAX_CALLS_PER_BATCH} 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...")
                time.sleep(sleep_time)
            # Start timing the next batch.
            batch_start_time = time.time()
        api_call_count += 1


def get_data(trade_date):
    """Fetch ``daily_basic`` for one trading day and flag ST stocks.

    Args:
        trade_date: Trading day as a ``YYYYMMDD`` string.

    Returns:
        Whatever ``pro.daily_basic`` returned. When that result is a non-empty
        DataFrame, its ``trade_date`` column is set to ``trade_date`` and a
        boolean ``is_st`` column is added.
    """
    # Throttle inside the worker: pausing the main thread would not stop the
    # pool from issuing further requests.
    _reserve_api_call()
    daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)
    if daily_basic_data is not None and not daily_basic_data.empty:
        # Stamp every row with the requested trading day.
        daily_basic_data['trade_date'] = trade_date
        # Every row shares the same date, so only ts_code has to be mapped.
        daily_basic_data['is_st'] = daily_basic_data['ts_code'].map(
            lambda code: is_st(name_change_dict, code, trade_date)
        )
    time.sleep(0.2)  # short pause between consecutive requests of one worker
    return daily_basic_data


# Fetch every pending trading day with a small thread pool.
with ThreadPoolExecutor(max_workers=2) as executor:
    future_to_date = {executor.submit(get_data, td): td for td in trade_dates}

    for future in as_completed(future_to_date):
        trade_date = future_to_date[future]  # the day this future was fetching
        try:
            result = future.result()
        except Exception as e:
            print(f"获取 {trade_date} 数据时出错: {e}")
            continue
        # Days without data (e.g. not published yet) are reported and left out
        # instead of being appended as None or empty frames.
        if result is None or result.empty:
            print(f"{trade_date} 无数据,已跳过")
            continue
        all_daily_data.append(result)
        print(f"任务 {trade_date} 完成")

# %% [cell 5] Combine the per-day frames
if all_daily_data:
    all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still has the columns the following cells read.
    all_daily_data_df = pd.DataFrame({
        'ts_code': pd.Series(dtype=object),
        'trade_date': pd.Series(dtype=object),
        'is_st': pd.Series(dtype=bool),
    })
print(all_daily_data_df)
# %% [cell 6] Show the rows that were flagged as ST
# Boolean mask taken from the 'is_st' column selects the flagged rows.
st_mask = all_daily_data_df['is_st']
print(all_daily_data_df.loc[st_mask])
\n", "5261 1.67 NaN NaN 5.6207 4.0072 4.0072 0.0000 \n", "5273 1.06 26.7044 26.7044 1.3118 0.2871 0.2871 0.0000 \n", "5298 0.38 NaN NaN 3.5130 1.0396 1.0348 0.0000 \n", "5337 2.28 NaN NaN 3.2963 20.7089 9.4391 0.0000 \n", "5370 0.86 NaN NaN 4.2696 32.8078 24.2873 0.0000 \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "20 NaN 294145.6200 167582.4530 122004.3211 5.118134e+05 \n", "21 NaN 41971.5446 41971.5446 26785.1109 1.762805e+05 \n", "88 NaN 260296.1826 146776.2912 132325.9245 8.719922e+05 \n", "124 5.2053 85594.2012 69415.3353 48807.3173 6.505159e+05 \n", "136 0.0829 6000.0000 2060.9250 1500.0000 2.172000e+05 \n", "... ... ... ... ... ... \n", "5261 NaN 35934.4440 35934.4440 22993.7696 1.484093e+05 \n", "5273 NaN 926476.7618 925444.1318 340007.5385 1.630599e+06 \n", "5298 NaN 59596.0158 59593.9625 29654.2988 2.640103e+05 \n", "5337 NaN 26252.0973 26252.0973 13899.8888 1.451741e+05 \n", "5370 NaN 29328.8133 29325.3240 23747.3240 2.343372e+05 \n", "\n", " circ_mv is_st \n", "20 2.915935e+05 True \n", "21 1.762805e+05 True \n", "88 4.917006e+05 True \n", "124 5.275565e+05 True \n", "136 7.460549e+04 True \n", "... ... ... 
# %% [cell 7] Append the new rows to the HDF5 store (table format)
# Re-running this cell must not duplicate data, so rows whose trade_date is
# already stored are filtered out before appending.
# NOTE(review): select_column needs 'trade_date' to be a data column; the
# append below writes with data_columns=True - confirm the existing table was
# created the same way.
with pd.HDFStore(h5_filename, mode='r') as store:
    stored_dates = set(store.select_column(key, 'trade_date').unique())

rows_to_append = all_daily_data_df[~all_daily_data_df['trade_date'].isin(stored_dates)]

if rows_to_append.empty:
    print("没有需要追加的新数据,已跳过写入。")
else:
    rows_to_append.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)
    print("所有每日基础数据获取并保存完毕!")

# %% [cell 8] Read the store back as a sanity check
with pd.HDFStore(h5_filename, mode='r') as store:
    df = store[key][['ts_code', 'trade_date', 'is_st']]
    df.info()  # info() prints by itself and returns None, so it is not wrapped in print()