{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2", "metadata": { "ExecuteTime": { "end_time": "2025-04-06T15:33:43.537483Z", "start_time": "2025-04-06T15:33:42.844004Z" } }, "outputs": [], "source": [ "import tushare as ts\n", "ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n", "pro = ts.pro_api()" ] }, { "cell_type": "code", "execution_count": 2, "id": "14671a7f72de2564", "metadata": { "ExecuteTime": { "end_time": "2025-04-06T15:33:45.387772Z", "start_time": "2025-04-06T15:33:43.537483Z" } }, "outputs": [], "source": [ "from datetime import datetime\n", "import pandas as pd\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "def filter_rows(df):\n", " # 按照 name 和 start_date 分组\n", " def select_row(group):\n", " # 如果有 end_date 不为 NaT 的行,优先保留这些行\n", " valid_rows = group[group['end_date'].notna()]\n", " if not valid_rows.empty:\n", " return valid_rows.iloc[0] # 返回第一个有效行\n", " else:\n", " return group.iloc[0] # 如果没有有效行,返回第一行\n", "\n", " filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n", " filtered_df = filtered_df.reset_index(drop=True)\n", " return filtered_df\n", "\n", "def is_st(name_change_dict, stock_code, target_date):\n", " target_date = datetime.strptime(target_date, '%Y%m%d')\n", " if stock_code not in name_change_dict.keys():\n", " return False\n", " df = name_change_dict[stock_code]\n", " for i in range(len(df)):\n", " sds = df.iloc[i, 2]\n", " eds = df.iloc[i, 3]\n", " if eds is None or eds is pd.NaT:\n", " eds = datetime.now()\n", " if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n", " return True\n", " return False\n", "\n", "name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n", "name_change_df = name_change_df.drop_duplicates(keep='first')\n", "\n", "# 确保 name_change_df 的日期格式正确\n", "name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n", "name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n", "name_change_df = name_change_df[name_change_df.name.str.contains('ST')]\n", "name_change_dict = {}\n", "for ts_code, group in name_change_df.groupby('ts_code'):\n", " # 只保留 'ST' 和 '*ST' 的记录\n", " st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n", " if not st_data.empty:\n", " name_change_dict[ts_code] = filter_rows(st_data)" ] }, { "cell_type": "code", "execution_count": 3, "id": "e7f8cce2f80e2f20", "metadata": { "ExecuteTime": { "end_time": "2025-04-06T15:33:54.089114Z", "start_time": "2025-04-06T15:33:45.576286Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 8502128 entries, 0 to 21571\n", "Data columns (total 2 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 ts_code object\n", " 1 trade_date object\n", "dtypes: object(2)\n", "memory usage: 194.6+ MB\n", "None\n", "20250403\n", "20250407\n" ] } ], "source": [ "import time\n", "from concurrent.futures import ThreadPoolExecutor, as_completed\n", "\n", "h5_filename = '../../../data/daily_basic.h5'\n", "key = '/daily_basic'\n", "max_date = None\n", "with pd.HDFStore(h5_filename, mode='r') as store:\n", " df = store[key][['ts_code', 'trade_date']]\n", " print(df.info())\n", " max_date = df['trade_date'].max()\n", "\n", "print(max_date)\n", "trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250420')\n", "trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n", "trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n", "start_date = min(trade_dates)\n", "print(start_date)" ] }, { "cell_type": "code", "execution_count": 4, "id": "553cfb36-f560-4cc4-b2bc-68323ccc5072", "metadata": { "ExecuteTime": { "end_time": "2025-04-06T15:33:57.041254Z", "start_time": "2025-04-06T15:33:54.103322Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "任务 20250417 完成\n", "任务 20250418 完成\n", "任务 20250416 完成\n", "任务 20250415 完成\n", "任务 20250414 完成\n", "任务 20250411 完成\n", "任务 20250410 完成\n", "任务 20250409 完成\n", "任务 20250408 完成\n", "任务 20250407 完成\n" ] } ], "source": [ "\n", "\n", "# 使用 HDFStore 存储数据\n", "all_daily_data = []\n", "\n", "# API 调用计数和时间控制变量\n", "api_call_count = 0\n", "batch_start_time = time.time()\n", "\n", "\n", "def get_data(trade_date):\n", " daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n", " if daily_basic_data is not None and not daily_basic_data.empty:\n", " # 添加交易日期列标识\n", " daily_basic_data['trade_date'] = trade_date\n", " daily_basic_data['is_st'] = daily_basic_data.apply(\n", " lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n", " )\n", " time.sleep(0.2)\n", " # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n", " return daily_basic_data\n", "\n", "\n", "# 遍历每个交易日期并获取数据\n", "with ThreadPoolExecutor(max_workers=2) as executor:\n", " future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n", "\n", " for future in as_completed(future_to_date):\n", " trade_date = future_to_date[future] # 获取对应的交易日期\n", " try:\n", " result = future.result() # 获取任务执行的结果\n", " all_daily_data.append(result)\n", " print(f\"任务 {trade_date} 完成\")\n", " except Exception as e:\n", " print(f\"获取 {trade_date} 数据时出错: {e}\")\n", " # 计数一次 API 调用\n", " api_call_count += 1\n", "\n", " # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n", " if api_call_count % 150 == 0:\n", " elapsed = time.time() - batch_start_time\n", " if elapsed < 60:\n", " sleep_time = 60 - elapsed\n", " print(f\"已调用 150 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n", " time.sleep(sleep_time)\n", " # 重置批次起始时间\n", " batch_start_time = time.time()\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "919023c693d7a47a", "metadata": { "ExecuteTime": { "end_time": "2025-04-06T15:33:57.072796Z", "start_time": "2025-04-06T15:33:57.061670Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ts_code trade_date close turnover_rate turnover_rate_f \\\n", "0 000059.SZ 20250407 4.54 1.8414 3.4767 \n", "1 600830.SH 20250407 8.33 2.5217 3.6802 \n", "2 688061.SH 20250407 24.45 3.1011 3.1011 \n", "3 600868.SH 20250407 2.79 3.8477 4.1435 \n", "4 605168.SH 20250407 25.98 1.3857 2.8470 \n", "... ... ... ... ... ... \n", "5386 688259.SH 20250407 34.99 5.9799 11.4393 \n", "5387 301316.SZ 20250407 19.20 7.2272 7.9512 \n", "5388 601116.SH 20250407 10.37 2.3317 7.1579 \n", "5389 605016.SH 20250407 17.20 1.4773 3.9134 \n", "5390 600148.SH 20250407 16.07 2.0776 4.5745 \n", "\n", " volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n", "0 0.84 103.2927 NaN 0.5851 0.1574 0.1928 0.3084 \n", "1 0.69 71.1750 71.1750 1.7467 11.2902 11.2902 0.1801 \n", "2 2.31 292.8121 NaN 1.1504 6.1795 4.9755 NaN \n", "3 1.16 NaN NaN 2.3425 16.8832 16.0274 0.0000 \n", "4 1.56 10.3735 14.0394 1.9988 1.0366 1.2218 4.5870 \n", "... ... ... ... ... ... ... ... \n", "5386 1.10 66.8795 64.8845 2.6173 5.9119 6.5930 NaN \n", "5387 1.30 94.0750 110.9182 7.1350 5.7094 4.8530 0.4126 \n", "5388 1.78 41.2451 36.3656 1.7811 1.4576 1.4350 1.9286 \n", "5389 1.05 28.7938 22.2858 3.3051 6.4003 4.8254 1.3640 \n", "5390 2.12 3441.4901 274.8323 4.8916 3.2666 3.3043 0.1867 \n", "\n", " dv_ttm total_share float_share free_share total_mv circ_mv \\\n", "0 0.3084 159944.2537 159944.2537 84712.3362 726146.9118 726146.9118 \n", "1 0.1801 45432.2747 45432.2747 31131.0133 378450.8483 378450.8483 \n", "2 NaN 11488.9391 4329.7770 4329.7770 280904.5610 105863.0477 \n", "3 NaN 189814.8679 189814.8679 176264.8506 529583.4814 529583.4814 \n", "4 4.5870 21081.6986 21081.6986 10260.7016 547702.5296 547702.5296 \n", "... ... ... ... ... ... ... \n", "5386 NaN 11170.0000 11170.0000 5839.1660 390838.3000 390838.3000 \n", "5387 0.4126 40400.0000 24282.6503 22071.3403 775680.0000 466226.8858 \n", "5388 1.9286 54767.8400 54767.8400 17840.9208 567942.5008 567942.5008 \n", "5389 1.3640 32308.6400 32308.6400 12196.5716 555708.6080 555708.6080 \n", "5390 0.1867 14151.6450 14151.6450 6427.3300 227416.9352 227416.9352 \n", "\n", " is_st \n", "0 False \n", "1 False \n", "2 False \n", "3 False \n", "4 False \n", "... ... \n", "5386 False \n", "5387 False \n", "5388 False \n", "5389 False \n", "5390 False \n", "\n", "[5391 rows x 19 columns]\n" ] } ], "source": [ "all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n", "print(all_daily_data_df)" ] }, { "cell_type": "code", "execution_count": 6, "id": "28cb78d032671b20", "metadata": { "ExecuteTime": { "end_time": "2025-04-06T15:33:57.104132Z", "start_time": "2025-04-06T15:33:57.095010Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ts_code trade_date close turnover_rate turnover_rate_f \\\n", "16 000656.SZ 20250407 1.28 0.9982 1.1644 \n", "62 002748.SZ 20250407 7.32 0.5503 1.1888 \n", "114 002490.SZ 20250407 3.49 0.7559 1.3380 \n", "128 300165.SZ 20250407 2.78 4.0431 4.7932 \n", "278 600303.SH 20250407 3.22 1.1873 1.4918 \n", "... ... ... ... ... ... \n", "5263 002217.SZ 20250407 2.07 0.1251 0.1569 \n", "5267 002808.SZ 20250407 2.99 4.0901 4.7924 \n", "5290 002602.SZ 20250407 6.44 0.2276 0.2634 \n", "5315 002501.SZ 20250407 1.92 1.5653 2.0207 \n", "5375 300376.SZ 20250407 2.96 1.4873 3.4865 \n", "\n", " volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n", "16 0.44 NaN NaN NaN 0.1081 0.1637 0.0000 \n", "62 0.61 96.0467 49.7297 1.3328 0.8402 0.8839 1.3661 \n", "114 0.19 NaN NaN 5.6564 2.0529 2.0529 0.0000 \n", "128 2.22 NaN NaN 0.9988 1.3542 1.4288 0.0000 \n", "278 0.77 NaN NaN 1.4997 1.6142 1.6353 0.0000 \n", "... ... ... ... ... ... ... ... \n", "5263 0.23 NaN NaN NaN 3.3436 10.3100 0.0000 \n", "5267 0.79 NaN NaN 2.5039 5.2047 4.8881 0.6689 \n", "5290 0.20 91.5846 53.4453 1.8455 3.6128 2.5226 0.0000 \n", "5315 0.58 NaN NaN 7.1559 14.2934 20.0240 0.0000 \n", "5375 4.52 12.2436 36.2242 0.9837 1.4380 2.0320 1.6554 \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "16 NaN 533971.5816 531174.3236 455354.2392 6.834836e+05 \n", "62 1.3661 24000.0000 24000.0000 11108.5000 1.756800e+05 \n", "114 NaN 79784.8400 54161.3625 30599.6625 2.784491e+05 \n", "128 NaN 49551.1725 42053.2110 35472.8422 1.377523e+05 \n", "278 NaN 68360.4211 67560.4211 53770.9211 2.201206e+05 \n", "... ... ... ... ... ... \n", "5263 NaN 747939.8928 568036.4278 453036.0995 1.548236e+06 \n", "5267 0.6689 26880.0000 18638.3713 15907.0731 8.037120e+04 \n", "5290 NaN 745255.6968 687870.8273 594244.1179 4.799447e+06 \n", "5315 NaN 355000.0000 354999.9006 274999.9006 6.816000e+05 \n", "5375 1.6554 232824.0476 232743.4901 99284.6609 6.891592e+05 \n", "\n", " circ_mv is_st \n", "16 6.799031e+05 True \n", "62 1.756800e+05 True \n", "114 1.890232e+05 True \n", "128 1.169079e+05 True \n", "278 2.175446e+05 True \n", "... ... ... \n", "5263 1.175835e+06 True \n", "5267 5.572873e+04 True \n", "5290 4.429888e+06 True \n", "5315 6.815998e+05 True \n", "5375 6.889207e+05 True \n", "\n", "[106 rows x 19 columns]\n" ] } ], "source": [ "print(all_daily_data_df[all_daily_data_df['is_st']])" ] }, { "cell_type": "code", "execution_count": 7, "id": "692b58674b7462c9", "metadata": { "ExecuteTime": { "end_time": "2025-04-06T15:33:57.927188Z", "start_time": "2025-04-06T15:33:57.127166Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "所有每日基础数据获取并保存完毕!\n" ] } ], "source": [ "# 将数据保存为 HDF5 文件(table 格式)\n", "all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n", "\n", "print(\"所有每日基础数据获取并保存完毕!\")\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "d7a773fc20293477", "metadata": { "ExecuteTime": { "end_time": "2025-04-06T15:34:06.721517Z", "start_time": "2025-04-06T15:33:57.951119Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 8507519 entries, 0 to 5390\n", "Data columns (total 3 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 ts_code object\n", " 1 trade_date object\n", " 2 is_st bool \n", "dtypes: bool(1), object(2)\n", "memory usage: 202.8+ MB\n", "None\n" ] } ], "source": [ "with pd.HDFStore(h5_filename, mode='r') as store:\n", " df = store[key][['ts_code', 'trade_date', 'is_st']]\n", " print(df.info())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 5 }