{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2", "metadata": { "ExecuteTime": { "end_time": "2025-02-11T15:43:54.745322Z", "start_time": "2025-02-11T15:43:53.837662Z" } }, "outputs": [], "source": [ "import tushare as ts\n", "ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n", "pro = ts.pro_api()" ] }, { "cell_type": "code", "execution_count": 2, "id": "14671a7f72de2564", "metadata": { "ExecuteTime": { "end_time": "2025-02-11T15:53:08.235573Z", "start_time": "2025-02-11T15:53:07.753701Z" } }, "outputs": [], "source": [ "from datetime import datetime\n", "import pandas as pd\n", "\n", "def is_st(name_change_dict, stock_code, target_date):\n", " target_date = datetime.strptime(target_date, '%Y%m%d')\n", " if stock_code not in name_change_dict.keys():\n", " return False\n", " df = name_change_dict[stock_code]\n", " for i in range(len(df)):\n", " sds = df.iloc[i, 2]\n", " eds = df.iloc[i, 3]\n", " if eds is None or eds is pd.NaT:\n", " eds = datetime.now()\n", " if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n", " return True\n", " return False\n", "\n", "name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n", "name_change_df = name_change_df.drop_duplicates(keep='first')\n", "\n", "# 确保 name_change_df 的日期格式正确\n", "name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n", "name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n", "name_change_df = name_change_df[name_change_df.name.str.contains('ST')]\n", "name_change_dict = {}\n", "for ts_code, group in name_change_df.groupby('ts_code'):\n", " # 只保留 'ST' 和 '*ST' 的记录\n", " st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n", " if not st_data.empty:\n", " name_change_dict[ts_code] = st_data" ] }, { "cell_type": "code", "execution_count": 3, "id": "e7f8cce2f80e2f20", "metadata": { "ExecuteTime": { "end_time": "2025-02-11T15:53:19.812860Z", "start_time": "2025-02-11T15:53:09.614377Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 8291970 entries, 0 to 8291969\n", "Data columns (total 2 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 ts_code object\n", " 1 trade_date object\n", "dtypes: object(2)\n", "memory usage: 189.8+ MB\n", "None\n", "20250211\n", "20250212\n" ] } ], "source": [ "import time\n", "from concurrent.futures import ThreadPoolExecutor, as_completed\n", "\n", "h5_filename = '../../../data/daily_basic.h5'\n", "key = '/daily_basic'\n", "max_date = None\n", "with pd.HDFStore(h5_filename, mode='r') as store:\n", " df = store[key][['ts_code', 'trade_date']]\n", " print(df.info())\n", " max_date = df['trade_date'].max()\n", "\n", "print(max_date)\n", "trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250220')\n", "trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n", "trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n", "start_date = min(trade_dates)\n", "print(start_date)" ] }, { "cell_type": "code", "execution_count": 4, "id": "553cfb36-f560-4cc4-b2bc-68323ccc5072", "metadata": { "ExecuteTime": { "end_time": "2025-02-11T15:53:24.100612Z", "start_time": "2025-02-11T15:53:22.361257Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "任务 20250220 完成\n", "任务 20250219 完成\n", "任务 20250217 完成\n", "任务 20250218 完成\n", "任务 20250214 完成\n", "任务 20250213 完成\n", "任务 20250212 完成\n" ] } ], "source": [ "\n", "\n", "# 使用 HDFStore 存储数据\n", "all_daily_data = []\n", "\n", "# API 调用计数和时间控制变量\n", "api_call_count = 0\n", "batch_start_time = time.time()\n", "\n", "\n", "def get_data(trade_date):\n", " daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n", " if daily_basic_data is not None and not daily_basic_data.empty:\n", " # 添加交易日期列标识\n", " daily_basic_data['trade_date'] = trade_date\n", " daily_basic_data['is_st'] = daily_basic_data.apply(\n", " lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n", " )\n", " time.sleep(0.2)\n", " # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n", " return daily_basic_data\n", "\n", "\n", "# 遍历每个交易日期并获取数据\n", "with ThreadPoolExecutor(max_workers=2) as executor:\n", " future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n", "\n", " for future in as_completed(future_to_date):\n", " trade_date = future_to_date[future] # 获取对应的交易日期\n", " try:\n", " result = future.result() # 获取任务执行的结果\n", " all_daily_data.append(result)\n", " print(f\"任务 {trade_date} 完成\")\n", " except Exception as e:\n", " print(f\"获取 {trade_date} 数据时出错: {e}\")\n", " # 计数一次 API 调用\n", " api_call_count += 1\n", "\n", " # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n", " if api_call_count % 150 == 0:\n", " elapsed = time.time() - batch_start_time\n", " if elapsed < 60:\n", " sleep_time = 60 - elapsed\n", " print(f\"已调用 150 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n", " time.sleep(sleep_time)\n", " # 重置批次起始时间\n", " batch_start_time = time.time()\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "919023c693d7a47a", "metadata": { "ExecuteTime": { "end_time": "2025-02-11T15:53:25.913933Z", "start_time": "2025-02-11T15:53:25.902629Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ts_code trade_date close turnover_rate turnover_rate_f \\\n", "0 601162.SH 20250212 4.77 7.3760 9.7054 \n", "1 603216.SH 20250212 11.42 8.8711 8.8711 \n", "2 872808.BJ 20250212 74.36 4.1219 15.3296 \n", "3 601881.SH 20250212 14.43 0.5617 1.9533 \n", "4 002837.SZ 20250212 42.25 3.8199 5.7136 \n", "... ... ... ... ... ... \n", "5380 603931.SH 20250212 23.83 1.4692 4.6843 \n", "5381 688567.SH 20250212 12.35 1.3091 2.1970 \n", "5382 688530.SH 20250212 19.30 6.6093 6.6093 \n", "5383 301363.SZ 20250212 31.99 2.1990 2.1990 \n", "5384 833533.BJ 20250212 46.02 27.7269 27.7597 \n", "\n", " volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n", "0 2.00 134.5633 NaN 1.7935 12.0634 19.0461 0.0000 \n", "1 2.09 26.5657 27.5224 1.4454 1.9304 1.9996 2.6270 \n", "2 1.20 142.3485 196.0315 22.9124 22.8711 25.8281 NaN \n", "3 0.84 20.0264 15.5707 1.4245 4.6898 4.4609 2.1067 \n", "4 0.65 91.3544 64.5935 11.2259 8.9056 7.2600 0.3621 \n", "... ... ... ... ... ... ... ... \n", "5380 1.16 27.1631 29.0662 3.0982 6.8392 6.9124 1.1120 \n", "5381 1.01 NaN NaN 1.4955 0.9183 1.0469 NaN \n", "5382 0.99 62.5995 198.4906 3.6879 6.4857 7.9319 NaN \n", "5383 0.98 41.5226 47.9900 3.8396 9.7258 8.9664 0.4982 \n", "5384 0.84 52.3997 62.1858 13.3582 6.6261 5.9638 NaN \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "0 NaN 8.665757e+05 866575.7464 658594.7570 4.133566e+06 \n", "1 2.6270 2.226900e+04 5669.0000 5669.0000 2.543120e+05 \n", "2 NaN 2.000000e+04 19461.9464 5233.0650 1.487200e+06 \n", "3 2.1067 1.093440e+06 724341.7623 208280.6759 1.577834e+07 \n", "4 0.3621 7.438227e+04 64662.2002 43230.4691 3.142651e+06 \n", "... ... ... ... ... ... \n", "5380 1.1120 1.995584e+04 19955.8380 6258.8392 4.755476e+05 \n", "5381 NaN 1.222104e+05 122210.3885 72818.9706 1.509298e+06 \n", "5382 NaN 1.600448e+04 3200.8966 3200.8966 3.088865e+05 \n", "5383 0.4982 4.066600e+04 11215.9100 11215.9100 1.300905e+06 \n", "5384 NaN 1.005826e+04 3796.0235 3791.5280 4.628809e+05 \n", "\n", " circ_mv is_st \n", "0 4.133566e+06 False \n", "1 6.473998e+04 False \n", "2 1.447190e+06 False \n", "3 1.045225e+07 False \n", "4 2.731978e+06 False \n", "... ... ... \n", "5380 4.755476e+05 False \n", "5381 1.509298e+06 False \n", "5382 6.177730e+04 False \n", "5383 3.587970e+05 False \n", "5384 1.746930e+05 False \n", "\n", "[5385 rows x 19 columns]\n" ] } ], "source": [ "all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n", "print(all_daily_data_df)" ] }, { "cell_type": "code", "execution_count": 6, "id": "28cb78d032671b20", "metadata": { "ExecuteTime": { "end_time": "2025-02-11T15:53:42.062142Z", "start_time": "2025-02-11T15:53:42.044324Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ts_code trade_date close turnover_rate turnover_rate_f \\\n", "10 002366.SZ 20250212 5.10 3.8029 4.1742 \n", "48 002124.SZ 20250212 2.80 1.8388 1.9195 \n", "57 000504.SZ 20250212 9.32 0.9666 1.5370 \n", "63 603007.SH 20250212 10.03 2.0477 2.7581 \n", "91 300201.SZ 20250212 5.33 2.3317 3.1604 \n", "... ... ... ... ... ... \n", "5303 002316.SZ 20250212 3.52 3.1023 3.3580 \n", "5335 600568.SH 20250212 1.30 0.3996 0.6514 \n", "5364 002168.SZ 20250212 2.48 0.8869 1.0824 \n", "5367 300600.SZ 20250212 7.19 0.7517 1.4024 \n", "5369 000972.SZ 20250212 3.38 4.6979 7.2993 \n", "\n", " volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n", "10 0.92 52.0324 56.8856 2.2889 14.2486 11.9214 0.0000 \n", "48 0.97 NaN 260.7218 1.7484 0.6080 0.6154 0.0000 \n", "57 0.83 NaN NaN 12.3702 22.4855 24.7156 0.0000 \n", "63 0.86 NaN NaN 24.6750 55.2244 76.4853 0.0000 \n", "91 0.75 26.1255 26.1088 4.2311 3.9774 4.2028 0.6431 \n", "... ... ... ... ... ... ... ... \n", "5303 0.95 NaN NaN 19.4146 2.2930 2.3153 0.0000 \n", "5335 0.76 NaN NaN 1.1378 4.0571 4.0379 0.0000 \n", "5364 0.88 1024.9794 NaN NaN 7.6515 7.4299 0.0000 \n", "5367 1.18 NaN NaN 2.2914 10.7845 8.9952 0.0000 \n", "5369 0.77 24.0853 120.2360 16.2931 4.5277 4.9137 0.0000 \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "10 NaN 208093.7640 125646.4390 114472.2056 1.061278e+06 \n", "48 NaN 222193.3832 197428.3498 189130.4452 6.221415e+05 \n", "57 NaN 33002.3098 31066.8701 19536.7046 3.075815e+05 \n", "63 NaN 87689.6101 49983.0778 37108.5778 8.795268e+05 \n", "91 0.6431 100904.3607 100450.7422 74110.3317 5.378202e+05 \n", "... ... ... ... ... ... \n", "5303 NaN 39312.0000 31500.7500 29101.6694 1.383782e+05 \n", "5335 NaN 199286.9681 166906.7279 102374.4773 2.590731e+05 \n", "5364 NaN 78416.3368 78416.3368 64258.0991 1.944725e+05 \n", "5367 NaN 29423.4480 24616.3436 13195.4382 2.115546e+05 \n", "5369 NaN 77128.3579 77128.3579 49641.0760 2.606938e+05 \n", "\n", " circ_mv is_st \n", "10 640796.8389 True \n", "48 552799.3794 True \n", "57 289543.2293 True \n", "63 501330.2703 True \n", "91 535402.4559 True \n", "... ... ... \n", "5303 110882.6400 True \n", "5335 216978.7463 True \n", "5364 194472.5153 True \n", "5367 176991.5105 True \n", "5369 260693.8497 True \n", "\n", "[318 rows x 19 columns]\n" ] } ], "source": [ "print(all_daily_data_df[all_daily_data_df['is_st']])" ] }, { "cell_type": "code", "execution_count": 7, "id": "692b58674b7462c9", "metadata": { "ExecuteTime": { "end_time": "2025-02-11T15:53:33.693894Z", "start_time": "2025-02-11T15:53:33.609884Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "所有每日基础数据获取并保存完毕!\n" ] } ], "source": [ "# 将数据保存为 HDF5 文件(table 格式)\n", "all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n", "\n", "print(\"所有每日基础数据获取并保存完毕!\")\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "d7a773fc20293477", "metadata": { "ExecuteTime": { "end_time": "2025-02-11T15:54:27.868021Z", "start_time": "2025-02-11T15:54:18.853803Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 8297355 entries, 0 to 5384\n", "Data columns (total 3 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 ts_code object\n", " 1 trade_date object\n", " 2 is_st bool \n", "dtypes: bool(1), object(2)\n", "memory usage: 197.8+ MB\n", "None\n" ] } ], "source": [ "with pd.HDFStore(h5_filename, mode='r') as store:\n", " df = store[key][['ts_code', 'trade_date', 'is_st']]\n", " print(df.info())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.19" } }, "nbformat": 4, "nbformat_minor": 5 }