{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:57:36.913044Z", "start_time": "2025-04-09T14:57:36.159612Z" } }, "outputs": [], "source": [ "import tushare as ts\n", "ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n", "pro = ts.pro_api()" ] }, { "cell_type": "code", "execution_count": 2, "id": "14671a7f72de2564", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:57:39.128278Z", "start_time": "2025-04-09T14:57:36.918051Z" } }, "outputs": [], "source": [ "from datetime import datetime\n", "import pandas as pd\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "def filter_rows(df):\n", " # 按照 name 和 start_date 分组\n", " def select_row(group):\n", " # 如果有 end_date 不为 NaT 的行,优先保留这些行\n", " valid_rows = group[group['end_date'].notna()]\n", " if not valid_rows.empty:\n", " return valid_rows.iloc[0] # 返回第一个有效行\n", " else:\n", " return group.iloc[0] # 如果没有有效行,返回第一行\n", "\n", " filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n", " filtered_df = filtered_df.reset_index(drop=True)\n", " return filtered_df\n", "\n", "def is_st(name_change_dict, stock_code, target_date):\n", " target_date = datetime.strptime(target_date, '%Y%m%d')\n", " if stock_code not in name_change_dict.keys():\n", " return False\n", " df = name_change_dict[stock_code]\n", " for i in range(len(df)):\n", " sds = df.iloc[i, 2]\n", " eds = df.iloc[i, 3]\n", " if eds is None or eds is pd.NaT:\n", " eds = datetime.now()\n", " if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n", " return True\n", " return False\n", "\n", "name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n", "name_change_df = name_change_df.drop_duplicates(keep='first')\n", "\n", "# 确保 name_change_df 的日期格式正确\n", "name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n", "name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n", "# name_change_df = name_change_df[name_change_df.name.str.contains('ST') ]\n", "name_change_dict = {}\n", "for ts_code, group in name_change_df.groupby('ts_code'):\n", " # 只保留 'ST' 和 '*ST' 的记录\n", " # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n", " st_data = group[(group['name'].str.contains('ST')) | (group['name'].str.contains('退'))]\n", " if not st_data.empty:\n", " name_change_dict[ts_code] = filter_rows(st_data)" ] }, { "cell_type": "code", "execution_count": 3, "id": "e7f8cce2f80e2f20", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:09.296046Z", "start_time": "2025-04-09T14:57:39.339423Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 8615301 entries, 0 to 5388\n", "Data columns (total 2 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 ts_code object\n", " 1 trade_date object\n", "dtypes: object(2)\n", "memory usage: 197.2+ MB\n", "None\n", "20250508\n", "20250509\n" ] } ], "source": [ "import time\n", "from concurrent.futures import ThreadPoolExecutor, as_completed\n", "\n", "h5_filename = '../../../data/daily_basic.h5'\n", "key = '/daily_basic'\n", "max_date = None\n", "with pd.HDFStore(h5_filename, mode='r') as store:\n", " df = store[key][['ts_code', 'trade_date']]\n", " print(df.info())\n", " max_date = df['trade_date'].max()\n", "\n", "print(max_date)\n", "trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250720')\n", "trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n", "trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n", "start_date = min(trade_dates)\n", "print(start_date)" ] }, { "cell_type": "code", "execution_count": 4, "id": "553cfb36-f560-4cc4-b2bc-68323ccc5072", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:16.817010Z", "start_time": "2025-04-09T14:58:09.326485Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "任务 20250718 完成\n", "任务 20250717 完成\n", "任务 20250716 完成\n", "任务 20250715 完成\n", "任务 20250711 完成\n", "任务 20250714 完成\n", "任务 20250710 完成\n", "任务 20250709 完成\n", "任务 20250708 完成\n", "任务 20250707 完成\n", "任务 20250704 完成\n", "任务 20250703 完成\n", "任务 20250702 完成\n", "任务 20250701 完成\n", "任务 20250630 完成\n", "任务 20250627 完成\n", "任务 20250626 完成\n", "任务 20250625 完成\n", "任务 20250624 完成\n", "任务 20250623 完成\n", "任务 20250620 完成\n", "任务 20250619 完成\n", "任务 20250618 完成\n", "任务 20250617 完成\n", "任务 20250616 完成\n", "任务 20250613 完成\n", "任务 20250612 完成\n", "任务 20250611 完成\n", "任务 20250610 完成\n", "任务 20250609 完成\n", "任务 20250606 完成\n", "任务 20250605 完成\n", "任务 20250604 完成\n", "任务 20250603 完成\n", "任务 20250530 完成\n", "任务 20250529 完成\n", "任务 20250528 完成\n", "任务 20250527 完成\n", "任务 20250526 完成\n", "任务 20250523 完成\n", "任务 20250522 完成\n", "任务 20250521 完成\n", "任务 20250520 完成\n", "任务 20250519 完成\n", "任务 20250516 完成\n", "任务 20250515 完成\n", "任务 20250514 完成\n", "任务 20250513 完成\n", "任务 20250512 完成\n", "任务 20250509 完成\n" ] } ], "source": [ "\n", "\n", "# 使用 HDFStore 存储数据\n", "all_daily_data = []\n", "\n", "# API 调用计数和时间控制变量\n", "api_call_count = 0\n", "batch_start_time = time.time()\n", "\n", "\n", "def get_data(trade_date):\n", " daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n", " if daily_basic_data is not None and not daily_basic_data.empty:\n", " # 添加交易日期列标识\n", " daily_basic_data['trade_date'] = trade_date\n", " daily_basic_data['is_st'] = daily_basic_data.apply(\n", " lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n", " )\n", " time.sleep(0.2)\n", " # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n", " return daily_basic_data\n", "\n", "\n", "# 遍历每个交易日期并获取数据\n", "with ThreadPoolExecutor(max_workers=2) as executor:\n", " future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n", "\n", " for future in as_completed(future_to_date):\n", " trade_date = future_to_date[future] # 获取对应的交易日期\n", " try:\n", " result = future.result() # 获取任务执行的结果\n", " all_daily_data.append(result)\n", " print(f\"任务 {trade_date} 完成\")\n", " except Exception as e:\n", " print(f\"获取 {trade_date} 数据时出错: {e}\")\n", " # 计数一次 API 调用\n", " api_call_count += 1\n", "\n", " # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n", " if api_call_count % 150 == 0:\n", " elapsed = time.time() - batch_start_time\n", " if elapsed < 60:\n", " sleep_time = 60 - elapsed\n", " print(f\"已调用 150 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n", " time.sleep(sleep_time)\n", " # 重置批次起始时间\n", " batch_start_time = time.time()\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "919023c693d7a47a", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:16.864178Z", "start_time": "2025-04-09T14:58:16.855084Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ts_code trade_date close turnover_rate turnover_rate_f \\\n", "0 300575.SZ 20250509 6.05 1.9284 2.1880 \n", "1 300247.SZ 20250509 3.77 2.1735 2.5437 \n", "2 603038.SH 20250509 15.80 17.5702 32.3972 \n", "3 002030.SZ 20250509 5.82 0.8252 1.2070 \n", "4 600157.SH 20250509 1.36 0.8369 1.0222 \n", "... ... ... ... ... ... \n", "5384 600841.SH 20250509 5.57 1.0271 3.2670 \n", "5385 300968.SZ 20250509 14.76 1.2857 2.7636 \n", "5386 300634.SZ 20250509 25.79 5.2551 9.4581 \n", "5387 300295.SZ 20250509 15.73 3.0347 3.2458 \n", "5388 688370.SH 20250509 19.15 1.2008 1.2008 \n", "\n", " volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n", "0 0.71 239.8914 NaN 1.3451 1.1608 1.1259 1.9835 \n", "1 0.96 64.6952 53.1680 2.7649 4.4008 3.9673 0.0000 \n", "2 4.47 183.7603 154.4297 3.1047 4.0259 3.7692 0.2434 \n", "3 0.62 NaN NaN 1.0296 9.5754 9.9145 0.2577 \n", "4 0.55 19.3625 26.3896 0.6394 1.0656 1.1327 0.4044 \n", "... ... ... ... ... ... ... ... \n", "5384 0.77 NaN NaN 2.3362 1.1952 1.2860 0.0000 \n", "5385 0.71 115.0812 181.8721 3.2254 4.9990 5.1146 0.3388 \n", "5386 1.01 50.5639 52.9222 4.1166 7.0433 6.7806 0.8063 \n", "5387 0.65 NaN NaN 2.6398 24.2982 28.1758 0.0000 \n", "5388 1.25 29.1668 36.1111 0.9812 4.4106 4.4983 NaN \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "0 1.9835 4.647564e+04 3.427082e+04 3.020469e+04 2.811776e+05 \n", "1 NaN 8.040403e+04 8.032753e+04 6.863630e+04 3.031232e+05 \n", "2 0.2434 2.686771e+04 2.686771e+04 1.457134e+04 4.245098e+05 \n", "3 0.2577 1.403446e+05 1.403446e+05 9.595371e+04 8.168056e+05 \n", "4 0.4044 2.221776e+06 2.221776e+06 1.819047e+06 3.021616e+06 \n", "... ... ... ... ... ... \n", "5384 NaN 1.387822e+05 1.043024e+05 3.279094e+04 7.730167e+05 \n", "5385 0.3388 4.133800e+04 4.133800e+04 1.923185e+04 6.101489e+05 \n", "5386 0.8063 4.512109e+04 4.346809e+04 2.415175e+04 1.163673e+06 \n", "5387 NaN 1.896137e+04 1.675486e+04 1.566518e+04 2.982624e+05 \n", "5388 NaN 1.371079e+04 4.374912e+03 4.374912e+03 2.625616e+05 \n", "\n", " circ_mv is_st \n", "0 2.073385e+05 False \n", "1 3.028348e+05 False \n", "2 4.245098e+05 False \n", "3 8.168056e+05 False \n", "4 3.021616e+06 False \n", "... ... ... \n", "5384 5.809646e+05 False \n", "5385 6.101489e+05 False \n", "5386 1.121042e+06 False \n", "5387 2.635540e+05 False \n", "5388 8.377956e+04 False \n", "\n", "[5389 rows x 19 columns]\n" ] } ], "source": [ "all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n", "print(all_daily_data_df)" ] }, { "cell_type": "code", "execution_count": 6, "id": "28cb78d032671b20", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:16.881685Z", "start_time": "2025-04-09T14:58:16.871184Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ts_code trade_date close turnover_rate turnover_rate_f \\\n", "54 002496.SZ 20250509 1.43 3.1262 3.2341 \n", "148 603828.SH 20250509 5.04 3.5674 7.1692 \n", "166 600599.SH 20250509 7.70 10.8623 27.2882 \n", "193 000820.SZ 20250509 2.16 5.5698 5.7239 \n", "203 300506.SZ 20250509 3.28 0.6710 0.9449 \n", "... ... ... ... ... ... \n", "5204 002602.SZ 20250509 8.00 1.3867 1.7044 \n", "5253 300147.SZ 20250509 7.37 7.2159 9.3379 \n", "5264 002501.SZ 20250509 2.08 2.4301 3.1371 \n", "5317 600421.SH 20250509 5.27 2.7391 5.8971 \n", "5345 600289.SH 20250509 5.78 1.3847 2.0115 \n", "\n", " volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n", "54 0.73 NaN NaN 1.6044 7.6992 7.2633 0.0 \n", "148 1.65 349.9490 1691.0271 3.9734 1.2211 1.3170 0.0 \n", "166 4.51 NaN NaN 11.5933 3.9468 4.0472 0.0 \n", "193 1.00 NaN NaN 9.5443 11.2714 14.3393 0.0 \n", "203 0.87 NaN NaN 28.5909 19.5183 19.3088 0.0 \n", "... ... ... ... ... ... ... ... \n", "5204 0.78 49.1432 31.1887 2.2169 2.6358 2.2496 0.0 \n", "5253 1.74 NaN NaN 5.0393 2.6221 2.8487 0.0 \n", "5264 0.87 NaN NaN 22.5816 22.1370 26.0255 0.0 \n", "5317 0.74 NaN NaN 143.1934 8.7976 8.9449 0.0 \n", "5345 0.55 NaN NaN 2.9752 11.3890 11.6628 0.0 \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "54 NaN 150758.9677 118138.6559 114196.4999 2.155853e+05 \n", "148 NaN 59596.0158 59593.9625 29654.2988 3.003639e+05 \n", "166 NaN 16600.0000 16600.0000 6607.7948 1.278200e+05 \n", "193 NaN 64362.0201 29403.1899 28611.4718 1.390220e+05 \n", "203 NaN 69559.6569 57572.5450 40880.9749 2.281557e+05 \n", "... ... ... ... ... ... \n", "5204 NaN 745255.6968 687870.8273 559649.7754 5.962046e+06 \n", "5253 NaN 66127.9045 65745.9042 50804.9121 4.873627e+05 \n", "5264 NaN 355000.0000 354999.9006 274999.9006 7.384000e+05 \n", "5317 NaN 19560.0000 19560.0000 9085.2748 1.030812e+05 \n", "5345 NaN 63105.2069 56592.2684 38956.2787 3.647481e+05 \n", "\n", " circ_mv is_st \n", "54 1.689383e+05 True \n", "148 3.003536e+05 True \n", "166 1.278200e+05 True \n", "193 6.351089e+04 True \n", "203 1.888379e+05 True \n", "... ... ... \n", "5204 5.502967e+06 True \n", "5253 4.845473e+05 True \n", "5264 7.383998e+05 True \n", "5317 1.030812e+05 True \n", "5345 3.271033e+05 True \n", "\n", "[197 rows x 19 columns]\n" ] } ], "source": [ "print(all_daily_data_df[all_daily_data_df['is_st']])" ] }, { "cell_type": "code", "execution_count": 7, "id": "692b58674b7462c9", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:17.773453Z", "start_time": "2025-04-09T14:58:16.903459Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "所有每日基础数据获取并保存完毕!\n" ] } ], "source": [ "# 将数据保存为 HDF5 文件(table 格式)\n", "all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n", "\n", "print(\"所有每日基础数据获取并保存完毕!\")\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "d7a773fc20293477", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:24.305403Z", "start_time": "2025-04-09T14:58:17.816332Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 8620690 entries, 0 to 5388\n", "Data columns (total 3 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 ts_code object\n", " 1 trade_date object\n", " 2 is_st bool \n", "dtypes: bool(1), object(2)\n", "memory usage: 205.5+ MB\n", "None\n" ] } ], "source": [ "with pd.HDFStore(h5_filename, mode='r') as store:\n", " df = store[key][['ts_code', 'trade_date', 'is_st']]\n", " print(df.info())" ] } ], "metadata": { "kernelspec": { "display_name": "new_trader", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 5 }