# --- Imports (shared by every cell in this notebook) ------------------------
import os
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import pandas as pd
import tushare as ts

# --- Configuration ----------------------------------------------------------
# The Tushare token is a credential and must not be committed with the
# notebook; it is read from the environment instead. Any token that was
# previously stored in this file should be rotated.
TUSHARE_TOKEN = os.environ.get('TUSHARE_TOKEN')
if not TUSHARE_TOKEN:
    raise RuntimeError('Set the TUSHARE_TOKEN environment variable before running this notebook.')
ts.set_token(TUSHARE_TOKEN)
pro = ts.pro_api()

# Directory holding the HDF5 files. The default is the original location; set
# NEWSTOCK_DATA_DIR to run the notebook on another machine.
DATA_DIR = os.environ.get('NEWSTOCK_DATA_DIR', '/mnt/d/PyProject/NewStock/data')

# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas deprecation warnings. Kept to preserve the notebook's
# existing output; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


# --- ST (special treatment) helpers -----------------------------------------
def filter_rows(df):
    """Return one row per ``(name, start_date)`` pair.

    When several rows share the same ``name`` and ``start_date``, the first
    row (in input order) whose ``end_date`` is not missing is kept; if every
    row of the pair has a missing ``end_date``, the first row is kept.

    Rows whose ``name`` or ``start_date`` is missing are dropped, which is what
    the previous ``groupby``-based implementation did implicitly. The result
    is sorted by ``name`` then ``start_date``, keeps all input columns and has
    a fresh ``RangeIndex``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``name``, ``start_date`` and ``end_date``.

    Returns
    -------
    pandas.DataFrame
    """
    key_cols = ['name', 'start_date']
    keyed = df.dropna(subset=key_cols)
    # Stable argsort on the "end_date is missing" flag moves rows with a known
    # end date to the front while preserving input order inside each half, so
    # keep='first' below selects exactly the preferred row of every pair.
    known_first = keyed.iloc[keyed['end_date'].isna().to_numpy().argsort(kind='stable')]
    deduped = known_first.drop_duplicates(subset=key_cols, keep='first')
    return deduped.sort_values(key_cols).reset_index(drop=True)


def is_st(name_change_dict, stock_code, target_date):
    """Tell whether ``stock_code`` falls inside a recorded name period on ``target_date``.

    Parameters
    ----------
    name_change_dict : dict
        Maps a stock code to a DataFrame with ``start_date`` and ``end_date``
        columns (one row per period), as built further down in this cell.
    stock_code : str
        Key to look up in ``name_change_dict``.
    target_date : str
        Date formatted as ``YYYYMMDD``.

    Returns
    -------
    bool
        ``True`` if ``start_date <= target_date <= end_date`` for any row.
        A missing ``end_date`` is treated as "still in effect now". ``False``
        if the code has no entry.

    Raises
    ------
    ValueError
        If ``target_date`` does not match ``%Y%m%d``.
    """
    target = datetime.strptime(target_date, '%Y%m%d')
    periods = name_change_dict.get(stock_code)
    if periods is None:
        return False

    now = datetime.now()
    # Columns are addressed by name rather than by position so the check does
    # not depend on the column order of the stored name-change table.
    for period_start, period_end in zip(periods['start_date'], periods['end_date']):
        if pd.isna(period_start):
            continue
        if pd.isna(period_end):
            period_end = now
        if period_start <= target <= period_end:
            return True
    return False


# --- Load the name-change history and index the ST periods by stock ---------
name_change_raw = pd.read_hdf(os.path.join(DATA_DIR, 'name_change.h5'), key='name_change')
name_change_df = name_change_raw.drop_duplicates(keep='first').assign(
    # start_date must parse; end_date is coerced so open-ended periods become NaT.
    start_date=lambda frame: pd.to_datetime(frame['start_date'], format='%Y%m%d'),
    end_date=lambda frame: pd.to_datetime(frame['end_date'], format='%Y%m%d', errors='coerce'),
)

# Keep only the periods whose name contains 'ST' or '退'; stocks without any
# such period get no entry in the dictionary.
st_name_mask = name_change_df['name'].str.contains('ST|退', na=False)
name_change_dict = {
    ts_code: filter_rows(st_rows)
    for ts_code, st_rows in name_change_df[st_name_mask].groupby('ts_code')
}

# --- Work out which trade dates still have to be downloaded -----------------
h5_filename = os.path.join(DATA_DIR, 'daily_basic.h5')
key = '/daily_basic'

with pd.HDFStore(h5_filename, mode='r') as store:
    # Only the two identifying columns are needed here; selecting them avoids
    # materialising every column of the stored table.
    stored_keys = store.select(key, columns=['ts_code', 'trade_date'])

stored_keys.info()
max_date = stored_keys['trade_date'].max()
print(max_date)

# Ask for the calendar up to today: a fixed end date silently goes stale, and
# dates in the future cannot have data yet.
calendar_end = datetime.now().strftime('%Y%m%d')
trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date=calendar_end)
trade_cal = trade_cal[trade_cal['is_open'] == 1]  # trading days only

# Dates are 'YYYYMMDD' strings, so string comparison orders them correctly.
trade_dates = sorted(trade_cal.loc[trade_cal['cal_date'] > max_date, 'cal_date'])
start_date = trade_dates[0] if trade_dates else None  # None: store is already up to date
print(start_date)
import threading

# Upper bound on Tushare requests per minute.
# NOTE(review): the previous code used 150 while its comment said 300 —
# confirm the real quota of the account and adjust this constant.
MAX_CALLS_PER_MINUTE = 150

_min_call_interval = 60.0 / MAX_CALLS_PER_MINUTE
_rate_lock = threading.Lock()
_next_call_time = time.monotonic()


def _wait_for_api_slot():
    """Block the calling thread until it may issue the next API request.

    Requests from all worker threads are spaced at least
    ``60 / MAX_CALLS_PER_MINUTE`` seconds apart. The slot is reserved while
    holding the lock and the sleep happens outside it, so waiting threads do
    not serialise on the lock itself.
    """
    global _next_call_time
    with _rate_lock:
        now = time.monotonic()
        slot = max(now, _next_call_time)
        _next_call_time = slot + _min_call_interval
    delay = slot - now
    if delay > 0:
        time.sleep(delay)


def get_data(trade_date):
    """Download the ``daily_basic`` table for one trade date and flag ST stocks.

    Parameters
    ----------
    trade_date : str
        Trade date formatted as ``YYYYMMDD``.

    Returns
    -------
    pandas.DataFrame or None
        Whatever ``pro.daily_basic`` returned. When it is non-empty, the
        ``trade_date`` column is overwritten with ``trade_date`` and a boolean
        ``is_st`` column is added.
    """
    # Throttle here, in the worker, because this is where the request is made.
    _wait_for_api_slot()
    daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)
    if daily_basic_data is not None and not daily_basic_data.empty:
        daily_basic_data['trade_date'] = trade_date
        # Every row shares the same date, so the flag depends on ts_code only.
        daily_basic_data['is_st'] = daily_basic_data['ts_code'].map(
            lambda code: is_st(name_change_dict, code, trade_date)
        )
    return daily_basic_data


# Fetch every pending trade date with two worker threads.
fetched_by_date = {}
failed_dates = []

with ThreadPoolExecutor(max_workers=2) as executor:
    future_to_date = {executor.submit(get_data, td): td for td in trade_dates}

    for future in as_completed(future_to_date):
        trade_date = future_to_date[future]
        try:
            result = future.result()
        except Exception as e:
            failed_dates.append(trade_date)
            print(f"获取 {trade_date} 数据时出错: {e}")
            continue
        # Dates for which the API has no rows yet are reported as done but
        # contribute nothing to the data that will be saved.
        if result is not None and not result.empty:
            fetched_by_date[trade_date] = result
        print(f"任务 {trade_date} 完成")

if failed_dates:
    # The next run only downloads dates later than the newest stored date, so
    # saving anything after a failed date would leave a permanent hole. Keep
    # just the contiguous run of dates before the first failure.
    first_failed = min(failed_dates)
    discarded = sorted(d for d in fetched_by_date if d > first_failed)
    for d in discarded:
        del fetched_by_date[d]
    print(
        f"以下日期获取失败: {sorted(failed_dates)};为避免数据缺口,"
        f"已丢弃 {first_failed} 之后的 {len(discarded)} 个交易日数据,请重新运行以补齐。"
    )

# Chronological order, independent of the order in which the threads finished.
all_daily_data = [fetched_by_date[d] for d in sorted(fetched_by_date)]
\n", "5439 1.37 NaN NaN 3.3110 8.8659 8.4702 0.0000 \n", "5440 0.86 28.1501 33.3780 3.4547 4.1124 3.7273 0.7056 \n", "5441 1.53 25.7012 28.5474 1.6912 4.1924 3.9403 1.8829 \n", "5442 1.23 25.2677 30.2644 1.7649 3.0372 3.0683 3.8598 \n", "5443 0.61 NaN NaN 35.8962 3.8438 6.1411 0.0000 \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "0 1.5410 331535.8444 331454.4214 120110.9588 3.859077e+06 \n", "1 0.9187 13748.6115 11941.3915 4148.6777 3.827613e+05 \n", "2 0.8961 8421.7803 7749.4689 2619.7738 2.737079e+05 \n", "3 0.4117 61006.5893 61006.5893 14046.4993 1.115811e+06 \n", "4 3.2381 233540.7014 233540.7014 106564.7107 1.226089e+06 \n", "... ... ... ... ... ... \n", "5439 NaN 43885.0000 43885.0000 36485.0000 2.097703e+05 \n", "5440 0.7045 177819.5525 141938.4613 90967.4278 5.048297e+06 \n", "5441 1.5495 72937.9440 51330.0000 21670.4250 8.096112e+05 \n", "5442 1.2636 20335.5564 20335.5564 10429.5044 4.850030e+05 \n", "5443 NaN 119867.5082 105021.9577 49219.1551 3.631985e+05 \n", "\n", " circ_mv is_st \n", "0 3.858129e+06 False \n", "1 3.324483e+05 False \n", "2 2.518577e+05 False \n", "3 1.115811e+06 False \n", "4 1.226089e+06 False \n", "... ... ... 
# Combine the per-date frames into one table.
# Placeholders for dates without data (None or empty frames) are removed
# first: pandas silently skips None, and concatenating empty frames relies on
# deprecated dtype handling.
non_empty_frames = [
    frame for frame in all_daily_data
    if frame is not None and not frame.empty
]
if not non_empty_frames:
    # Same exception type pd.concat raises for an empty list, but with a
    # message that says what actually happened.
    raise ValueError("No daily_basic rows were fetched; there is nothing to concatenate or save.")

all_daily_data_df = pd.concat(non_empty_frames, ignore_index=True)

# Show the size and a small sample instead of dumping the whole table.
print(all_daily_data_df.shape)
all_daily_data_df.head()
\n", "5340 0.94 NaN NaN 42.1875 42.9123 57.8502 0.0 \n", "5381 1.00 NaN NaN 1.0776 5.2649 21.5375 0.0 \n", "5383 0.78 NaN 239.4225 16.7572 32.2021 20.7023 0.0 \n", "5439 1.37 NaN NaN 3.3110 8.8659 8.4702 0.0 \n", "5443 0.61 NaN NaN 35.8962 3.8438 6.1411 0.0 \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "55 NaN 43771.4245 43771.0570 25634.2299 2.464331e+05 \n", "62 NaN 54400.0000 54400.0000 13377.7333 2.507840e+05 \n", "134 NaN 43000.0000 43000.0000 36039.3251 3.143300e+05 \n", "154 NaN 106896.9119 106621.9389 93649.7579 5.857951e+05 \n", "166 NaN 131878.0152 131878.0152 91981.1744 4.655294e+05 \n", "... ... ... ... ... ... \n", "5340 NaN 52894.3475 52894.3475 37030.2475 7.860100e+05 \n", "5381 NaN 79467.7974 76663.9584 68475.6577 2.884681e+05 \n", "5383 NaN 92901.7761 92858.4361 62210.1427 1.073016e+06 \n", "5439 NaN 43885.0000 43885.0000 36485.0000 2.097703e+05 \n", "5443 NaN 119867.5082 105021.9577 49219.1551 3.631985e+05 \n", "\n", " circ_mv is_st \n", "55 2.464311e+05 True \n", "62 2.507840e+05 True \n", "134 3.143300e+05 True \n", "154 5.842882e+05 True \n", "166 4.655294e+05 True \n", "... ... ... 
# --- Inspect the rows flagged as ST -----------------------------------------
st_rows = all_daily_data_df[all_daily_data_df['is_st']]
print(st_rows.shape)
print(st_rows.head())

# --- Append the new rows to the HDF5 store (table format) -------------------
# Appending is not idempotent by itself: running this cell twice would store
# every row twice. Only rows newer than the newest stored trade date are
# written, which mirrors how the pending dates were selected in the first place.
with pd.HDFStore(h5_filename, mode='r') as store:
    stored_max_date = store.select(key, columns=['trade_date'])['trade_date'].max()

if pd.isna(stored_max_date):
    # Stored table has no rows yet: everything is new.
    rows_to_append = all_daily_data_df
else:
    # 'YYYYMMDD' strings compare in chronological order.
    rows_to_append = all_daily_data_df[all_daily_data_df['trade_date'] > stored_max_date]

if rows_to_append.empty:
    print(f"No rows newer than {stored_max_date}; nothing appended.")
else:
    rows_to_append.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)

print("所有每日基础数据获取并保存完毕!")

# --- Verify what is now stored -----------------------------------------------
with pd.HDFStore(h5_filename, mode='r') as store:
    stored_check = store.select(key, columns=['ts_code', 'trade_date', 'is_st'])

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
stored_check.info()