{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:57:36.913044Z", "start_time": "2025-04-09T14:57:36.159612Z" } }, "outputs": [], "source": [ "import tushare as ts\n", "ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n", "pro = ts.pro_api()" ] }, { "cell_type": "code", "execution_count": 2, "id": "14671a7f72de2564", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:57:39.128278Z", "start_time": "2025-04-09T14:57:36.918051Z" } }, "outputs": [], "source": [ "from datetime import datetime\n", "import pandas as pd\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "def filter_rows(df):\n", " # 按照 name 和 start_date 分组\n", " def select_row(group):\n", " # 如果有 end_date 不为 NaT 的行,优先保留这些行\n", " valid_rows = group[group['end_date'].notna()]\n", " if not valid_rows.empty:\n", " return valid_rows.iloc[0] # 返回第一个有效行\n", " else:\n", " return group.iloc[0] # 如果没有有效行,返回第一行\n", "\n", " filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n", " filtered_df = filtered_df.reset_index(drop=True)\n", " return filtered_df\n", "\n", "def is_st(name_change_dict, stock_code, target_date):\n", " target_date = datetime.strptime(target_date, '%Y%m%d')\n", " if stock_code not in name_change_dict.keys():\n", " return False\n", " df = name_change_dict[stock_code]\n", " for i in range(len(df)):\n", " sds = df.iloc[i, 2]\n", " eds = df.iloc[i, 3]\n", " if eds is None or eds is pd.NaT:\n", " eds = datetime.now()\n", " if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n", " return True\n", " return False\n", "\n", "name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n", "name_change_df = name_change_df.drop_duplicates(keep='first')\n", "\n", "# 确保 name_change_df 的日期格式正确\n", "name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n", "name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n", "# name_change_df = name_change_df[name_change_df.name.str.contains('ST') ]\n", "name_change_dict = {}\n", "for ts_code, group in name_change_df.groupby('ts_code'):\n", " # 只保留 'ST' 和 '*ST' 的记录\n", " # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n", " st_data = group[(group['name'].str.contains('ST')) | (group['name'].str.contains('退'))]\n", " if not st_data.empty:\n", " name_change_dict[ts_code] = filter_rows(st_data)" ] }, { "cell_type": "code", "execution_count": 3, "id": "e7f8cce2f80e2f20", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:09.296046Z", "start_time": "2025-04-09T14:57:39.339423Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 8647642 entries, 0 to 26951\n", "Data columns (total 2 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 ts_code object\n", " 1 trade_date object\n", "dtypes: object(2)\n", "memory usage: 197.9+ MB\n", "None\n", "20250516\n", "20250519\n" ] } ], "source": [ "import time\n", "from concurrent.futures import ThreadPoolExecutor, as_completed\n", "\n", "h5_filename = '../../../data/daily_basic.h5'\n", "key = '/daily_basic'\n", "max_date = None\n", "with pd.HDFStore(h5_filename, mode='r') as store:\n", " df = store[key][['ts_code', 'trade_date']]\n", " print(df.info())\n", " max_date = df['trade_date'].max()\n", "\n", "print(max_date)\n", "trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250720')\n", "trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n", "trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n", "start_date = min(trade_dates)\n", "print(start_date)" ] }, { "cell_type": "code", "execution_count": 4, "id": "553cfb36-f560-4cc4-b2bc-68323ccc5072", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:16.817010Z", "start_time": "2025-04-09T14:58:09.326485Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "任务 20250717 完成\n", "任务 20250718 完成\n", "任务 20250715 完成\n", "任务 20250716 完成\n", "任务 20250714 完成\n", "任务 20250711 完成\n", "任务 20250709 完成\n", "任务 20250710 完成\n", "任务 20250708 完成\n", "任务 20250707 完成\n", "任务 20250704 完成\n", "任务 20250703 完成\n", "任务 20250702 完成\n", "任务 20250701 完成\n", "任务 20250630 完成\n", "任务 20250627 完成\n", "任务 20250625 完成\n", "任务 20250626 完成\n", "任务 20250624 完成\n", "任务 20250623 完成\n", "任务 20250619 完成\n", "任务 20250620 完成\n", "任务 20250618 完成\n", "任务 20250617 完成\n", "任务 20250616 完成\n", "任务 20250613 完成\n", "任务 20250612 完成\n", "任务 20250611 完成\n", "任务 20250610 完成\n", "任务 20250609 完成\n", "任务 20250606 完成\n", "任务 20250605 完成\n", "任务 20250604 完成\n", "任务 20250603 完成\n", "任务 20250529 完成\n", "任务 20250530 完成\n", "任务 20250527 完成\n", "任务 20250528 完成\n", "任务 20250526 完成\n", "任务 20250523 完成\n", "任务 20250522 完成\n", "任务 20250521 完成\n", "任务 20250520 完成\n", "任务 20250519 完成\n" ] } ], "source": [ "\n", "\n", "# 使用 HDFStore 存储数据\n", "all_daily_data = []\n", "\n", "# API 调用计数和时间控制变量\n", "api_call_count = 0\n", "batch_start_time = time.time()\n", "\n", "\n", "def get_data(trade_date):\n", " daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n", " if daily_basic_data is not None and not daily_basic_data.empty:\n", " # 添加交易日期列标识\n", " daily_basic_data['trade_date'] = trade_date\n", " daily_basic_data['is_st'] = daily_basic_data.apply(\n", " lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n", " )\n", " time.sleep(0.2)\n", " # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n", " return daily_basic_data\n", "\n", "\n", "# 遍历每个交易日期并获取数据\n", "with ThreadPoolExecutor(max_workers=2) as executor:\n", " future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n", "\n", " for future in as_completed(future_to_date):\n", " trade_date = future_to_date[future] # 获取对应的交易日期\n", " try:\n", " result = future.result() # 获取任务执行的结果\n", " all_daily_data.append(result)\n", " print(f\"任务 {trade_date} 完成\")\n", " except Exception as e:\n", " print(f\"获取 {trade_date} 数据时出错: {e}\")\n", " # 计数一次 API 调用\n", " api_call_count += 1\n", "\n", " # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n", " if api_call_count % 150 == 0:\n", " elapsed = time.time() - batch_start_time\n", " if elapsed < 60:\n", " sleep_time = 60 - elapsed\n", " print(f\"已调用 150 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n", " time.sleep(sleep_time)\n", " # 重置批次起始时间\n", " batch_start_time = time.time()\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "919023c693d7a47a", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:16.864178Z", "start_time": "2025-04-09T14:58:16.855084Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ts_code trade_date close turnover_rate turnover_rate_f \\\n", "0 000839.SZ 20250523 2.67 0.8124 1.2782 \n", "1 300274.SZ 20250523 60.60 3.2852 3.7071 \n", "2 301356.SZ 20250523 17.59 5.0050 5.0698 \n", "3 600152.SH 20250523 5.73 1.3359 2.0988 \n", "4 300049.SZ 20250523 29.91 1.6066 1.7292 \n", "... ... ... ... ... ... \n", "26941 002458.SZ 20250519 8.36 2.1950 2.5416 \n", "26942 600882.SH 20250519 27.18 2.2244 4.6853 \n", "26943 001283.SZ 20250519 54.51 3.0453 3.0453 \n", "26944 000718.SZ 20250519 2.20 1.4790 2.2404 \n", "26945 002141.SZ 20250519 3.09 4.9267 7.1872 \n", "\n", " volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n", "0 0.62 NaN NaN 7.4695 3.0824 3.1095 0.0000 \n", "1 1.82 11.3840 9.8414 3.0807 1.6137 1.4907 1.1292 \n", "2 1.43 NaN 18055.4366 1.2789 4.2618 3.3028 0.0000 \n", "3 1.11 NaN NaN 1.7367 1.9844 2.0758 0.0000 \n", "4 1.05 70.3242 80.3071 4.4707 5.9056 5.8725 0.0000 \n", "... ... ... ... ... ... ... ... \n", "26941 1.47 18.3588 24.2570 2.1403 2.9497 3.0116 2.3923 \n", "26942 0.89 122.4919 89.9537 3.0986 2.8733 2.7144 0.0000 \n", "26943 0.92 48.1520 36.6481 2.1043 0.8602 0.8229 0.8691 \n", "26944 1.76 40.4178 55.0402 0.7058 3.1476 3.2425 3.6364 \n", "26945 1.51 NaN NaN 3.8214 7.2461 4.4422 0.0000 \n", "\n", " dv_ttm total_share float_share free_share total_mv \\\n", "0 NaN 391982.6352 391982.6352 249133.8007 1.046594e+06 \n", "1 1.1292 207321.1424 158970.9449 140880.3307 1.256366e+07 \n", "2 NaN 21600.0000 5481.0000 5410.9920 3.799440e+05 \n", "3 NaN 52907.9375 52907.9375 33676.4965 3.031625e+05 \n", "4 NaN 26635.6100 23351.5217 21696.0562 7.966711e+05 \n", "... ... ... ... ... ... \n", "26941 2.3577 110641.2915 74886.8285 64675.1303 9.249612e+05 \n", "26942 NaN 51205.3647 51205.3647 24310.0793 1.391762e+06 \n", "26943 0.8691 8061.0011 5785.5721 5785.5721 4.394052e+05 \n", "26944 3.6364 303463.6384 228209.3122 150654.2061 6.676200e+05 \n", "26945 NaN 103293.5798 103159.2875 70714.2228 3.191772e+05 \n", "\n", " circ_mv is_st \n", "0 1.046594e+06 False \n", "1 9.633639e+06 False \n", "2 9.641079e+04 False \n", "3 3.031625e+05 False \n", "4 6.984440e+05 False \n", "... ... ... \n", "26941 6.260539e+05 False \n", "26942 1.391762e+06 False \n", "26943 3.153715e+05 False \n", "26944 5.020605e+05 False \n", "26945 3.187622e+05 True \n", "\n", "[26946 rows x 19 columns]\n" ] } ], "source": [ "all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n", "print(all_daily_data_df)" ] }, { "cell_type": "code", "execution_count": 6, "id": "28cb78d032671b20", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:16.881685Z", "start_time": "2025-04-09T14:58:16.871184Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ts_code trade_date close turnover_rate turnover_rate_f \\\n", "23 002898.SZ 20250523 10.20 22.8874 36.4442 \n", "35 000889.SZ 20250523 2.76 1.6609 2.2443 \n", "53 300379.SZ 20250523 6.12 9.3935 9.5800 \n", "58 300268.SZ 20250523 10.27 1.8178 2.5956 \n", "155 000615.SZ 20250523 3.15 1.1640 1.7189 \n", "... ... ... ... ... ... \n", "26880 300147.SZ 20250519 8.80 6.8409 8.8527 \n", "26891 002501.SZ 20250519 2.17 4.4260 5.7136 \n", "26910 600421.SH 20250519 6.39 3.4329 7.3909 \n", "26938 600289.SH 20250519 5.90 1.1380 1.6532 \n", "26945 002141.SZ 20250519 3.09 4.9267 7.1872 \n", "\n", " volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio dv_ttm \\\n", "23 10.43 NaN NaN 3.6011 6.8112 7.2338 0.1961 0.1961 \n", "35 0.52 NaN NaN 27.2957 1.7661 1.7554 0.0000 NaN \n", "53 0.89 NaN NaN 1.0993 4.5062 4.1828 0.0000 NaN \n", "58 0.99 NaN NaN NaN 0.5235 0.5833 0.0000 NaN \n", "155 0.99 NaN NaN NaN 2.1957 2.2727 0.0000 NaN \n", "... ... .. ... ... ... ... ... ... \n", "26880 1.55 NaN NaN 6.0171 3.1309 3.4015 0.0000 NaN \n", "26891 1.83 NaN NaN 23.5587 23.0948 27.1516 0.0000 NaN \n", "26910 0.92 NaN NaN 173.6254 10.6672 10.8459 0.0000 NaN \n", "26938 0.46 NaN NaN 3.0370 11.6255 11.9049 0.0000 NaN \n", "26945 1.51 NaN NaN 3.8214 7.2461 4.4422 0.0000 NaN \n", "\n", " total_share float_share free_share total_mv circ_mv is_st \n", "23 17600.0000 10126.2561 6359.4096 179520.0000 103287.8122 True \n", "35 93629.1116 86984.9676 64375.7658 258416.3480 240078.5106 True \n", "53 55792.2828 52663.7564 51638.5483 341448.7707 322302.1892 True \n", "58 17420.0000 13370.7500 9364.1581 178903.4000 137317.6025 True \n", "155 76297.9719 76250.0287 51632.2709 240338.6115 240187.5904 True \n", "... ... ... ... ... ... ... \n", "26880 66127.9045 65745.9042 50804.9121 581925.5596 578563.9570 True \n", "26891 355000.0000 354999.9006 274999.9006 770350.0000 770349.7843 True \n", "26910 19560.0000 19560.0000 9085.2748 124988.4000 124988.4000 True \n", "26938 63105.2069 56592.2684 38956.2787 372320.7207 333894.3836 True \n", "26945 103293.5798 103159.2875 70714.2228 319177.1616 318762.1984 True \n", "\n", "[944 rows x 19 columns]\n" ] } ], "source": [ "print(all_daily_data_df[all_daily_data_df['is_st']])" ] }, { "cell_type": "code", "execution_count": 7, "id": "692b58674b7462c9", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:17.773453Z", "start_time": "2025-04-09T14:58:16.903459Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "所有每日基础数据获取并保存完毕!\n" ] } ], "source": [ "# 将数据保存为 HDF5 文件(table 格式)\n", "all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n", "\n", "print(\"所有每日基础数据获取并保存完毕!\")\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "d7a773fc20293477", "metadata": { "ExecuteTime": { "end_time": "2025-04-09T14:58:24.305403Z", "start_time": "2025-04-09T14:58:17.816332Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 8674588 entries, 0 to 26945\n", "Data columns (total 3 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 ts_code object\n", " 1 trade_date object\n", " 2 is_st bool \n", "dtypes: bool(1), object(2)\n", "memory usage: 206.8+ MB\n", "None\n" ] } ], "source": [ "with pd.HDFStore(h5_filename, mode='r') as store:\n", " df = store[key][['ts_code', 'trade_date', 'is_st']]\n", " print(df.info())" ] } ], "metadata": { "kernelspec": { "display_name": "new_trader", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 5 }