454 lines
17 KiB
Plaintext
454 lines
17 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-30T16:42:34.194992Z",
|
||
"start_time": "2025-03-30T16:42:33.440178Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"import tushare as ts\n",
|
||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||
"pro = ts.pro_api()"
|
||
],
|
||
"outputs": [],
|
||
"execution_count": 1
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "14671a7f72de2564",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-30T16:42:36.432691Z",
|
||
"start_time": "2025-03-30T16:42:34.197998Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"from datetime import datetime\n",
|
||
"import pandas as pd\n",
|
||
"import warnings\n",
|
||
"\n",
|
||
"warnings.filterwarnings(\"ignore\")\n",
|
||
"def filter_rows(df):\n",
|
||
" # 按照 name 和 start_date 分组\n",
|
||
" def select_row(group):\n",
|
||
" # 如果有 end_date 不为 NaT 的行,优先保留这些行\n",
|
||
" valid_rows = group[group['end_date'].notna()]\n",
|
||
" if not valid_rows.empty:\n",
|
||
" return valid_rows.iloc[0] # 返回第一个有效行\n",
|
||
" else:\n",
|
||
" return group.iloc[0] # 如果没有有效行,返回第一行\n",
|
||
"\n",
|
||
" filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n",
|
||
" filtered_df = filtered_df.reset_index(drop=True)\n",
|
||
" return filtered_df\n",
|
||
"\n",
|
||
"def is_st(name_change_dict, stock_code, target_date):\n",
|
||
" target_date = datetime.strptime(target_date, '%Y%m%d')\n",
|
||
" if stock_code not in name_change_dict.keys():\n",
|
||
" return False\n",
|
||
" df = name_change_dict[stock_code]\n",
|
||
" for i in range(len(df)):\n",
|
||
" sds = df.iloc[i, 2]\n",
|
||
" eds = df.iloc[i, 3]\n",
|
||
" if eds is None or eds is pd.NaT:\n",
|
||
" eds = datetime.now()\n",
|
||
" if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
|
||
" return True\n",
|
||
" return False\n",
|
||
"\n",
|
||
"name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n",
|
||
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
|
||
"\n",
|
||
"# 确保 name_change_df 的日期格式正确\n",
|
||
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
|
||
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
|
||
"name_change_df = name_change_df[name_change_df.name.str.contains('ST')]\n",
|
||
"name_change_dict = {}\n",
|
||
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
|
||
" # 只保留 'ST' 和 '*ST' 的记录\n",
|
||
" st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
|
||
" if not st_data.empty:\n",
|
||
" name_change_dict[ts_code] = filter_rows(st_data)"
|
||
],
|
||
"outputs": [],
|
||
"execution_count": 2
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "e7f8cce2f80e2f20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-30T16:43:03.790361Z",
|
||
"start_time": "2025-03-30T16:42:36.633554Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"import time\n",
|
||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||
"\n",
|
||
"h5_filename = '../../../data/daily_basic.h5'\n",
|
||
"key = '/daily_basic'\n",
|
||
"max_date = None\n",
|
||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||
" df = store[key][['ts_code', 'trade_date']]\n",
|
||
" print(df.info())\n",
|
||
" max_date = df['trade_date'].max()\n",
|
||
"\n",
|
||
"print(max_date)\n",
|
||
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250420')\n",
|
||
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
|
||
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
|
||
"start_date = min(trade_dates)\n",
|
||
"print(start_date)"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 8453605 entries, 0 to 32308\n",
|
||
"Data columns (total 2 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
"dtypes: object(2)\n",
|
||
"memory usage: 193.5+ MB\n",
|
||
"None\n",
|
||
"20250321\n",
|
||
"20250324\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 3
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
|
||
"metadata": {
|
||
"scrolled": true,
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-30T16:43:07.947442Z",
|
||
"start_time": "2025-03-30T16:43:03.827519Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"\n",
|
||
"\n",
|
||
"# 使用 HDFStore 存储数据\n",
|
||
"all_daily_data = []\n",
|
||
"\n",
|
||
"# API 调用计数和时间控制变量\n",
|
||
"api_call_count = 0\n",
|
||
"batch_start_time = time.time()\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_data(trade_date):\n",
|
||
" daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
|
||
" if daily_basic_data is not None and not daily_basic_data.empty:\n",
|
||
" # 添加交易日期列标识\n",
|
||
" daily_basic_data['trade_date'] = trade_date\n",
|
||
" daily_basic_data['is_st'] = daily_basic_data.apply(\n",
|
||
" lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n",
|
||
" )\n",
|
||
" time.sleep(0.2)\n",
|
||
" # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
|
||
" return daily_basic_data\n",
|
||
"\n",
|
||
"\n",
|
||
"# 遍历每个交易日期并获取数据\n",
|
||
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
|
||
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
|
||
"\n",
|
||
" for future in as_completed(future_to_date):\n",
|
||
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
|
||
" try:\n",
|
||
" result = future.result() # 获取任务执行的结果\n",
|
||
" all_daily_data.append(result)\n",
|
||
" print(f\"任务 {trade_date} 完成\")\n",
|
||
" except Exception as e:\n",
|
||
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
|
||
" # 计数一次 API 调用\n",
|
||
" api_call_count += 1\n",
|
||
"\n",
|
||
" # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n",
|
||
" if api_call_count % 150 == 0:\n",
|
||
" elapsed = time.time() - batch_start_time\n",
|
||
" if elapsed < 60:\n",
|
||
" sleep_time = 60 - elapsed\n",
|
||
" print(f\"已调用 150 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
|
||
" time.sleep(sleep_time)\n",
|
||
" # 重置批次起始时间\n",
|
||
" batch_start_time = time.time()\n",
|
||
"\n"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"任务 20250418 完成\n",
|
||
"任务 20250417 完成\n",
|
||
"任务 20250416 完成\n",
|
||
"任务 20250415 完成\n",
|
||
"任务 20250411 完成\n",
|
||
"任务 20250414 完成\n",
|
||
"任务 20250410 完成\n",
|
||
"任务 20250409 完成\n",
|
||
"任务 20250408 完成\n",
|
||
"任务 20250407 完成\n",
|
||
"任务 20250403 完成\n",
|
||
"任务 20250402 完成\n",
|
||
"任务 20250331 完成\n",
|
||
"任务 20250401 完成\n",
|
||
"任务 20250327 完成\n",
|
||
"任务 20250328 完成\n",
|
||
"任务 20250326 完成\n",
|
||
"任务 20250324 完成\n",
|
||
"任务 20250325 完成\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 4
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "919023c693d7a47a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-30T16:43:07.962318Z",
|
||
"start_time": "2025-03-30T16:43:07.951757Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
|
||
"print(all_daily_data_df)"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"0 603328.SH 20250327 10.44 1.0910 2.6596 \n",
|
||
"1 603989.SH 20250327 15.66 0.9036 2.6145 \n",
|
||
"2 603194.SH 20250327 38.03 14.0348 14.0348 \n",
|
||
"3 600884.SH 20250327 7.13 1.9769 2.1153 \n",
|
||
"4 688325.SH 20250327 47.26 1.5250 1.8078 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"26946 688539.SH 20250325 26.70 1.0257 1.3011 \n",
|
||
"26947 688479.SH 20250325 18.73 0.9840 1.2588 \n",
|
||
"26948 000552.SZ 20250325 2.63 1.8147 3.0665 \n",
|
||
"26949 688719.SH 20250325 31.64 4.2998 5.1737 \n",
|
||
"26950 002709.SZ 20250325 19.50 1.2468 1.4268 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"0 0.79 29.3625 23.3887 2.5786 3.2807 2.9727 1.8582 \n",
|
||
"1 0.79 17.8968 27.7940 1.7060 1.8591 1.6666 1.6823 \n",
|
||
"2 1.87 18.9266 18.3213 3.2891 2.5755 2.4322 NaN \n",
|
||
"3 0.52 20.9930 NaN 0.7305 0.8425 0.9106 2.7224 \n",
|
||
"4 0.93 67.1638 50.1073 2.3433 16.1029 10.2149 NaN \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"26946 0.56 51.5254 83.3548 2.8475 14.5500 13.9718 NaN \n",
|
||
"26947 0.61 23.5448 33.4921 1.4043 3.6736 4.5444 NaN \n",
|
||
"26948 1.42 8.0989 11.6324 0.8431 1.2501 1.3463 3.8023 \n",
|
||
"26949 1.64 26.3323 49.9921 2.0474 4.4195 3.6954 NaN \n",
|
||
"26950 0.76 19.7447 78.2248 2.9106 2.4233 3.0741 1.5444 \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv \\\n",
|
||
"0 1.8582 99844.2611 99844.2611 40955.5563 1.042374e+06 \n",
|
||
"1 1.6823 40113.0603 40113.0603 13863.2102 6.281705e+05 \n",
|
||
"2 NaN 40100.0000 4982.8436 4982.8436 1.525003e+06 \n",
|
||
"3 2.7224 225339.6168 175723.6492 164220.4548 1.606671e+06 \n",
|
||
"4 NaN 8494.7740 3830.4117 3231.0886 4.014630e+05 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"26946 NaN 18592.0000 10286.0800 8109.0800 4.964064e+05 \n",
|
||
"26947 NaN 14431.7400 6087.4224 4758.2224 2.703065e+05 \n",
|
||
"26948 3.8023 535180.1936 372577.7383 220477.9354 1.407524e+06 \n",
|
||
"26949 NaN 11538.5418 7349.9938 6108.5305 3.650795e+05 \n",
|
||
"26950 1.5444 191434.3762 138501.6891 121034.9868 3.732970e+06 \n",
|
||
"\n",
|
||
" circ_mv is_st \n",
|
||
"0 1.042374e+06 False \n",
|
||
"1 6.281705e+05 False \n",
|
||
"2 1.894975e+05 False \n",
|
||
"3 1.252910e+06 False \n",
|
||
"4 1.810253e+05 False \n",
|
||
"... ... ... \n",
|
||
"26946 2.746383e+05 False \n",
|
||
"26947 1.140174e+05 False \n",
|
||
"26948 9.798795e+05 False \n",
|
||
"26949 2.325538e+05 False \n",
|
||
"26950 2.700783e+06 False \n",
|
||
"\n",
|
||
"[26951 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 5
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "28cb78d032671b20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-30T16:43:08.000073Z",
|
||
"start_time": "2025-03-30T16:43:07.984082Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"print(all_daily_data_df[all_daily_data_df['is_st']])"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"100 002528.SZ 20250327 2.53 0.6855 1.4642 \n",
|
||
"128 300163.SZ 20250327 3.15 3.0563 3.2999 \n",
|
||
"129 300205.SZ 20250327 4.34 0.9211 1.5246 \n",
|
||
"147 000851.SZ 20250327 2.53 2.2990 2.6472 \n",
|
||
"299 300097.SZ 20250327 4.88 3.1648 3.6912 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"26750 000506.SZ 20250325 5.21 1.2689 1.8939 \n",
|
||
"26770 002592.SZ 20250325 5.22 1.0547 1.6712 \n",
|
||
"26786 600603.SH 20250325 7.63 0.4610 1.0776 \n",
|
||
"26828 002528.SZ 20250325 2.51 0.9799 2.0928 \n",
|
||
"26906 300097.SZ 20250325 4.92 3.2717 3.8159 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"100 0.43 NaN NaN 7.3528 2.1714 2.7257 0.0000 \n",
|
||
"128 0.87 NaN NaN 3.0547 5.9187 5.8999 0.0000 \n",
|
||
"129 0.63 94.7108 NaN 1.3743 1.0976 1.5538 0.4608 \n",
|
||
"147 0.64 NaN NaN 1.0360 0.4939 0.8666 0.0000 \n",
|
||
"299 0.70 10.0614 NaN 2.2055 2.9549 3.1999 0.0000 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"26750 0.37 725.4828 NaN 8.2869 17.0204 21.9262 0.0000 \n",
|
||
"26770 0.94 14.0192 61.1217 1.6387 2.7253 2.3121 0.0000 \n",
|
||
"26786 0.56 15.6086 24.2223 1.3160 1.8461 2.4398 0.0000 \n",
|
||
"26828 0.58 NaN NaN 7.2947 2.1542 2.7042 0.0000 \n",
|
||
"26906 0.53 10.1438 NaN 2.2236 2.9791 3.2261 0.0000 \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv circ_mv \\\n",
|
||
"100 NaN 119867.5082 105021.9577 49171.2582 303264.7957 265705.5530 \n",
|
||
"128 NaN 47400.0000 41596.4553 38525.5904 149310.0000 131028.8342 \n",
|
||
"129 0.4608 43005.6000 42599.1218 25737.4813 186644.3040 184880.1886 \n",
|
||
"147 NaN 115786.0020 113197.7266 98311.5254 292938.5851 286390.2483 \n",
|
||
"299 NaN 28854.9669 27000.9948 23150.5534 140812.2385 131764.8546 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"26750 NaN 92901.7761 92867.0961 62218.8027 484018.2535 483837.5707 \n",
|
||
"26770 NaN 28333.1157 26271.6370 16580.1814 147898.8640 137137.9451 \n",
|
||
"26786 NaN 119332.9151 119332.9151 51048.6002 910510.1422 910510.1422 \n",
|
||
"26828 NaN 119867.5082 105021.9577 49171.2582 300867.4456 263605.1138 \n",
|
||
"26906 NaN 28854.9669 27000.9948 23150.5534 141966.4371 132844.8944 \n",
|
||
"\n",
|
||
" is_st \n",
|
||
"100 True \n",
|
||
"128 True \n",
|
||
"129 True \n",
|
||
"147 True \n",
|
||
"299 True \n",
|
||
"... ... \n",
|
||
"26750 True \n",
|
||
"26770 True \n",
|
||
"26786 True \n",
|
||
"26828 True \n",
|
||
"26906 True \n",
|
||
"\n",
|
||
"[540 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 6
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "692b58674b7462c9",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-30T16:43:08.703938Z",
|
||
"start_time": "2025-03-30T16:43:08.021067Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"# 将数据保存为 HDF5 文件(table 格式)\n",
|
||
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
|
||
"\n",
|
||
"print(\"所有每日基础数据获取并保存完毕!\")\n"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"所有每日基础数据获取并保存完毕!\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 7
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "d7a773fc20293477",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-03-30T16:43:15.188800Z",
|
||
"start_time": "2025-03-30T16:43:08.725449Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
|
||
" print(df.info())"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 8480556 entries, 0 to 26950\n",
|
||
"Data columns (total 3 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
" 2 is_st bool \n",
|
||
"dtypes: bool(1), object(2)\n",
|
||
"memory usage: 202.2+ MB\n",
|
||
"None\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 8
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|