445 lines
17 KiB
Plaintext
445 lines
17 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-06T15:33:43.537483Z",
|
||
"start_time": "2025-04-06T15:33:42.844004Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
"import os\n",
"\n",
"import tushare as ts\n",
"\n",
"# The API token is read from the TUSHARE_TOKEN environment variable so that no\n",
"# credential is stored in the notebook. The token that used to be hardcoded\n",
"# here must be considered exposed and should be revoked.\n",
"tushare_token = os.environ.get('TUSHARE_TOKEN')\n",
"if not tushare_token:\n",
"    raise RuntimeError('Set the TUSHARE_TOKEN environment variable to your Tushare API token')\n",
"ts.set_token(tushare_token)\n",
"pro = ts.pro_api()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "14671a7f72de2564",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-06T15:33:45.387772Z",
|
||
"start_time": "2025-04-06T15:33:43.537483Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from datetime import datetime\n",
|
||
"import pandas as pd\n",
|
||
"import warnings\n",
|
||
"\n",
|
||
"warnings.filterwarnings(\"ignore\")\n",
|
"def filter_rows(df):\n",
"    \"\"\"Keep one row per (``name``, ``start_date``) pair.\n",
"\n",
"    Within each group the first row whose ``end_date`` is not null is kept;\n",
"    if every ``end_date`` in the group is null, the group's first row is kept.\n",
"    The result is returned with a fresh 0..n-1 index.\n",
"    \"\"\"\n",
"\n",
"    def pick_representative(rows):\n",
"        # Rows with a known end_date take priority over open-ended ones.\n",
"        dated = rows[rows['end_date'].notna()]\n",
"        candidates = rows if dated.empty else dated\n",
"        return candidates.iloc[0]\n",
"\n",
"    grouped = df.groupby(['name', 'start_date'], group_keys=False)\n",
"    return grouped.apply(pick_representative).reset_index(drop=True)\n",
|
||
"\n",
|
"def is_st(name_change_dict, stock_code, target_date):\n",
"    \"\"\"Return True if ``stock_code`` is inside a recorded ST period on ``target_date``.\n",
"\n",
"    Args:\n",
"        name_change_dict: Mapping of stock code to a DataFrame with\n",
"            ``start_date`` and ``end_date`` columns, one row per ST period.\n",
"        stock_code: Code to look up in ``name_change_dict``.\n",
"        target_date: Date string in ``%Y%m%d`` format.\n",
"\n",
"    Returns:\n",
"        True when ``target_date`` falls within any period (both ends\n",
"        inclusive), otherwise False. Codes missing from the mapping give False.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``target_date`` does not match ``%Y%m%d``.\n",
"    \"\"\"\n",
"    target = datetime.strptime(target_date, '%Y%m%d')\n",
"    if stock_code not in name_change_dict:\n",
"        return False\n",
"    periods = name_change_dict[stock_code]\n",
"    # Columns are read by label rather than by position so a change in column\n",
"    # order cannot silently compare against the wrong fields.\n",
"    for start, end in zip(periods['start_date'], periods['end_date']):\n",
"        if pd.isna(start) or target < start:\n",
"            continue\n",
"        # A missing end_date means the period is still open, so it has no upper\n",
"        # bound. (Comparing against datetime.now() made the answer depend on\n",
"        # the time of day the notebook was run.)\n",
"        if pd.isna(end) or target <= end:\n",
"            return True\n",
"    return False\n",
|
||
"\n",
|
"# Load the name-change history and discard exact duplicate rows.\n",
"name_change_df = pd.read_hdf(\n",
"    '../../../data/name_change.h5', key='name_change'\n",
").drop_duplicates(keep='first')\n",
"\n",
"# Parse both date columns from YYYYMMDD; end_date values that cannot be parsed become NaT.\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"\n",
"# Keep only the records whose name contains 'ST'.\n",
"name_change_df = name_change_df[name_change_df['name'].str.contains('ST')]\n",
"\n",
"# Map each stock code to its de-duplicated ST periods.\n",
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
"    # Only records whose change_reason is exactly 'ST' or '*ST' are kept.\n",
"    st_data = group[group['change_reason'].isin(['ST', '*ST'])]\n",
"    if st_data.empty:\n",
"        continue\n",
"    name_change_dict[ts_code] = filter_rows(st_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "e7f8cce2f80e2f20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-06T15:33:54.089114Z",
|
||
"start_time": "2025-04-06T15:33:45.576286Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 8502128 entries, 0 to 21571\n",
|
||
"Data columns (total 2 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
"dtypes: object(2)\n",
|
||
"memory usage: 194.6+ MB\n",
|
||
"None\n",
|
||
"20250403\n",
|
||
"20250407\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import time\n",
|
||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||
"\n",
|
"h5_filename = '../../../data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"\n",
"# Read only the two columns that are needed instead of materialising the whole table.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store.select(key, columns=['ts_code', 'trade_date'])\n",
"\n",
"# info() prints its report itself and returns None, so it is not wrapped in print().\n",
"df.info()\n",
"max_date = df['trade_date'].max()\n",
"print(max_date)\n",
"\n",
"# Ask for the calendar up to today's date in Asia/Shanghai rather than a fixed end date,\n",
"# so the notebook keeps working after the previously hardcoded date has passed.\n",
"calendar_end = pd.Timestamp.now(tz='Asia/Shanghai').strftime('%Y%m%d')\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date=calendar_end)\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1]  # keep open trading days only\n",
"\n",
"# Trading days that are newer than anything already stored.\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"if not trade_dates:\n",
"    raise ValueError(f'daily_basic is already up to date (latest trade_date {max_date}); nothing to fetch')\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-06T15:33:57.041254Z",
|
||
"start_time": "2025-04-06T15:33:54.103322Z"
|
||
},
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"任务 20250417 完成\n",
|
||
"任务 20250418 完成\n",
|
||
"任务 20250416 完成\n",
|
||
"任务 20250415 完成\n",
|
||
"任务 20250414 完成\n",
|
||
"任务 20250411 完成\n",
|
||
"任务 20250410 完成\n",
|
||
"任务 20250409 完成\n",
|
||
"任务 20250408 完成\n",
|
||
"任务 20250407 完成\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"\n",
|
"# Accumulator for the DataFrames collected from the fetch tasks in this cell.\n",
"all_daily_data = []\n",
"\n",
"# Rate-limit bookkeeping: when the current batch started and how many API calls were counted.\n",
"batch_start_time = time.time()\n",
"api_call_count = 0\n",
|
||
"\n",
|
||
"\n",
|
"def get_data(trade_date):\n",
"    \"\"\"Fetch the ``daily_basic`` rows for one trade date and flag ST stocks.\n",
"\n",
"    Calls ``pro.daily_basic`` for ``trade_date``. When the response is a\n",
"    non-empty frame, its ``trade_date`` column is set to the requested date, an\n",
"    ``is_st`` column is computed with ``is_st`` against ``name_change_dict``,\n",
"    and the worker pauses 0.2 seconds. A ``None`` or empty response is\n",
"    returned unchanged.\n",
"    \"\"\"\n",
"    frame = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
"    if frame is None or frame.empty:\n",
"        return frame\n",
"\n",
"    # Stamp every row with the requested trade date.\n",
"    frame['trade_date'] = trade_date\n",
"    frame['is_st'] = [\n",
"        is_st(name_change_dict, code, trade_date) for code in frame['ts_code']\n",
"    ]\n",
"    time.sleep(0.2)\n",
"    return frame\n",
|
||
"\n",
|
||
"\n",
|
"# Fetch every pending trade date, at most CALLS_PER_BATCH requests per 60-second window.\n",
"# Futures are submitted one batch at a time: sleeping in the result loop after\n",
"# everything had already been queued did not slow the worker threads down.\n",
"CALLS_PER_BATCH = 150\n",
"RATE_WINDOW_SECONDS = 60\n",
"\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
"    for batch_begin in range(0, len(trade_dates), CALLS_PER_BATCH):\n",
"        batch_dates = trade_dates[batch_begin:batch_begin + CALLS_PER_BATCH]\n",
"        future_to_date = {executor.submit(get_data, td): td for td in batch_dates}\n",
"\n",
"        for future in as_completed(future_to_date):\n",
"            trade_date = future_to_date[future]  # date this future was submitted for\n",
"            try:\n",
"                result = future.result()\n",
"            except Exception as e:\n",
"                print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
"            else:\n",
"                # get_data may return None or an empty frame; keep only real data.\n",
"                if result is not None and not result.empty:\n",
"                    all_daily_data.append(result)\n",
"                print(f\"任务 {trade_date} 完成\")\n",
"            # Count one API call per finished task, successful or not.\n",
"            api_call_count += 1\n",
"\n",
"        # Before starting another batch, wait out the rest of the current window.\n",
"        more_batches_pending = batch_begin + CALLS_PER_BATCH < len(trade_dates)\n",
"        if more_batches_pending:\n",
"            elapsed = time.time() - batch_start_time\n",
"            if elapsed < RATE_WINDOW_SECONDS:\n",
"                sleep_time = RATE_WINDOW_SECONDS - elapsed\n",
"                print(f\"已调用 {CALLS_PER_BATCH} 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
"                time.sleep(sleep_time)\n",
"            # Start timing the next window.\n",
"            batch_start_time = time.time()\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "919023c693d7a47a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-06T15:33:57.072796Z",
|
||
"start_time": "2025-04-06T15:33:57.061670Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"0 000059.SZ 20250407 4.54 1.8414 3.4767 \n",
|
||
"1 600830.SH 20250407 8.33 2.5217 3.6802 \n",
|
||
"2 688061.SH 20250407 24.45 3.1011 3.1011 \n",
|
||
"3 600868.SH 20250407 2.79 3.8477 4.1435 \n",
|
||
"4 605168.SH 20250407 25.98 1.3857 2.8470 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5386 688259.SH 20250407 34.99 5.9799 11.4393 \n",
|
||
"5387 301316.SZ 20250407 19.20 7.2272 7.9512 \n",
|
||
"5388 601116.SH 20250407 10.37 2.3317 7.1579 \n",
|
||
"5389 605016.SH 20250407 17.20 1.4773 3.9134 \n",
|
||
"5390 600148.SH 20250407 16.07 2.0776 4.5745 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"0 0.84 103.2927 NaN 0.5851 0.1574 0.1928 0.3084 \n",
|
||
"1 0.69 71.1750 71.1750 1.7467 11.2902 11.2902 0.1801 \n",
|
||
"2 2.31 292.8121 NaN 1.1504 6.1795 4.9755 NaN \n",
|
||
"3 1.16 NaN NaN 2.3425 16.8832 16.0274 0.0000 \n",
|
||
"4 1.56 10.3735 14.0394 1.9988 1.0366 1.2218 4.5870 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"5386 1.10 66.8795 64.8845 2.6173 5.9119 6.5930 NaN \n",
|
||
"5387 1.30 94.0750 110.9182 7.1350 5.7094 4.8530 0.4126 \n",
|
||
"5388 1.78 41.2451 36.3656 1.7811 1.4576 1.4350 1.9286 \n",
|
||
"5389 1.05 28.7938 22.2858 3.3051 6.4003 4.8254 1.3640 \n",
|
||
"5390 2.12 3441.4901 274.8323 4.8916 3.2666 3.3043 0.1867 \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv circ_mv \\\n",
|
||
"0 0.3084 159944.2537 159944.2537 84712.3362 726146.9118 726146.9118 \n",
|
||
"1 0.1801 45432.2747 45432.2747 31131.0133 378450.8483 378450.8483 \n",
|
||
"2 NaN 11488.9391 4329.7770 4329.7770 280904.5610 105863.0477 \n",
|
||
"3 NaN 189814.8679 189814.8679 176264.8506 529583.4814 529583.4814 \n",
|
||
"4 4.5870 21081.6986 21081.6986 10260.7016 547702.5296 547702.5296 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"5386 NaN 11170.0000 11170.0000 5839.1660 390838.3000 390838.3000 \n",
|
||
"5387 0.4126 40400.0000 24282.6503 22071.3403 775680.0000 466226.8858 \n",
|
||
"5388 1.9286 54767.8400 54767.8400 17840.9208 567942.5008 567942.5008 \n",
|
||
"5389 1.3640 32308.6400 32308.6400 12196.5716 555708.6080 555708.6080 \n",
|
||
"5390 0.1867 14151.6450 14151.6450 6427.3300 227416.9352 227416.9352 \n",
|
||
"\n",
|
||
" is_st \n",
|
||
"0 False \n",
|
||
"1 False \n",
|
||
"2 False \n",
|
||
"3 False \n",
|
||
"4 False \n",
|
||
"... ... \n",
|
||
"5386 False \n",
|
||
"5387 False \n",
|
||
"5388 False \n",
|
||
"5389 False \n",
|
||
"5390 False \n",
|
||
"\n",
|
||
"[5391 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Combine the per-date frames. Results arrive in thread-completion order, so sort\n",
"# by date and code to give the combined frame a deterministic row order.\n",
"if not all_daily_data:\n",
"    raise ValueError('all_daily_data is empty: no daily_basic rows were fetched')\n",
"all_daily_data_df = (\n",
"    pd.concat(all_daily_data, ignore_index=True)\n",
"    .sort_values(['trade_date', 'ts_code'], ignore_index=True)\n",
")\n",
"print(all_daily_data_df)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "28cb78d032671b20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-06T15:33:57.104132Z",
|
||
"start_time": "2025-04-06T15:33:57.095010Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"16 000656.SZ 20250407 1.28 0.9982 1.1644 \n",
|
||
"62 002748.SZ 20250407 7.32 0.5503 1.1888 \n",
|
||
"114 002490.SZ 20250407 3.49 0.7559 1.3380 \n",
|
||
"128 300165.SZ 20250407 2.78 4.0431 4.7932 \n",
|
||
"278 600303.SH 20250407 3.22 1.1873 1.4918 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5263 002217.SZ 20250407 2.07 0.1251 0.1569 \n",
|
||
"5267 002808.SZ 20250407 2.99 4.0901 4.7924 \n",
|
||
"5290 002602.SZ 20250407 6.44 0.2276 0.2634 \n",
|
||
"5315 002501.SZ 20250407 1.92 1.5653 2.0207 \n",
|
||
"5375 300376.SZ 20250407 2.96 1.4873 3.4865 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"16 0.44 NaN NaN NaN 0.1081 0.1637 0.0000 \n",
|
||
"62 0.61 96.0467 49.7297 1.3328 0.8402 0.8839 1.3661 \n",
|
||
"114 0.19 NaN NaN 5.6564 2.0529 2.0529 0.0000 \n",
|
||
"128 2.22 NaN NaN 0.9988 1.3542 1.4288 0.0000 \n",
|
||
"278 0.77 NaN NaN 1.4997 1.6142 1.6353 0.0000 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"5263 0.23 NaN NaN NaN 3.3436 10.3100 0.0000 \n",
|
||
"5267 0.79 NaN NaN 2.5039 5.2047 4.8881 0.6689 \n",
|
||
"5290 0.20 91.5846 53.4453 1.8455 3.6128 2.5226 0.0000 \n",
|
||
"5315 0.58 NaN NaN 7.1559 14.2934 20.0240 0.0000 \n",
|
||
"5375 4.52 12.2436 36.2242 0.9837 1.4380 2.0320 1.6554 \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv \\\n",
|
||
"16 NaN 533971.5816 531174.3236 455354.2392 6.834836e+05 \n",
|
||
"62 1.3661 24000.0000 24000.0000 11108.5000 1.756800e+05 \n",
|
||
"114 NaN 79784.8400 54161.3625 30599.6625 2.784491e+05 \n",
|
||
"128 NaN 49551.1725 42053.2110 35472.8422 1.377523e+05 \n",
|
||
"278 NaN 68360.4211 67560.4211 53770.9211 2.201206e+05 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5263 NaN 747939.8928 568036.4278 453036.0995 1.548236e+06 \n",
|
||
"5267 0.6689 26880.0000 18638.3713 15907.0731 8.037120e+04 \n",
|
||
"5290 NaN 745255.6968 687870.8273 594244.1179 4.799447e+06 \n",
|
||
"5315 NaN 355000.0000 354999.9006 274999.9006 6.816000e+05 \n",
|
||
"5375 1.6554 232824.0476 232743.4901 99284.6609 6.891592e+05 \n",
|
||
"\n",
|
||
" circ_mv is_st \n",
|
||
"16 6.799031e+05 True \n",
|
||
"62 1.756800e+05 True \n",
|
||
"114 1.890232e+05 True \n",
|
||
"128 1.169079e+05 True \n",
|
||
"278 2.175446e+05 True \n",
|
||
"... ... ... \n",
|
||
"5263 1.175835e+06 True \n",
|
||
"5267 5.572873e+04 True \n",
|
||
"5290 4.429888e+06 True \n",
|
||
"5315 6.815998e+05 True \n",
|
||
"5375 6.889207e+05 True \n",
|
||
"\n",
|
||
"[106 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Print only the rows whose is_st flag is True.\n",
"print(\n",
"    all_daily_data_df.loc[all_daily_data_df['is_st'], :]\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "692b58674b7462c9",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-06T15:33:57.927188Z",
|
||
"start_time": "2025-04-06T15:33:57.127166Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"所有每日基础数据获取并保存完毕!\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Append the combined frame to the 'daily_basic' table in the HDF5 file.\n",
"# Because the write uses mode='a' with append=True, running this cell again\n",
"# appends the same rows a second time.\n",
"all_daily_data_df.to_hdf(\n",
"    h5_filename,\n",
"    key='daily_basic',\n",
"    mode='a',\n",
"    format='table',\n",
"    append=True,\n",
"    data_columns=True,\n",
")\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "d7a773fc20293477",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-06T15:34:06.721517Z",
|
||
"start_time": "2025-04-06T15:33:57.951119Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 8507519 entries, 0 to 5390\n",
|
||
"Data columns (total 3 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
" 2 is_st bool \n",
|
||
"dtypes: bool(1), object(2)\n",
|
||
"memory usage: 202.8+ MB\n",
|
||
"None\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Re-open the store read-only and summarise the three columns of interest.\n",
"# Only those columns are read, rather than loading the full table first.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store.select(key, columns=['ts_code', 'trade_date', 'is_st'])\n",
"\n",
"# info() prints its report itself and returns None, so it is not wrapped in print().\n",
"df.info()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|