489 lines
19 KiB
Plaintext
489 lines
19 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:57:36.913044Z",
|
||
"start_time": "2025-04-09T14:57:36.159612Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import tushare as ts\n",
|
||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||
"pro = ts.pro_api()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "14671a7f72de2564",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:57:39.128278Z",
|
||
"start_time": "2025-04-09T14:57:36.918051Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from datetime import datetime\n",
|
||
"import pandas as pd\n",
|
||
"import warnings\n",
|
||
"\n",
|
||
"warnings.filterwarnings(\"ignore\")\n",
|
||
"def filter_rows(df):\n",
|
||
" # 按照 name 和 start_date 分组\n",
|
||
" def select_row(group):\n",
|
||
" # 如果有 end_date 不为 NaT 的行,优先保留这些行\n",
|
||
" valid_rows = group[group['end_date'].notna()]\n",
|
||
" if not valid_rows.empty:\n",
|
||
" return valid_rows.iloc[0] # 返回第一个有效行\n",
|
||
" else:\n",
|
||
" return group.iloc[0] # 如果没有有效行,返回第一行\n",
|
||
"\n",
|
||
" filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n",
|
||
" filtered_df = filtered_df.reset_index(drop=True)\n",
|
||
" return filtered_df\n",
|
||
"\n",
|
||
"def is_st(name_change_dict, stock_code, target_date):\n",
|
||
" target_date = datetime.strptime(target_date, '%Y%m%d')\n",
|
||
" if stock_code not in name_change_dict.keys():\n",
|
||
" return False\n",
|
||
" df = name_change_dict[stock_code]\n",
|
||
" for i in range(len(df)):\n",
|
||
" sds = df.iloc[i, 2]\n",
|
||
" eds = df.iloc[i, 3]\n",
|
||
" if eds is None or eds is pd.NaT:\n",
|
||
" eds = datetime.now()\n",
|
||
" if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
|
||
" return True\n",
|
||
" return False\n",
|
||
"\n",
|
||
"name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n",
|
||
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
|
||
"\n",
|
||
"# 确保 name_change_df 的日期格式正确\n",
|
||
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
|
||
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
|
||
"# name_change_df = name_change_df[name_change_df.name.str.contains('ST') ]\n",
|
||
"name_change_dict = {}\n",
|
||
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
|
||
" # 只保留 'ST' 和 '*ST' 的记录\n",
|
||
" # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
|
||
" st_data = group[(group['name'].str.contains('ST')) | (group['name'].str.contains('退'))]\n",
|
||
" if not st_data.empty:\n",
|
||
" name_change_dict[ts_code] = filter_rows(st_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "e7f8cce2f80e2f20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:09.296046Z",
|
||
"start_time": "2025-04-09T14:57:39.339423Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 8599138 entries, 0 to 8599137\n",
|
||
"Data columns (total 2 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
"dtypes: object(2)\n",
|
||
"memory usage: 196.8+ MB\n",
|
||
"None\n",
|
||
"20250430\n",
|
||
"20250506\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import time\n",
|
||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||
"\n",
|
||
"h5_filename = '../../../data/daily_basic.h5'\n",
|
||
"key = '/daily_basic'\n",
|
||
"max_date = None\n",
|
||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||
" df = store[key][['ts_code', 'trade_date']]\n",
|
||
" print(df.info())\n",
|
||
" max_date = df['trade_date'].max()\n",
|
||
"\n",
|
||
"print(max_date)\n",
|
||
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250720')\n",
|
||
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
|
||
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
|
||
"start_date = min(trade_dates)\n",
|
||
"print(start_date)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.817010Z",
|
||
"start_time": "2025-04-09T14:58:09.326485Z"
|
||
},
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"任务 20250718 完成\n",
|
||
"任务 20250717 完成\n",
|
||
"任务 20250716 完成\n",
|
||
"任务 20250715 完成\n",
|
||
"任务 20250714 完成\n",
|
||
"任务 20250711 完成\n",
|
||
"任务 20250710 完成\n",
|
||
"任务 20250709 完成\n",
|
||
"任务 20250708 完成\n",
|
||
"任务 20250707 完成\n",
|
||
"任务 20250704 完成\n",
|
||
"任务 20250703 完成\n",
|
||
"任务 20250702 完成\n",
|
||
"任务 20250701 完成\n",
|
||
"任务 20250630 完成\n",
|
||
"任务 20250627 完成\n",
|
||
"任务 20250626 完成\n",
|
||
"任务 20250625 完成\n",
|
||
"任务 20250624 完成\n",
|
||
"任务 20250623 完成\n",
|
||
"任务 20250620 完成\n",
|
||
"任务 20250619 完成\n",
|
||
"任务 20250618 完成\n",
|
||
"任务 20250617 完成\n",
|
||
"任务 20250616 完成\n",
|
||
"任务 20250613 完成\n",
|
||
"任务 20250612 完成\n",
|
||
"任务 20250611 完成\n",
|
||
"任务 20250610 完成\n",
|
||
"任务 20250609 完成\n",
|
||
"任务 20250606 完成\n",
|
||
"任务 20250605 完成\n",
|
||
"任务 20250604 完成\n",
|
||
"任务 20250603 完成\n",
|
||
"任务 20250529 完成\n",
|
||
"任务 20250530 完成\n",
|
||
"任务 20250527 完成\n",
|
||
"任务 20250528 完成\n",
|
||
"任务 20250526 完成\n",
|
||
"任务 20250523 完成\n",
|
||
"任务 20250521 完成\n",
|
||
"任务 20250522 完成\n",
|
||
"任务 20250520 完成\n",
|
||
"任务 20250519 完成\n",
|
||
"任务 20250516 完成\n",
|
||
"任务 20250515 完成\n",
|
||
"任务 20250514 完成\n",
|
||
"任务 20250513 完成\n",
|
||
"任务 20250512 完成\n",
|
||
"任务 20250509 完成\n",
|
||
"任务 20250508 完成\n",
|
||
"任务 20250507 完成\n",
|
||
"任务 20250506 完成\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"\n",
|
||
"# 使用 HDFStore 存储数据\n",
|
||
"all_daily_data = []\n",
|
||
"\n",
|
||
"# API 调用计数和时间控制变量\n",
|
||
"api_call_count = 0\n",
|
||
"batch_start_time = time.time()\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_data(trade_date):\n",
|
||
" daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
|
||
" if daily_basic_data is not None and not daily_basic_data.empty:\n",
|
||
" # 添加交易日期列标识\n",
|
||
" daily_basic_data['trade_date'] = trade_date\n",
|
||
" daily_basic_data['is_st'] = daily_basic_data.apply(\n",
|
||
" lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n",
|
||
" )\n",
|
||
" time.sleep(0.2)\n",
|
||
" # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
|
||
" return daily_basic_data\n",
|
||
"\n",
|
||
"\n",
|
||
"# 遍历每个交易日期并获取数据\n",
|
||
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
|
||
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
|
||
"\n",
|
||
" for future in as_completed(future_to_date):\n",
|
||
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
|
||
" try:\n",
|
||
" result = future.result() # 获取任务执行的结果\n",
|
||
" all_daily_data.append(result)\n",
|
||
" print(f\"任务 {trade_date} 完成\")\n",
|
||
" except Exception as e:\n",
|
||
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
|
||
" # 计数一次 API 调用\n",
|
||
" api_call_count += 1\n",
|
||
"\n",
|
||
" # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n",
|
||
" if api_call_count % 150 == 0:\n",
|
||
" elapsed = time.time() - batch_start_time\n",
|
||
" if elapsed < 60:\n",
|
||
" sleep_time = 60 - elapsed\n",
|
||
" print(f\"已调用 150 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
|
||
" time.sleep(sleep_time)\n",
|
||
" # 重置批次起始时间\n",
|
||
" batch_start_time = time.time()\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "919023c693d7a47a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.864178Z",
|
||
"start_time": "2025-04-09T14:58:16.855084Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"0 301261.SZ 20250507 97.25 15.5042 19.6511 \n",
|
||
"1 002643.SZ 20250507 11.12 1.3481 2.3303 \n",
|
||
"2 001211.SZ 20250507 22.11 3.5506 6.1239 \n",
|
||
"3 002466.SZ 20250507 28.98 1.0588 1.5771 \n",
|
||
"4 603005.SH 20250507 29.32 5.1961 6.1690 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"10769 000551.SZ 20250506 12.39 2.0213 3.1432 \n",
|
||
"10770 600792.SH 20250506 3.17 0.8036 2.3531 \n",
|
||
"10771 300176.SZ 20250506 6.62 1.7530 2.5325 \n",
|
||
"10772 000016.SZ 20250506 5.57 13.9545 20.7669 \n",
|
||
"10773 300339.SZ 20250506 56.53 11.3184 11.9579 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"0 0.84 122.6810 146.2352 5.5730 8.2774 8.3189 0.4627 \n",
|
||
"1 0.79 41.9902 45.3885 1.4569 2.8000 2.8594 2.6982 \n",
|
||
"2 0.83 56.0080 58.9563 1.8078 1.1637 1.1399 0.0000 \n",
|
||
"3 0.92 NaN NaN 1.1380 3.6409 3.6410 4.6569 \n",
|
||
"4 1.35 75.6520 71.1174 4.4020 16.9225 16.2060 0.1570 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"10769 1.20 19.9692 18.7030 1.8602 1.1939 1.1927 0.5650 \n",
|
||
"10770 0.89 NaN NaN 1.1995 0.5271 0.5777 2.1767 \n",
|
||
"10771 1.12 92.1443 96.5538 2.7208 1.4839 1.4627 0.0000 \n",
|
||
"10772 3.66 NaN NaN 5.6643 1.2067 1.1979 0.0000 \n",
|
||
"10773 2.40 279.4392 270.1037 12.8967 13.2445 13.0061 0.0000 \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv \\\n",
|
||
"0 0.4627 8789.0196 3748.3321 2957.3203 8.547322e+05 \n",
|
||
"1 2.6982 92996.9005 90932.5570 52604.5851 1.034126e+06 \n",
|
||
"2 NaN 7200.0000 6699.6575 3884.4502 1.591920e+05 \n",
|
||
"3 4.6569 164122.1583 147584.5634 99084.9325 4.756260e+06 \n",
|
||
"4 0.1570 65217.1706 65217.1706 54932.1940 1.912167e+06 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"10769 0.5650 40394.4205 40263.2044 25893.0990 5.004869e+05 \n",
|
||
"10770 2.1767 110992.3600 105986.8113 36194.3684 3.518458e+05 \n",
|
||
"10771 NaN 38728.0800 38728.0800 26808.2764 2.563799e+05 \n",
|
||
"10772 NaN 240794.5408 159659.3800 107284.6868 1.341226e+06 \n",
|
||
"10773 NaN 79641.0841 77768.6667 73609.4256 4.502110e+06 \n",
|
||
"\n",
|
||
" circ_mv is_st \n",
|
||
"0 3.645253e+05 False \n",
|
||
"1 1.011170e+06 False \n",
|
||
"2 1.481294e+05 False \n",
|
||
"3 4.277001e+06 False \n",
|
||
"4 1.912167e+06 False \n",
|
||
"... ... ... \n",
|
||
"10769 4.988611e+05 False \n",
|
||
"10770 3.359782e+05 False \n",
|
||
"10771 2.563799e+05 False \n",
|
||
"10772 8.893027e+05 False \n",
|
||
"10773 4.396263e+06 False \n",
|
||
"\n",
|
||
"[10774 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
|
||
"print(all_daily_data_df)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "28cb78d032671b20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.881685Z",
|
||
"start_time": "2025-04-09T14:58:16.871184Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"8 300147.SZ 20250507 6.58 5.3209 6.8857 \n",
|
||
"19 002501.SZ 20250507 2.10 2.8874 3.7273 \n",
|
||
"52 600238.SH 20250507 4.55 11.2843 13.8699 \n",
|
||
"63 300391.SZ 20250507 5.58 5.5505 7.0395 \n",
|
||
"73 600421.SH 20250507 4.99 2.8571 6.1511 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"10647 600243.SH 20250506 2.43 6.7484 8.1172 \n",
|
||
"10652 002528.SZ 20250506 2.35 2.0592 4.3961 \n",
|
||
"10682 300044.SZ 20250506 3.31 12.8866 13.4490 \n",
|
||
"10712 300097.SZ 20250506 4.36 2.5814 3.0107 \n",
|
||
"10733 600200.SH 20250506 3.04 0.2013 0.2433 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"8 1.62 NaN NaN 4.4991 2.3410 2.5434 0.0 \n",
|
||
"19 1.28 NaN NaN 22.7988 22.3498 26.2757 0.0 \n",
|
||
"52 2.57 NaN NaN 20.0224 11.6394 12.3461 0.0 \n",
|
||
"63 1.35 NaN NaN NaN 17.5129 12.5138 0.0 \n",
|
||
"73 0.80 NaN NaN 135.5854 8.3301 8.4697 0.0 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"10647 0.73 NaN NaN 1.6685 4.5071 4.6210 0.0 \n",
|
||
"10652 1.52 NaN NaN 15.5269 2.9812 3.6083 0.0 \n",
|
||
"10682 2.91 NaN NaN 24.3171 17.6463 26.1361 0.0 \n",
|
||
"10712 0.99 NaN NaN 2.7137 3.2758 3.8102 0.0 \n",
|
||
"10733 0.05 30.7156 NaN 1.2351 1.3543 1.7858 0.0 \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv \\\n",
|
||
"8 NaN 66127.9045 65745.9042 50804.9121 435121.6116 \n",
|
||
"19 NaN 355000.0000 354999.9006 274999.9006 745500.0000 \n",
|
||
"52 NaN 44820.0000 44500.1580 36204.3908 203931.0000 \n",
|
||
"63 NaN 35033.6112 35033.6112 27623.1259 195487.5505 \n",
|
||
"73 NaN 19560.0000 19560.0000 9085.2748 97604.4000 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"10647 NaN 43885.0000 43885.0000 36485.0000 106640.5500 \n",
|
||
"10652 NaN 119867.5082 104974.0608 49171.2582 281688.6443 \n",
|
||
"10682 NaN 76386.9228 76375.7508 73182.1277 252840.7145 \n",
|
||
"10712 NaN 28854.9669 27000.9948 23150.5534 125807.6557 \n",
|
||
"10733 NaN 71215.1832 71087.9480 58808.3718 216494.1569 \n",
|
||
"\n",
|
||
" circ_mv is_st \n",
|
||
"8 432608.0496 True \n",
|
||
"19 745499.7913 True \n",
|
||
"52 202475.7189 True \n",
|
||
"63 195487.5505 True \n",
|
||
"73 97604.4000 True \n",
|
||
"... ... ... \n",
|
||
"10647 106640.5500 True \n",
|
||
"10652 246689.0429 True \n",
|
||
"10682 252803.7351 True \n",
|
||
"10712 117724.3373 True \n",
|
||
"10733 216107.3619 True \n",
|
||
"\n",
|
||
"[394 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(all_daily_data_df[all_daily_data_df['is_st']])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "692b58674b7462c9",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:17.773453Z",
|
||
"start_time": "2025-04-09T14:58:16.903459Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"所有每日基础数据获取并保存完毕!\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 将数据保存为 HDF5 文件(table 格式)\n",
|
||
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
|
||
"\n",
|
||
"print(\"所有每日基础数据获取并保存完毕!\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "d7a773fc20293477",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:24.305403Z",
|
||
"start_time": "2025-04-09T14:58:17.816332Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 8609912 entries, 0 to 10773\n",
|
||
"Data columns (total 3 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
" 2 is_st bool \n",
|
||
"dtypes: bool(1), object(2)\n",
|
||
"memory usage: 205.3+ MB\n",
|
||
"None\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
|
||
" print(df.info())"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "new_trader",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|