433 lines
17 KiB
Plaintext
433 lines
17 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:57:36.913044Z",
|
||
"start_time": "2025-04-09T14:57:36.159612Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
"import os\n",
"\n",
"import tushare as ts\n",
"\n",
"# The API token is a credential and must not be stored in the notebook.\n",
"# Export TUSHARE_TOKEN in the environment before starting the kernel.\n",
"# NOTE(review): the token that was previously committed here should be revoked.\n",
"tushare_token = os.environ.get('TUSHARE_TOKEN')\n",
"if not tushare_token:\n",
"    raise RuntimeError('TUSHARE_TOKEN environment variable is not set')\n",
"\n",
"ts.set_token(tushare_token)\n",
"pro = ts.pro_api()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "14671a7f72de2564",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:57:39.128278Z",
|
||
"start_time": "2025-04-09T14:57:36.918051Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from datetime import datetime\n",
|
||
"import pandas as pd\n",
|
||
"import warnings\n",
|
||
"\n",
|
||
"warnings.filterwarnings(\"ignore\")\n",
|
"def filter_rows(df):\n",
"    \"\"\"Keep one row per (name, start_date) pair, preferring rows that have an end_date.\n",
"\n",
"    Within each (name, start_date) group the first row whose end_date is set\n",
"    is kept; if no row of the group has an end_date, the group's first row is\n",
"    kept. Rows whose name or start_date is missing are discarded. The result\n",
"    is ordered by (name, start_date) and gets a fresh 0..n-1 index.\n",
"\n",
"    This uses a stable sort plus drop_duplicates instead of groupby().apply():\n",
"    applying over the grouping columns is deprecated in pandas 2.2, and this\n",
"    form keeps every column and its dtype and also works on an empty frame.\n",
"\n",
"    Args:\n",
"        df: DataFrame with at least 'name', 'start_date' and 'end_date' columns.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the same columns as df.\n",
"    \"\"\"\n",
"    keys = ['name', 'start_date']\n",
"    # groupby() ignores rows whose key is missing, so drop them here as well.\n",
"    keyed = df.dropna(subset=keys)\n",
"    # Stable ordering that moves rows with an end_date to the front while keeping\n",
"    # the original relative order. Positions are used because labels may repeat.\n",
"    missing_end = keyed['end_date'].isna().reset_index(drop=True)\n",
"    valid_first = keyed.iloc[missing_end.sort_values(kind='stable').index]\n",
"    picked = valid_first.drop_duplicates(subset=keys, keep='first')\n",
"    return picked.sort_values(keys).reset_index(drop=True)\n",
|
||
"\n",
|
"def is_st(name_change_dict, stock_code, target_date):\n",
"    \"\"\"Tell whether a stock falls inside one of its recorded periods on a given date.\n",
"\n",
"    Args:\n",
"        name_change_dict: Mapping of stock code -> DataFrame of periods with\n",
"            'start_date' and 'end_date' datetime columns.\n",
"        stock_code: Code to look up in name_change_dict.\n",
"        target_date: Date string in YYYYMMDD form.\n",
"\n",
"    Returns:\n",
"        True if target_date lies within any [start_date, end_date] period of the\n",
"        stock, False otherwise (including when the code has no entry). A period\n",
"        without an end_date is treated as running until the current time.\n",
"\n",
"    Raises:\n",
"        ValueError: If target_date does not match the YYYYMMDD format.\n",
"    \"\"\"\n",
"    target = datetime.strptime(target_date, '%Y%m%d')\n",
"    if stock_code not in name_change_dict:\n",
"        return False\n",
"\n",
"    periods = name_change_dict[stock_code]\n",
"    # Columns are read by name so the result does not depend on column order.\n",
"    for period_start, period_end in zip(periods['start_date'], periods['end_date']):\n",
"        # pd.isna covers None, NaT and NaN alike.\n",
"        if pd.isna(period_end):\n",
"            period_end = datetime.now()\n",
"        started = (target - period_start).days >= 0\n",
"        not_ended = (target - period_end).days <= 0\n",
"        if started and not_ended:\n",
"            return True\n",
"    return False\n",
|
||
"\n",
|
"# Load the name-change history and drop exact duplicate rows.\n",
"name_change_df = pd.read_hdf('/mnt/d/PyProject/NewStock/data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# Parse the date columns. An end_date that is missing or unparseable becomes NaT.\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"\n",
"# Keep only the records whose name contains 'ST' or '退'. na=False treats a\n",
"# missing name as no match; without it a NaN in the mask makes the row\n",
"# selection raise.\n",
"flagged_names = name_change_df['name'].str.contains('ST|退', na=False)\n",
"\n",
"# Map each ts_code that has flagged records to its de-duplicated periods.\n",
"name_change_dict = {\n",
"    ts_code: filter_rows(group)\n",
"    for ts_code, group in name_change_df[flagged_names].groupby('ts_code')\n",
"}"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "e7f8cce2f80e2f20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:09.296046Z",
|
||
"start_time": "2025-04-09T14:57:39.339423Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 9155905 entries, 0 to 27115\n",
|
||
"Data columns (total 2 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
"dtypes: object(2)\n",
|
||
"memory usage: 209.6+ MB\n",
|
||
"None\n",
|
||
"20250926\n",
|
||
"20250929\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import time\n",
|
||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||
"\n",
|
"h5_filename = '/mnt/d/PyProject/NewStock/data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"\n",
"# Find the latest trade date already in the store so only newer days are fetched.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store[key][['ts_code', 'trade_date']]\n",
"    # DataFrame.info() prints its report itself and returns None, so it is not\n",
"    # wrapped in print().\n",
"    df.info()\n",
"    max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"\n",
"# Ask for the calendar up to today; a fixed end date stops picking up new days.\n",
"calendar_end = time.strftime('%Y%m%d')\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date=calendar_end)\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1]  # trading days only\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"\n",
"# An empty list means the store is already up to date; min() would raise on it.\n",
"start_date = min(trade_dates) if trade_dates else None\n",
"print(start_date)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.817010Z",
|
||
"start_time": "2025-04-09T14:58:09.326485Z"
|
||
},
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"任务 20251017 完成\n",
|
||
"任务 20251020 完成\n",
|
||
"任务 20251015 完成\n",
|
||
"任务 20251016 完成\n",
|
||
"任务 20251014 完成\n",
|
||
"任务 20251013 完成\n",
|
||
"任务 20251010 完成\n",
|
||
"任务 20251009 完成\n",
|
||
"任务 20250930 完成\n",
|
||
"任务 20250929 完成\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"\n",
|
"# One DataFrame per fetched trade date is collected here.\n",
"all_daily_data = []\n",
"\n",
"# Rate-limit bookkeeping: API calls counted so far and the wall-clock start\n",
"# of the current batch.\n",
"api_call_count, batch_start_time = 0, time.time()\n",
|
||
"\n",
|
||
"\n",
|
"def get_data(trade_date):\n",
"    \"\"\"Fetch the daily_basic data for one trade date and flag the ST rows.\n",
"\n",
"    Args:\n",
"        trade_date: Trade date string in YYYYMMDD form (the format is_st parses).\n",
"\n",
"    Returns:\n",
"        Whatever pro.daily_basic returned. When that is a non-empty DataFrame,\n",
"        its 'trade_date' column is set to trade_date and a boolean 'is_st'\n",
"        column is added.\n",
"    \"\"\"\n",
"    daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
"    if daily_basic_data is not None and not daily_basic_data.empty:\n",
"        # Stamp every row with the requested trade date.\n",
"        daily_basic_data['trade_date'] = trade_date\n",
"        # All rows share the same date, so only the stock code varies per lookup;\n",
"        # a plain comprehension avoids building a Series for every row.\n",
"        daily_basic_data['is_st'] = [\n",
"            is_st(name_change_dict, ts_code, trade_date)\n",
"            for ts_code in daily_basic_data['ts_code']\n",
"        ]\n",
"    # Pause after every request, including empty answers.\n",
"    # NOTE(review): presumably this throttles API usage - confirm the quota.\n",
"    time.sleep(0.2)\n",
"    return daily_basic_data\n",
|
||
"\n",
|
||
"\n",
|
||
"# 遍历每个交易日期并获取数据\n",
|
"# Requests allowed per rate-limit window and the window length in seconds.\n",
"# NOTE(review): 150 is the value the code used before; confirm it against the\n",
"# quota of the account in use.\n",
"CALLS_PER_WINDOW = 150\n",
"WINDOW_SECONDS = 60\n",
"\n",
"# Submit the dates one window at a time. If every date is submitted up front,\n",
"# sleeping while results are collected does not slow the worker threads down,\n",
"# so the wait has to happen between submissions.\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
"    for window_start in range(0, len(trade_dates), CALLS_PER_WINDOW):\n",
"        window_dates = trade_dates[window_start:window_start + CALLS_PER_WINDOW]\n",
"        batch_start_time = time.time()\n",
"        future_to_date = {executor.submit(get_data, td): td for td in window_dates}\n",
"\n",
"        for future in as_completed(future_to_date):\n",
"            trade_date = future_to_date[future]  # date this future was submitted for\n",
"            try:\n",
"                result = future.result()  # re-raises an exception from the worker\n",
"            except Exception as e:\n",
"                print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
"            else:\n",
"                # get_data may hand back None; keep it out of the result list.\n",
"                if result is not None:\n",
"                    all_daily_data.append(result)\n",
"                print(f\"任务 {trade_date} 完成\")\n",
"            # One API call per submitted date, whether it succeeded or failed.\n",
"            api_call_count += 1\n",
"\n",
"        # If more dates remain, wait out the rest of the window before submitting.\n",
"        more_pending = window_start + CALLS_PER_WINDOW < len(trade_dates)\n",
"        elapsed = time.time() - batch_start_time\n",
"        if more_pending and elapsed < WINDOW_SECONDS:\n",
"            sleep_time = WINDOW_SECONDS - elapsed\n",
"            print(f\"已调用 {CALLS_PER_WINDOW} 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
"            time.sleep(sleep_time)\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "919023c693d7a47a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.864178Z",
|
||
"start_time": "2025-04-09T14:58:16.855084Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"0 600642.SH 20251010 8.03 0.4806 1.3835 \n",
|
||
"1 600295.SH 20251010 10.76 0.8549 3.7056 \n",
|
||
"2 600444.SH 20251010 19.00 9.6611 17.4605 \n",
|
||
"3 605100.SH 20251010 28.72 3.4770 7.6902 \n",
|
||
"4 301399.SZ 20251010 19.53 3.9562 4.6772 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"21679 600653.SH 20250929 2.13 2.1746 2.9589 \n",
|
||
"21680 002344.SZ 20250929 4.49 1.7080 3.6338 \n",
|
||
"21681 301162.SZ 20250929 60.30 2.8491 3.5744 \n",
|
||
"21682 920077.BJ 20250929 14.43 1.1113 1.6410 \n",
|
||
"21683 300283.SZ 20250929 7.04 4.8583 5.7018 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"0 1.49 9.9635 10.2617 1.1073 1.3268 1.3600 4.9816 \n",
|
||
"1 1.56 16.3053 16.4683 1.4839 1.0603 1.1230 7.4349 \n",
|
||
"2 2.84 69.2746 55.7147 3.8398 3.6313 3.5392 0.5263 \n",
|
||
"3 0.55 66.7896 123.2961 2.7276 5.3634 6.7180 2.0794 \n",
|
||
"4 0.94 60.7990 75.8958 2.7675 6.8812 7.1828 1.2177 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"21679 0.72 107.4073 227.6354 5.4498 0.9887 0.9724 0.0000 \n",
|
||
"21680 0.70 64.8238 75.9239 0.6834 5.5516 5.5560 0.9577 \n",
|
||
"21681 0.96 85.4251 76.2427 5.3380 14.5424 12.3677 0.5586 \n",
|
||
"21682 0.51 90.3399 82.4861 3.3572 5.2895 4.1636 NaN \n",
|
||
"21683 0.94 NaN NaN 3.2821 1.1161 0.9970 0.2499 \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv \\\n",
|
||
"0 5.6040 489407.9376 489381.3156 170006.8520 3.929946e+06 \n",
|
||
"1 5.5762 279877.6254 197557.6254 45577.9458 3.011483e+06 \n",
|
||
"2 0.5789 14642.1932 14642.1932 8101.7360 2.782017e+05 \n",
|
||
"3 1.0446 17113.2000 16993.2000 7683.2000 4.914911e+05 \n",
|
||
"4 1.0594 18502.0000 5468.3586 4625.5000 3.613441e+05 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"21679 NaN 194638.0317 194638.0317 143048.5612 4.145790e+05 \n",
|
||
"21680 0.8463 128261.6960 128145.0092 60233.0025 5.758950e+05 \n",
|
||
"21681 0.9704 13258.3724 8522.5548 6793.1764 7.994799e+05 \n",
|
||
"21682 NaN 58768.1817 31695.6817 21464.7599 8.480249e+05 \n",
|
||
"21683 NaN 49697.8222 36721.8502 31289.2680 3.498727e+05 \n",
|
||
"\n",
|
||
" circ_mv is_st \n",
|
||
"0 3.929732e+06 False \n",
|
||
"1 2.125720e+06 False \n",
|
||
"2 2.782017e+05 False \n",
|
||
"3 4.880447e+05 False \n",
|
||
"4 1.067970e+05 False \n",
|
||
"... ... ... \n",
|
||
"21679 4.145790e+05 False \n",
|
||
"21680 5.753711e+05 False \n",
|
||
"21681 5.139101e+05 False \n",
|
||
"21682 4.573687e+05 False \n",
|
||
"21683 2.585218e+05 False \n",
|
||
"\n",
|
||
"[21684 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Stack the per-date frames row-wise into one table with a fresh 0..n-1 index.\n",
"all_daily_data_df = pd.concat(objs=all_daily_data, axis=0, ignore_index=True)\n",
"print(all_daily_data_df)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "28cb78d032671b20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:16.881685Z",
|
||
"start_time": "2025-04-09T14:58:16.871184Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"9 300313.SZ 20251010 8.84 3.1146 6.4625 \n",
|
||
"20 603838.SH 20251010 7.80 0.5503 1.5146 \n",
|
||
"29 603813.SH 20251010 24.06 1.5835 4.5173 \n",
|
||
"48 002742.SZ 20251010 4.65 1.0473 1.2924 \n",
|
||
"69 603559.SH 20251010 8.50 0.2072 0.2945 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"21466 603021.SH 20250929 4.62 1.3860 2.3418 \n",
|
||
"21552 300020.SZ 20250929 3.58 1.5031 1.6828 \n",
|
||
"21554 000506.SZ 20250929 10.88 10.5560 15.7565 \n",
|
||
"21603 600636.SH 20250929 8.29 0.4693 0.7963 \n",
|
||
"21661 603843.SH 20250929 5.17 0.3798 0.5364 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio dv_ttm \\\n",
|
||
"9 1.30 NaN NaN NaN 20.1067 20.9731 0.0000 NaN \n",
|
||
"20 0.57 NaN NaN 2.6121 8.7517 6.9304 0.0000 NaN \n",
|
||
"29 1.88 NaN NaN 4.5222 8.4776 7.5124 1.0313 NaN \n",
|
||
"48 1.28 NaN NaN NaN 1.6800 2.1226 0.0000 NaN \n",
|
||
"69 0.60 NaN NaN 3.5043 9.5964 8.2315 0.0000 NaN \n",
|
||
"... ... .. ... ... ... ... ... ... \n",
|
||
"21466 0.80 NaN NaN NaN 3.5891 3.7851 0.0000 NaN \n",
|
||
"21552 1.00 NaN NaN 0.9812 5.1924 18.4036 0.0000 NaN \n",
|
||
"21554 3.17 NaN NaN 16.4257 30.3341 23.4860 0.0000 NaN \n",
|
||
"21603 0.81 NaN NaN 1.7909 12.8512 11.0116 0.4825 0.6031 \n",
|
||
"21661 0.05 NaN NaN 12.5612 2.6558 3.1369 0.0000 NaN \n",
|
||
"\n",
|
||
" total_share float_share free_share total_mv circ_mv is_st \n",
|
||
"9 31297.7396 19735.2789 9511.5479 2.766720e+05 1.744599e+05 True \n",
|
||
"20 32001.6000 32001.6000 11627.0468 2.496125e+05 2.496125e+05 True \n",
|
||
"29 10501.5000 10501.5000 3681.2000 2.526661e+05 2.526661e+05 True \n",
|
||
"48 43200.0000 43185.8082 34994.8239 2.008800e+05 2.008140e+05 True \n",
|
||
"69 40127.6979 40127.6979 28231.9697 3.410854e+05 3.410854e+05 True \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"21466 31994.8070 31994.8070 18936.7934 1.478160e+05 1.478160e+05 True \n",
|
||
"21552 79467.7974 76663.9584 68475.6577 2.844947e+05 2.744570e+05 True \n",
|
||
"21554 92901.7761 92858.4361 62210.1427 1.010771e+06 1.010300e+06 True \n",
|
||
"21603 43863.6802 43863.6802 25849.6552 3.636299e+05 3.636299e+05 True \n",
|
||
"21661 69962.3237 69962.3237 49541.4702 3.617052e+05 3.617052e+05 True \n",
|
||
"\n",
|
||
"[749 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Show only the rows whose is_st flag is True.\n",
"print(all_daily_data_df.loc[all_daily_data_df['is_st']])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "692b58674b7462c9",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:17.773453Z",
|
||
"start_time": "2025-04-09T14:58:16.903459Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"所有每日基础数据获取并保存完毕!\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Append the fetched rows to the 'daily_basic' table of the HDF5 file.\n",
"# With append=True, running this cell again writes the same rows again.\n",
"all_daily_data_df.to_hdf(\n",
"    h5_filename,\n",
"    key='daily_basic',\n",
"    mode='a',\n",
"    format='table',\n",
"    append=True,\n",
"    data_columns=True,\n",
")\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "d7a773fc20293477",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-04-09T14:58:24.305403Z",
|
||
"start_time": "2025-04-09T14:58:17.816332Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 9177589 entries, 0 to 21683\n",
|
||
"Data columns (total 3 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
" 2 is_st bool \n",
|
||
"dtypes: bool(1), object(2)\n",
|
||
"memory usage: 218.8+ MB\n",
|
||
"None\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Read the key columns back from the store to check the appended data.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
"    # DataFrame.info() prints its report itself and returns None; wrapping it\n",
"    # in print() only adds a stray None line to the output.\n",
"    df.info()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "stock",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.2"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|