Files
NewStock/code/data/update/update_daily_basic.ipynb
2025-04-10 23:17:22 +08:00

443 lines
17 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:57:36.913044Z",
"start_time": "2025-04-09T14:57:36.159612Z"
}
},
"source": [
"import tushare as ts\n",
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
"pro = ts.pro_api()"
],
"outputs": [],
"execution_count": 1
},
{
"cell_type": "code",
"id": "14671a7f72de2564",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:57:39.128278Z",
"start_time": "2025-04-09T14:57:36.918051Z"
}
},
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"def filter_rows(df):\n",
" # 按照 name 和 start_date 分组\n",
" def select_row(group):\n",
" # 如果有 end_date 不为 NaT 的行,优先保留这些行\n",
" valid_rows = group[group['end_date'].notna()]\n",
" if not valid_rows.empty:\n",
" return valid_rows.iloc[0] # 返回第一个有效行\n",
" else:\n",
" return group.iloc[0] # 如果没有有效行,返回第一行\n",
"\n",
" filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n",
" filtered_df = filtered_df.reset_index(drop=True)\n",
" return filtered_df\n",
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
" target_date = datetime.strptime(target_date, '%Y%m%d')\n",
" if stock_code not in name_change_dict.keys():\n",
" return False\n",
" df = name_change_dict[stock_code]\n",
" for i in range(len(df)):\n",
" sds = df.iloc[i, 2]\n",
" eds = df.iloc[i, 3]\n",
" if eds is None or eds is pd.NaT:\n",
" eds = datetime.now()\n",
" if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
" return True\n",
" return False\n",
"\n",
"name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# 确保 name_change_df 的日期格式正确\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"name_change_df = name_change_df[name_change_df.name.str.contains('ST')]\n",
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
" # 只保留 'ST' 和 '*ST' 的记录\n",
" st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
" if not st_data.empty:\n",
" name_change_dict[ts_code] = filter_rows(st_data)"
],
"outputs": [],
"execution_count": 2
},
{
"cell_type": "code",
"id": "e7f8cce2f80e2f20",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:09.296046Z",
"start_time": "2025-04-09T14:57:39.339423Z"
}
},
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"h5_filename = '../../../data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"max_date = None\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date']]\n",
" print(df.info())\n",
" max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250420')\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8512911 entries, 0 to 5391\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 194.8+ MB\n",
"None\n",
"20250408\n",
"20250409\n"
]
}
],
"execution_count": 3
},
{
"cell_type": "code",
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"scrolled": true,
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.817010Z",
"start_time": "2025-04-09T14:58:09.326485Z"
}
},
"source": [
"\n",
"\n",
"# 使用 HDFStore 存储数据\n",
"all_daily_data = []\n",
"\n",
"# API 调用计数和时间控制变量\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
" daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
" if daily_basic_data is not None and not daily_basic_data.empty:\n",
" # 添加交易日期列标识\n",
" daily_basic_data['trade_date'] = trade_date\n",
" daily_basic_data['is_st'] = daily_basic_data.apply(\n",
" lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n",
" )\n",
" time.sleep(0.2)\n",
" # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
" return daily_basic_data\n",
"\n",
"\n",
"# 遍历每个交易日期并获取数据\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
"\n",
" for future in as_completed(future_to_date):\n",
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
" try:\n",
" result = future.result() # 获取任务执行的结果\n",
" all_daily_data.append(result)\n",
" print(f\"任务 {trade_date} 完成\")\n",
" except Exception as e:\n",
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
" # 计数一次 API 调用\n",
" api_call_count += 1\n",
"\n",
" # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n",
" if api_call_count % 150 == 0:\n",
" elapsed = time.time() - batch_start_time\n",
" if elapsed < 60:\n",
" sleep_time = 60 - elapsed\n",
" print(f\"已调用 150 次 API等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
" time.sleep(sleep_time)\n",
" # 重置批次起始时间\n",
" batch_start_time = time.time()\n",
"\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20250418 完成\n",
"任务 20250417 完成\n",
"任务 20250416 完成\n",
"任务 20250415 完成\n",
"任务 20250414 完成\n",
"任务 20250411 完成\n",
"任务 20250410 完成\n",
"任务 20250409 完成\n"
]
}
],
"execution_count": 4
},
{
"cell_type": "code",
"id": "919023c693d7a47a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.864178Z",
"start_time": "2025-04-09T14:58:16.855084Z"
}
},
"source": [
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 300285.SZ 20250409 16.61 2.1086 2.2506 \n",
"1 300458.SZ 20250409 44.48 9.9286 11.7046 \n",
"2 605090.SH 20250409 23.81 0.6834 1.1888 \n",
"3 688686.SH 20250409 69.52 1.6005 5.7492 \n",
"4 002057.SZ 20250409 7.18 4.7461 7.1088 \n",
"... ... ... ... ... ... \n",
"5390 301511.SZ 20250409 12.23 3.4040 4.6900 \n",
"5391 688355.SH 20250409 15.84 1.4154 4.4898 \n",
"5392 600019.SH 20250409 6.83 0.4729 1.2898 \n",
"5393 603507.SH 20250409 22.00 30.8936 42.4775 \n",
"5394 600886.SH 20250409 14.58 0.7795 2.4989 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 1.11 29.0985 27.1266 2.5144 4.2913 4.1010 0.6020 \n",
"1 1.54 168.9309 168.9309 9.3966 12.3119 12.3119 0.3364 \n",
"2 1.00 11.8377 9.0427 1.7135 0.5819 0.6421 3.2226 \n",
"3 1.18 43.8690 61.1222 2.9105 9.0031 9.2377 NaN \n",
"4 1.35 19.8304 29.3370 1.7625 1.9656 2.0487 3.2191 \n",
"... ... ... ... ... ... ... ... \n",
"5390 1.36 58.1209 NaN 1.9116 1.1803 1.1129 0.3212 \n",
"5391 1.31 133.9017 29.7427 1.8103 3.6805 3.1067 NaN \n",
"5392 1.28 12.5281 15.7915 0.7518 0.4344 0.4503 4.4796 \n",
"5393 2.89 22.7537 22.7537 1.6401 1.0276 1.0276 1.3553 \n",
"5394 1.04 17.4059 16.1402 1.8424 2.0579 1.9930 3.1604 \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 0.6020 9.970483e+04 8.039498e+04 75323.2612 1.656097e+06 \n",
"1 0.3364 6.332851e+04 5.179696e+04 43937.3622 2.816852e+06 \n",
"2 3.2226 6.492580e+04 6.426965e+04 36946.4646 1.545883e+06 \n",
"3 NaN 1.222355e+04 1.222355e+04 3402.7889 8.497809e+05 \n",
"4 3.2191 7.584828e+04 7.501396e+04 50081.8345 5.445906e+05 \n",
"... ... ... ... ... ... \n",
"5390 0.3212 6.303220e+04 3.736720e+04 27120.6014 7.708838e+05 \n",
"5391 NaN 1.239561e+04 1.239561e+04 3907.6756 1.963464e+05 \n",
"5392 4.4796 2.190864e+06 2.178208e+06 798651.6922 1.496360e+07 \n",
"5393 1.3553 1.843013e+04 1.843013e+04 13404.1045 4.054629e+05 \n",
"5394 3.1604 8.004494e+05 7.454180e+05 232532.2636 1.167055e+07 \n",
"\n",
" circ_mv is_st \n",
"0 1.335361e+06 False \n",
"1 2.303929e+06 False \n",
"2 1.530260e+06 False \n",
"3 8.497809e+05 False \n",
"4 5.386002e+05 False \n",
"... ... ... \n",
"5390 4.570009e+05 False \n",
"5391 1.963464e+05 False \n",
"5392 1.487716e+07 False \n",
"5393 4.054629e+05 False \n",
"5394 1.086819e+07 False \n",
"\n",
"[5395 rows x 19 columns]\n"
]
}
],
"execution_count": 5
},
{
"cell_type": "code",
"id": "28cb78d032671b20",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.881685Z",
"start_time": "2025-04-09T14:58:16.871184Z"
}
},
"source": [
"print(all_daily_data_df[all_daily_data_df['is_st']])"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"85 002822.SZ 20250409 3.11 1.8467 1.9219 \n",
"123 603959.SH 20250409 3.27 1.7568 2.2420 \n",
"181 688282.SH 20250409 42.59 2.5546 3.0570 \n",
"259 600777.SH 20250409 2.66 1.9331 2.4597 \n",
"283 002052.SZ 20250409 6.15 1.5326 2.5481 \n",
"... ... ... ... ... ... \n",
"5286 002602.SZ 20250409 5.93 3.0376 3.5162 \n",
"5345 002501.SZ 20250409 1.89 4.3252 5.5834 \n",
"5364 600387.SH 20250409 2.34 0.0904 0.1163 \n",
"5366 002656.SZ 20250409 1.95 2.7047 3.0210 \n",
"5378 300013.SZ 20250409 3.57 2.8370 3.1107 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"85 2.59 NaN NaN 1.2023 0.5923 0.7314 0.0 \n",
"123 2.22 NaN NaN 4.3282 0.7749 1.1811 0.0 \n",
"181 1.07 NaN NaN 2.9277 172.3150 21.9335 NaN \n",
"259 0.96 6.9694 7.6204 0.8381 2.0443 2.0567 0.0 \n",
"283 0.74 NaN NaN NaN 19.5551 17.1988 0.0 \n",
"... ... ... ... ... ... ... ... \n",
"5286 3.30 84.3318 49.2129 1.6993 3.3267 2.3228 0.0 \n",
"5345 1.75 NaN NaN 7.0441 14.0701 19.7111 0.0 \n",
"5364 1.33 NaN NaN 0.3818 0.5148 0.8454 0.0 \n",
"5366 1.75 NaN NaN 3.8456 4.7986 5.9354 0.0 \n",
"5378 0.90 NaN NaN 8.2438 4.8281 4.2666 0.0 \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"85 NaN 73467.1821 56245.3696 54046.3738 2.284829e+05 \n",
"123 NaN 49029.8992 49029.8992 38419.3842 1.603278e+05 \n",
"181 NaN 8800.0000 3652.0000 3051.8414 3.747920e+05 \n",
"259 NaN 680049.5825 636615.2391 500325.8436 1.808932e+06 \n",
"283 NaN 74595.9694 74595.5944 44867.2806 4.587652e+05 \n",
"... ... ... ... ... ... \n",
"5286 NaN 745255.6968 687870.8273 594244.1179 4.419366e+06 \n",
"5345 NaN 355000.0000 354999.9006 274999.9006 6.709500e+05 \n",
"5364 NaN 46814.4464 40404.8492 31411.4405 1.095458e+05 \n",
"5366 NaN 71251.9844 60945.7555 54564.8212 1.389414e+05 \n",
"5378 NaN 55835.8894 44606.0865 40680.8215 1.993341e+05 \n",
"\n",
" circ_mv is_st \n",
"85 1.749231e+05 True \n",
"123 1.603278e+05 True \n",
"181 1.555387e+05 True \n",
"259 1.693397e+06 True \n",
"283 4.587629e+05 True \n",
"... ... ... \n",
"5286 4.079074e+06 True \n",
"5345 6.709498e+05 True \n",
"5364 9.454735e+04 True \n",
"5366 1.188442e+05 True \n",
"5378 1.592437e+05 True \n",
"\n",
"[106 rows x 19 columns]\n"
]
}
],
"execution_count": 6
},
{
"cell_type": "code",
"id": "692b58674b7462c9",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:17.773453Z",
"start_time": "2025-04-09T14:58:16.903459Z"
}
},
"source": [
"# 将数据保存为 HDF5 文件table 格式)\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有每日基础数据获取并保存完毕!\n"
]
}
],
"execution_count": 7
},
{
"cell_type": "code",
"id": "d7a773fc20293477",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:24.305403Z",
"start_time": "2025-04-09T14:58:17.816332Z"
}
},
"source": [
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
" print(df.info())"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 8518306 entries, 0 to 5394\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
"memory usage: 203.1+ MB\n",
"None\n"
]
}
],
"execution_count": 8
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}