Files
NewStock/main/data/update/update_daily_basic.ipynb
liaozhaorun dc29f153ca 1、load model
2、修改update data相关函数
2025-10-13 15:04:48 +08:00

433 lines
17 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:57:36.913044Z",
"start_time": "2025-04-09T14:57:36.159612Z"
}
},
"outputs": [],
"source": [
"import tushare as ts\n",
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
"pro = ts.pro_api()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "14671a7f72de2564",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:57:39.128278Z",
"start_time": "2025-04-09T14:57:36.918051Z"
}
},
"outputs": [],
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"def filter_rows(df):\n",
" # 按照 name 和 start_date 分组\n",
" def select_row(group):\n",
" # 如果有 end_date 不为 NaT 的行,优先保留这些行\n",
" valid_rows = group[group['end_date'].notna()]\n",
" if not valid_rows.empty:\n",
" return valid_rows.iloc[0] # 返回第一个有效行\n",
" else:\n",
" return group.iloc[0] # 如果没有有效行,返回第一行\n",
"\n",
" filtered_df = df.groupby(['name', 'start_date'], group_keys=False).apply(select_row)\n",
" filtered_df = filtered_df.reset_index(drop=True)\n",
" return filtered_df\n",
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
" target_date = datetime.strptime(target_date, '%Y%m%d')\n",
" if stock_code not in name_change_dict.keys():\n",
" return False\n",
" df = name_change_dict[stock_code]\n",
" for i in range(len(df)):\n",
" sds = df.iloc[i, 2]\n",
" eds = df.iloc[i, 3]\n",
" if eds is None or eds is pd.NaT:\n",
" eds = datetime.now()\n",
" if (target_date - sds).days >= 0 and (target_date - eds).days <= 0:\n",
" return True\n",
" return False\n",
"\n",
"name_change_df = pd.read_hdf('/mnt/d/PyProject/NewStock/data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# 确保 name_change_df 的日期格式正确\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"# name_change_df = name_change_df[name_change_df.name.str.contains('ST') ]\n",
"name_change_dict = {}\n",
"for ts_code, group in name_change_df.groupby('ts_code'):\n",
" # 只保留 'ST' 和 '*ST' 的记录\n",
" # st_data = group[(group['change_reason'] == 'ST') | (group['change_reason'] == '*ST')]\n",
" st_data = group[(group['name'].str.contains('ST')) | (group['name'].str.contains('退'))]\n",
" if not st_data.empty:\n",
" name_change_dict[ts_code] = filter_rows(st_data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e7f8cce2f80e2f20",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:09.296046Z",
"start_time": "2025-04-09T14:57:39.339423Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 9155905 entries, 0 to 27115\n",
"Data columns (total 2 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
"dtypes: object(2)\n",
"memory usage: 209.6+ MB\n",
"None\n",
"20250926\n",
"20250929\n"
]
}
],
"source": [
"import time\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"h5_filename = '/mnt/d/PyProject/NewStock/data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"max_date = None\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date']]\n",
" print(df.info())\n",
" max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20251020')\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"start_date = min(trade_dates)\n",
"print(start_date)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.817010Z",
"start_time": "2025-04-09T14:58:09.326485Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"任务 20251017 完成\n",
"任务 20251020 完成\n",
"任务 20251015 完成\n",
"任务 20251016 完成\n",
"任务 20251014 完成\n",
"任务 20251013 完成\n",
"任务 20251010 完成\n",
"任务 20251009 完成\n",
"任务 20250930 完成\n",
"任务 20250929 完成\n"
]
}
],
"source": [
"\n",
"\n",
"# 使用 HDFStore 存储数据\n",
"all_daily_data = []\n",
"\n",
"# API 调用计数和时间控制变量\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
" daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
" if daily_basic_data is not None and not daily_basic_data.empty:\n",
" # 添加交易日期列标识\n",
" daily_basic_data['trade_date'] = trade_date\n",
" daily_basic_data['is_st'] = daily_basic_data.apply(\n",
" lambda row: is_st(name_change_dict, row['ts_code'], row['trade_date']), axis=1\n",
" )\n",
" time.sleep(0.2)\n",
" # print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
" return daily_basic_data\n",
"\n",
"\n",
"# 遍历每个交易日期并获取数据\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
"\n",
" for future in as_completed(future_to_date):\n",
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
" try:\n",
" result = future.result() # 获取任务执行的结果\n",
" all_daily_data.append(result)\n",
" print(f\"任务 {trade_date} 完成\")\n",
" except Exception as e:\n",
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
" # 计数一次 API 调用\n",
" api_call_count += 1\n",
"\n",
" # 每调用 300 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n",
" if api_call_count % 150 == 0:\n",
" elapsed = time.time() - batch_start_time\n",
" if elapsed < 60:\n",
" sleep_time = 60 - elapsed\n",
" print(f\"已调用 150 次 API等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
" time.sleep(sleep_time)\n",
" # 重置批次起始时间\n",
" batch_start_time = time.time()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "919023c693d7a47a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.864178Z",
"start_time": "2025-04-09T14:58:16.855084Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 600642.SH 20251010 8.03 0.4806 1.3835 \n",
"1 600295.SH 20251010 10.76 0.8549 3.7056 \n",
"2 600444.SH 20251010 19.00 9.6611 17.4605 \n",
"3 605100.SH 20251010 28.72 3.4770 7.6902 \n",
"4 301399.SZ 20251010 19.53 3.9562 4.6772 \n",
"... ... ... ... ... ... \n",
"21679 600653.SH 20250929 2.13 2.1746 2.9589 \n",
"21680 002344.SZ 20250929 4.49 1.7080 3.6338 \n",
"21681 301162.SZ 20250929 60.30 2.8491 3.5744 \n",
"21682 920077.BJ 20250929 14.43 1.1113 1.6410 \n",
"21683 300283.SZ 20250929 7.04 4.8583 5.7018 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
"0 1.49 9.9635 10.2617 1.1073 1.3268 1.3600 4.9816 \n",
"1 1.56 16.3053 16.4683 1.4839 1.0603 1.1230 7.4349 \n",
"2 2.84 69.2746 55.7147 3.8398 3.6313 3.5392 0.5263 \n",
"3 0.55 66.7896 123.2961 2.7276 5.3634 6.7180 2.0794 \n",
"4 0.94 60.7990 75.8958 2.7675 6.8812 7.1828 1.2177 \n",
"... ... ... ... ... ... ... ... \n",
"21679 0.72 107.4073 227.6354 5.4498 0.9887 0.9724 0.0000 \n",
"21680 0.70 64.8238 75.9239 0.6834 5.5516 5.5560 0.9577 \n",
"21681 0.96 85.4251 76.2427 5.3380 14.5424 12.3677 0.5586 \n",
"21682 0.51 90.3399 82.4861 3.3572 5.2895 4.1636 NaN \n",
"21683 0.94 NaN NaN 3.2821 1.1161 0.9970 0.2499 \n",
"\n",
" dv_ttm total_share float_share free_share total_mv \\\n",
"0 5.6040 489407.9376 489381.3156 170006.8520 3.929946e+06 \n",
"1 5.5762 279877.6254 197557.6254 45577.9458 3.011483e+06 \n",
"2 0.5789 14642.1932 14642.1932 8101.7360 2.782017e+05 \n",
"3 1.0446 17113.2000 16993.2000 7683.2000 4.914911e+05 \n",
"4 1.0594 18502.0000 5468.3586 4625.5000 3.613441e+05 \n",
"... ... ... ... ... ... \n",
"21679 NaN 194638.0317 194638.0317 143048.5612 4.145790e+05 \n",
"21680 0.8463 128261.6960 128145.0092 60233.0025 5.758950e+05 \n",
"21681 0.9704 13258.3724 8522.5548 6793.1764 7.994799e+05 \n",
"21682 NaN 58768.1817 31695.6817 21464.7599 8.480249e+05 \n",
"21683 NaN 49697.8222 36721.8502 31289.2680 3.498727e+05 \n",
"\n",
" circ_mv is_st \n",
"0 3.929732e+06 False \n",
"1 2.125720e+06 False \n",
"2 2.782017e+05 False \n",
"3 4.880447e+05 False \n",
"4 1.067970e+05 False \n",
"... ... ... \n",
"21679 4.145790e+05 False \n",
"21680 5.753711e+05 False \n",
"21681 5.139101e+05 False \n",
"21682 4.573687e+05 False \n",
"21683 2.585218e+05 False \n",
"\n",
"[21684 rows x 19 columns]\n"
]
}
],
"source": [
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
"print(all_daily_data_df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "28cb78d032671b20",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:16.881685Z",
"start_time": "2025-04-09T14:58:16.871184Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"9 300313.SZ 20251010 8.84 3.1146 6.4625 \n",
"20 603838.SH 20251010 7.80 0.5503 1.5146 \n",
"29 603813.SH 20251010 24.06 1.5835 4.5173 \n",
"48 002742.SZ 20251010 4.65 1.0473 1.2924 \n",
"69 603559.SH 20251010 8.50 0.2072 0.2945 \n",
"... ... ... ... ... ... \n",
"21466 603021.SH 20250929 4.62 1.3860 2.3418 \n",
"21552 300020.SZ 20250929 3.58 1.5031 1.6828 \n",
"21554 000506.SZ 20250929 10.88 10.5560 15.7565 \n",
"21603 600636.SH 20250929 8.29 0.4693 0.7963 \n",
"21661 603843.SH 20250929 5.17 0.3798 0.5364 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio dv_ttm \\\n",
"9 1.30 NaN NaN NaN 20.1067 20.9731 0.0000 NaN \n",
"20 0.57 NaN NaN 2.6121 8.7517 6.9304 0.0000 NaN \n",
"29 1.88 NaN NaN 4.5222 8.4776 7.5124 1.0313 NaN \n",
"48 1.28 NaN NaN NaN 1.6800 2.1226 0.0000 NaN \n",
"69 0.60 NaN NaN 3.5043 9.5964 8.2315 0.0000 NaN \n",
"... ... .. ... ... ... ... ... ... \n",
"21466 0.80 NaN NaN NaN 3.5891 3.7851 0.0000 NaN \n",
"21552 1.00 NaN NaN 0.9812 5.1924 18.4036 0.0000 NaN \n",
"21554 3.17 NaN NaN 16.4257 30.3341 23.4860 0.0000 NaN \n",
"21603 0.81 NaN NaN 1.7909 12.8512 11.0116 0.4825 0.6031 \n",
"21661 0.05 NaN NaN 12.5612 2.6558 3.1369 0.0000 NaN \n",
"\n",
" total_share float_share free_share total_mv circ_mv is_st \n",
"9 31297.7396 19735.2789 9511.5479 2.766720e+05 1.744599e+05 True \n",
"20 32001.6000 32001.6000 11627.0468 2.496125e+05 2.496125e+05 True \n",
"29 10501.5000 10501.5000 3681.2000 2.526661e+05 2.526661e+05 True \n",
"48 43200.0000 43185.8082 34994.8239 2.008800e+05 2.008140e+05 True \n",
"69 40127.6979 40127.6979 28231.9697 3.410854e+05 3.410854e+05 True \n",
"... ... ... ... ... ... ... \n",
"21466 31994.8070 31994.8070 18936.7934 1.478160e+05 1.478160e+05 True \n",
"21552 79467.7974 76663.9584 68475.6577 2.844947e+05 2.744570e+05 True \n",
"21554 92901.7761 92858.4361 62210.1427 1.010771e+06 1.010300e+06 True \n",
"21603 43863.6802 43863.6802 25849.6552 3.636299e+05 3.636299e+05 True \n",
"21661 69962.3237 69962.3237 49541.4702 3.617052e+05 3.617052e+05 True \n",
"\n",
"[749 rows x 19 columns]\n"
]
}
],
"source": [
"print(all_daily_data_df[all_daily_data_df['is_st']])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "692b58674b7462c9",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:17.773453Z",
"start_time": "2025-04-09T14:58:16.903459Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"所有每日基础数据获取并保存完毕!\n"
]
}
],
"source": [
"# 将数据保存为 HDF5 文件table 格式)\n",
"all_daily_data_df.to_hdf(h5_filename, key='daily_basic', mode='a', format='table', append=True, data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d7a773fc20293477",
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-09T14:58:24.305403Z",
"start_time": "2025-04-09T14:58:17.816332Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 9177589 entries, 0 to 21683\n",
"Data columns (total 3 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ts_code object\n",
" 1 trade_date object\n",
" 2 is_st bool \n",
"dtypes: bool(1), object(2)\n",
"memory usage: 218.8+ MB\n",
"None\n"
]
}
],
"source": [
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
" df = store[key][['ts_code', 'trade_date', 'is_st']]\n",
" print(df.info())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "stock",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}