426 lines
16 KiB
Plaintext
426 lines
16 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "18d1d622-b083-4cc4-a6f8-7c1ed2d0edd2",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-02-11T15:43:54.745322Z",
|
||
"start_time": "2025-02-11T15:43:53.837662Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
"\n",
"import tushare as ts\n",
"\n",
"# The Tushare token is a credential and must not live in the notebook: it is read\n",
"# from the TUSHARE_TOKEN environment variable instead. The token that used to be\n",
"# hard-coded in this cell has been exposed and should be revoked.\n",
"tushare_token = os.environ.get('TUSHARE_TOKEN')\n",
"if not tushare_token:\n",
"    raise RuntimeError('Set the TUSHARE_TOKEN environment variable before running this notebook.')\n",
"ts.set_token(tushare_token)\n",
"pro = ts.pro_api()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "14671a7f72de2564",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-02-11T15:53:08.235573Z",
|
||
"start_time": "2025-02-11T15:53:07.753701Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from datetime import datetime\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"def is_st(name_change_dict, stock_code, target_date):\n",
"    \"\"\"Return True if ``stock_code`` has an ST / *ST period covering ``target_date``.\n",
"\n",
"    Args:\n",
"        name_change_dict: Mapping of ts_code to a DataFrame of periods with datetime\n",
"            ``start_date`` and ``end_date`` columns (``end_date`` is NaT when missing).\n",
"        stock_code: Stock code used as the key of ``name_change_dict``.\n",
"        target_date: Date string in '%Y%m%d' format.\n",
"\n",
"    Returns:\n",
"        bool: True when ``target_date`` lies inside any period (both bounds inclusive);\n",
"        False when the stock has no entry or no period matches.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``target_date`` does not match '%Y%m%d'.\n",
"    \"\"\"\n",
"    # Parse first so a malformed date is reported even for unknown stocks.\n",
"    target = pd.Timestamp(datetime.strptime(target_date, '%Y%m%d'))\n",
"    if stock_code not in name_change_dict:\n",
"        return False\n",
"    periods = name_change_dict[stock_code]\n",
"    # Columns are selected by name: positional access breaks silently if the column order changes.\n",
"    started = periods['start_date'] <= target\n",
"    # A missing end date is treated as a period that is still open. Substituting the\n",
"    # current time instead made the answer depend on when the code ran and turned\n",
"    # dates more than a day ahead into False.\n",
"    not_ended = periods['end_date'].isna() | (periods['end_date'] >= target)\n",
"    return bool((started & not_ended).any())\n",
|
||
"\n",
|
||
"name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# Convert the period bounds to datetimes; an end_date that cannot be parsed becomes NaT.\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"\n",
"# Keep rows whose name contains 'ST'. na=False counts a missing name as no match;\n",
"# without it a NaN in the mask makes the boolean indexing fail.\n",
"has_st_name = name_change_df['name'].str.contains('ST', regex=False, na=False)\n",
"name_change_df = name_change_df[has_st_name]\n",
"\n",
"# Only rows whose change_reason is exactly 'ST' or '*ST' are kept as ST periods.\n",
"st_periods = name_change_df[name_change_df['change_reason'].isin(['ST', '*ST'])]\n",
"\n",
"# ts_code -> DataFrame of that stock's ST / *ST rows (stocks without such rows get no entry).\n",
"name_change_dict = {ts_code: group for ts_code, group in st_periods.groupby('ts_code')}"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "e7f8cce2f80e2f20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-02-11T15:53:19.812860Z",
|
||
"start_time": "2025-02-11T15:53:09.614377Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 8291970 entries, 0 to 8291969\n",
|
||
"Data columns (total 2 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
"dtypes: object(2)\n",
|
||
"memory usage: 189.8+ MB\n",
|
||
"None\n",
|
||
"20250211\n",
|
||
"20250212\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import time\n",
|
||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||
"\n",
|
||
"h5_filename = '../../../data/daily_basic.h5'\n",
"key = '/daily_basic'\n",
"max_date = None\n",
"\n",
"# Find the latest trade date already stored. Only the two needed columns are selected\n",
"# from the table, so the full dataset is not loaded into memory first.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store.select(key, columns=['ts_code', 'trade_date'])\n",
"    df.info()  # info() prints by itself; wrapping it in print() only adds a stray 'None'\n",
"    max_date = df['trade_date'].max()\n",
"\n",
"print(max_date)\n",
"\n",
"# Request the calendar up to today instead of a fixed date, so later runs still find new trading days.\n",
"calendar_end = pd.Timestamp.today().strftime('%Y%m%d')\n",
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date=calendar_end)\n",
"trade_cal = trade_cal[trade_cal['is_open'] == 1]  # open trading days only\n",
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
"\n",
"# min() raises ValueError on an empty list; when the store is already up to date\n",
"# there is nothing to fetch and start_date is None.\n",
"start_date = min(trade_dates) if trade_dates else None\n",
"print(start_date)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "553cfb36-f560-4cc4-b2bc-68323ccc5072",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-02-11T15:53:24.100612Z",
|
||
"start_time": "2025-02-11T15:53:22.361257Z"
|
||
},
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"任务 20250220 完成\n",
|
||
"任务 20250219 完成\n",
|
||
"任务 20250217 完成\n",
|
||
"任务 20250218 完成\n",
|
||
"任务 20250214 完成\n",
|
||
"任务 20250213 完成\n",
|
||
"任务 20250212 完成\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"\n",
|
||
"\n",
|
||
"# Each trading day's DataFrame is collected in this list; later cells concatenate\n",
"# the list and append the result to the HDF5 file.\n",
"all_daily_data = []\n",
"\n",
"# Bookkeeping for the once-per-batch pause in the result loop below.\n",
"MAX_CALLS_PER_BATCH = 150\n",
"api_call_count = 0\n",
"batch_start_time = time.time()\n",
"\n",
"\n",
"def get_data(trade_date):\n",
"    \"\"\"Fetch daily_basic for one trading day and add an ``is_st`` flag per row.\n",
"\n",
"    Args:\n",
"        trade_date: Trading day as a '%Y%m%d' string.\n",
"\n",
"    Returns:\n",
"        The object returned by ``pro.daily_basic``. When it holds rows, its\n",
"        ``trade_date`` column is overwritten with ``trade_date`` and an ``is_st``\n",
"        column is added; a None or empty result is returned untouched.\n",
"    \"\"\"\n",
"    daily_basic_data = pro.daily_basic(ts_code='', trade_date=trade_date)\n",
"    # Pause after every request, including empty ones, so a worker never fires calls back to back.\n",
"    time.sleep(0.2)\n",
"    if daily_basic_data is None or daily_basic_data.empty:\n",
"        return daily_basic_data\n",
"    daily_basic_data['trade_date'] = trade_date\n",
"    # Every row shares the same date, so iterate over the codes instead of a row-wise apply.\n",
"    daily_basic_data['is_st'] = [\n",
"        is_st(name_change_dict, ts_code, trade_date) for ts_code in daily_basic_data['ts_code']\n",
"    ]\n",
"    return daily_basic_data\n",
"\n",
"\n",
"# Fetch every missing trading day on a small thread pool.\n",
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
"    future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
"\n",
"    for future in as_completed(future_to_date):\n",
"        trade_date = future_to_date[future]  # trading day this future was submitted for\n",
"        try:\n",
"            result = future.result()\n",
"        except Exception as e:\n",
"            print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
"        else:\n",
"            # Days with no data come back as None or an empty frame; keep them out of the list.\n",
"            if result is not None and not result.empty:\n",
"                all_daily_data.append(result)\n",
"            print(f\"任务 {trade_date} 完成\")\n",
"\n",
"        # Count one finished API call, whether it succeeded or failed.\n",
"        api_call_count += 1\n",
"\n",
"        # After every MAX_CALLS_PER_BATCH finished calls, wait out the rest of the minute.\n",
"        # NOTE(review): all tasks are already submitted at this point, so this pause only\n",
"        # delays result handling in this thread; the worker threads keep calling the API.\n",
"        # The sleep inside get_data is what actually spaces the requests.\n",
"        if api_call_count % MAX_CALLS_PER_BATCH == 0:\n",
"            elapsed = time.time() - batch_start_time\n",
"            if elapsed < 60:\n",
"                sleep_time = 60 - elapsed\n",
"                print(f\"已调用 {MAX_CALLS_PER_BATCH} 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
"                time.sleep(sleep_time)\n",
"            # Start timing the next batch.\n",
"            batch_start_time = time.time()\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "919023c693d7a47a",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-02-11T15:53:25.913933Z",
|
||
"start_time": "2025-02-11T15:53:25.902629Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"0 601162.SH 20250212 4.77 7.3760 9.7054 \n",
|
||
"1 603216.SH 20250212 11.42 8.8711 8.8711 \n",
|
||
"2 872808.BJ 20250212 74.36 4.1219 15.3296 \n",
|
||
"3 601881.SH 20250212 14.43 0.5617 1.9533 \n",
|
||
"4 002837.SZ 20250212 42.25 3.8199 5.7136 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5380 603931.SH 20250212 23.83 1.4692 4.6843 \n",
|
||
"5381 688567.SH 20250212 12.35 1.3091 2.1970 \n",
|
||
"5382 688530.SH 20250212 19.30 6.6093 6.6093 \n",
|
||
"5383 301363.SZ 20250212 31.99 2.1990 2.1990 \n",
|
||
"5384 833533.BJ 20250212 46.02 27.7269 27.7597 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"0 2.00 134.5633 NaN 1.7935 12.0634 19.0461 0.0000 \n",
|
||
"1 2.09 26.5657 27.5224 1.4454 1.9304 1.9996 2.6270 \n",
|
||
"2 1.20 142.3485 196.0315 22.9124 22.8711 25.8281 NaN \n",
|
||
"3 0.84 20.0264 15.5707 1.4245 4.6898 4.4609 2.1067 \n",
|
||
"4 0.65 91.3544 64.5935 11.2259 8.9056 7.2600 0.3621 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"5380 1.16 27.1631 29.0662 3.0982 6.8392 6.9124 1.1120 \n",
|
||
"5381 1.01 NaN NaN 1.4955 0.9183 1.0469 NaN \n",
|
||
"5382 0.99 62.5995 198.4906 3.6879 6.4857 7.9319 NaN \n",
|
||
"5383 0.98 41.5226 47.9900 3.8396 9.7258 8.9664 0.4982 \n",
|
||
"5384 0.84 52.3997 62.1858 13.3582 6.6261 5.9638 NaN \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv \\\n",
|
||
"0 NaN 8.665757e+05 866575.7464 658594.7570 4.133566e+06 \n",
|
||
"1 2.6270 2.226900e+04 5669.0000 5669.0000 2.543120e+05 \n",
|
||
"2 NaN 2.000000e+04 19461.9464 5233.0650 1.487200e+06 \n",
|
||
"3 2.1067 1.093440e+06 724341.7623 208280.6759 1.577834e+07 \n",
|
||
"4 0.3621 7.438227e+04 64662.2002 43230.4691 3.142651e+06 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5380 1.1120 1.995584e+04 19955.8380 6258.8392 4.755476e+05 \n",
|
||
"5381 NaN 1.222104e+05 122210.3885 72818.9706 1.509298e+06 \n",
|
||
"5382 NaN 1.600448e+04 3200.8966 3200.8966 3.088865e+05 \n",
|
||
"5383 0.4982 4.066600e+04 11215.9100 11215.9100 1.300905e+06 \n",
|
||
"5384 NaN 1.005826e+04 3796.0235 3791.5280 4.628809e+05 \n",
|
||
"\n",
|
||
" circ_mv is_st \n",
|
||
"0 4.133566e+06 False \n",
|
||
"1 6.473998e+04 False \n",
|
||
"2 1.447190e+06 False \n",
|
||
"3 1.045225e+07 False \n",
|
||
"4 2.731978e+06 False \n",
|
||
"... ... ... \n",
|
||
"5380 4.755476e+05 False \n",
|
||
"5381 1.509298e+06 False \n",
|
||
"5382 6.177730e+04 False \n",
|
||
"5383 3.587970e+05 False \n",
|
||
"5384 1.746930e+05 False \n",
|
||
"\n",
|
||
"[5385 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Days that returned nothing show up as None or empty frames; leave them out of the concat.\n",
"fetched_frames = [frame for frame in all_daily_data if frame is not None and not frame.empty]\n",
"if not fetched_frames:\n",
"    raise ValueError('No daily_basic rows were fetched; there is nothing to concatenate or save.')\n",
"all_daily_data_df = pd.concat(fetched_frames, ignore_index=True)\n",
"print(all_daily_data_df)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "28cb78d032671b20",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-02-11T15:53:42.062142Z",
|
||
"start_time": "2025-02-11T15:53:42.044324Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
|
||
"10 002366.SZ 20250212 5.10 3.8029 4.1742 \n",
|
||
"48 002124.SZ 20250212 2.80 1.8388 1.9195 \n",
|
||
"57 000504.SZ 20250212 9.32 0.9666 1.5370 \n",
|
||
"63 603007.SH 20250212 10.03 2.0477 2.7581 \n",
|
||
"91 300201.SZ 20250212 5.33 2.3317 3.1604 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5303 002316.SZ 20250212 3.52 3.1023 3.3580 \n",
|
||
"5335 600568.SH 20250212 1.30 0.3996 0.6514 \n",
|
||
"5364 002168.SZ 20250212 2.48 0.8869 1.0824 \n",
|
||
"5367 300600.SZ 20250212 7.19 0.7517 1.4024 \n",
|
||
"5369 000972.SZ 20250212 3.38 4.6979 7.2993 \n",
|
||
"\n",
|
||
" volume_ratio pe pe_ttm pb ps ps_ttm dv_ratio \\\n",
|
||
"10 0.92 52.0324 56.8856 2.2889 14.2486 11.9214 0.0000 \n",
|
||
"48 0.97 NaN 260.7218 1.7484 0.6080 0.6154 0.0000 \n",
|
||
"57 0.83 NaN NaN 12.3702 22.4855 24.7156 0.0000 \n",
|
||
"63 0.86 NaN NaN 24.6750 55.2244 76.4853 0.0000 \n",
|
||
"91 0.75 26.1255 26.1088 4.2311 3.9774 4.2028 0.6431 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"5303 0.95 NaN NaN 19.4146 2.2930 2.3153 0.0000 \n",
|
||
"5335 0.76 NaN NaN 1.1378 4.0571 4.0379 0.0000 \n",
|
||
"5364 0.88 1024.9794 NaN NaN 7.6515 7.4299 0.0000 \n",
|
||
"5367 1.18 NaN NaN 2.2914 10.7845 8.9952 0.0000 \n",
|
||
"5369 0.77 24.0853 120.2360 16.2931 4.5277 4.9137 0.0000 \n",
|
||
"\n",
|
||
" dv_ttm total_share float_share free_share total_mv \\\n",
|
||
"10 NaN 208093.7640 125646.4390 114472.2056 1.061278e+06 \n",
|
||
"48 NaN 222193.3832 197428.3498 189130.4452 6.221415e+05 \n",
|
||
"57 NaN 33002.3098 31066.8701 19536.7046 3.075815e+05 \n",
|
||
"63 NaN 87689.6101 49983.0778 37108.5778 8.795268e+05 \n",
|
||
"91 0.6431 100904.3607 100450.7422 74110.3317 5.378202e+05 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5303 NaN 39312.0000 31500.7500 29101.6694 1.383782e+05 \n",
|
||
"5335 NaN 199286.9681 166906.7279 102374.4773 2.590731e+05 \n",
|
||
"5364 NaN 78416.3368 78416.3368 64258.0991 1.944725e+05 \n",
|
||
"5367 NaN 29423.4480 24616.3436 13195.4382 2.115546e+05 \n",
|
||
"5369 NaN 77128.3579 77128.3579 49641.0760 2.606938e+05 \n",
|
||
"\n",
|
||
" circ_mv is_st \n",
|
||
"10 640796.8389 True \n",
|
||
"48 552799.3794 True \n",
|
||
"57 289543.2293 True \n",
|
||
"63 501330.2703 True \n",
|
||
"91 535402.4559 True \n",
|
||
"... ... ... \n",
|
||
"5303 110882.6400 True \n",
|
||
"5335 216978.7463 True \n",
|
||
"5364 194472.5153 True \n",
|
||
"5367 176991.5105 True \n",
|
||
"5369 260693.8497 True \n",
|
||
"\n",
|
||
"[318 rows x 19 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Show only the rows flagged as ST.\n",
"st_mask = all_daily_data_df['is_st']\n",
"print(all_daily_data_df.loc[st_mask])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "692b58674b7462c9",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-02-11T15:53:33.693894Z",
|
||
"start_time": "2025-02-11T15:53:33.609884Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"所有每日基础数据获取并保存完毕!\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Append the new rows to the HDF5 table. Trading days that are already stored are\n",
"# dropped first, so running this cell again does not write the same rows twice.\n",
"with pd.HDFStore(h5_filename, mode='a') as store:\n",
"    if key in store:\n",
"        stored_dates = store.select(key, columns=['trade_date'])['trade_date'].unique()\n",
"        rows_to_append = all_daily_data_df[~all_daily_data_df['trade_date'].isin(stored_dates)]\n",
"    else:\n",
"        rows_to_append = all_daily_data_df\n",
"    if not rows_to_append.empty:\n",
"        store.append('daily_basic', rows_to_append, format='table', data_columns=True)\n",
"\n",
"print(\"所有每日基础数据获取并保存完毕!\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "d7a773fc20293477",
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2025-02-11T15:54:27.868021Z",
|
||
"start_time": "2025-02-11T15:54:18.853803Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 8297355 entries, 0 to 5384\n",
|
||
"Data columns (total 3 columns):\n",
|
||
" # Column Dtype \n",
|
||
"--- ------ ----- \n",
|
||
" 0 ts_code object\n",
|
||
" 1 trade_date object\n",
|
||
" 2 is_st bool \n",
|
||
"dtypes: bool(1), object(2)\n",
|
||
"memory usage: 197.8+ MB\n",
|
||
"None\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Read the table back after the append, selecting only the three columns that are inspected.\n",
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
"    df = store.select(key, columns=['ts_code', 'trade_date', 'is_st'])\n",
"    df.info()  # info() prints by itself; print(df.info()) only adds a stray 'None'"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.19"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|