Files
NewStock/main/data/update/update_is_st.ipynb

232 lines
8.8 KiB
Plaintext
Raw Normal View History

2025-02-12 00:21:33 +08:00
{
"cells": [
{
"cell_type": "code",
2025-05-06 23:42:40 +08:00
"execution_count": 3,
2025-02-12 00:21:33 +08:00
"id": "17cc645336d4eb18",
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-08T16:55:19.819017Z",
"start_time": "2025-02-08T16:55:18.958639Z"
}
},
2025-05-06 23:42:40 +08:00
"outputs": [],
2025-02-12 00:21:33 +08:00
"source": [
"import pandas as pd\n",
"import tushare as ts"
2025-05-06 23:42:40 +08:00
]
2025-02-12 00:21:33 +08:00
},
{
2025-05-06 23:42:40 +08:00
"cell_type": "code",
"execution_count": 8,
"id": "48ae71ed02d61819",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-08T16:55:27.578361Z",
"start_time": "2025-02-08T16:55:19.882313Z"
}
},
"outputs": [],
2025-05-06 23:42:40 +08:00
"source": [
"# Load the 'daily_basic' table from the HDF5 file; the relative path is resolved against the kernel's working directory.\n",
"daily_basic = pd.read_hdf('../../../data/daily_basic.h5', key='daily_basic')\n"
]
2025-02-12 00:21:33 +08:00
},
{
2025-05-06 23:42:40 +08:00
"cell_type": "code",
"execution_count": 5,
"id": "e6606a96e5728b8",
2025-02-12 00:21:33 +08:00
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-08T16:55:27.938078Z",
"start_time": "2025-02-08T16:55:27.584226Z"
}
},
"outputs": [],
"source": [
"from datetime import datetime\n",
2025-05-06 23:42:40 +08:00
"import warnings\n",
"\n",
"# Suppresses every Python warning for the rest of the kernel session.\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"\n",
"def filter_rows(df):\n",
"    \"\"\"Collapse duplicate periods to one row per (name, start_date).\n",
"\n",
"    Within each (name, start_date) pair the first row whose end_date is set is\n",
"    kept; when no row of the pair has an end_date, its first row is kept.\n",
"\n",
"    Args:\n",
"        df: DataFrame with at least the columns name, start_date and end_date.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the same columns as df, one row per pair, ordered\n",
"        by (name, start_date) and indexed from 0.\n",
"    \"\"\"\n",
"    keys = ['name', 'start_date']\n",
"    # A stable sort on 'end_date is missing' moves rows that have an end_date\n",
"    # ahead of open-ended ones without disturbing their relative order, so\n",
"    # drop_duplicates(keep='first') picks the preferred row of every pair.\n",
"    # This replaces groupby(...).apply(...): newer pandas versions no longer\n",
"    # pass the grouping columns to the applied function, which would remove\n",
"    # name/start_date from the result.\n",
"    preferred = (\n",
"        df.dropna(subset=keys)  # groupby skipped rows with a missing key\n",
"        .sort_values('end_date', key=lambda col: col.isna(), kind='stable')\n",
"        .drop_duplicates(subset=keys, keep='first')\n",
"    )\n",
"    return preferred.sort_values(keys).reset_index(drop=True)\n",
2025-02-12 00:21:33 +08:00
"\n",
"def is_st(name_change_dict, stock_code, target_date):\n",
"    \"\"\"Return True if stock_code has an ST naming period covering target_date.\n",
"\n",
"    Args:\n",
"        name_change_dict: Mapping of ts_code to a DataFrame of naming periods\n",
"            with datetime columns start_date and end_date.\n",
"        stock_code: Code to look up in name_change_dict.\n",
"        target_date: Date in YYYYMMDD form (str, or an int such as 20250430).\n",
"\n",
"    Returns:\n",
"        True when target_date lies inside any period (both ends inclusive);\n",
"        False otherwise, including when stock_code has no entry.\n",
"\n",
"    Raises:\n",
"        ValueError: If target_date does not match the YYYYMMDD format.\n",
"    \"\"\"\n",
"    target_date = datetime.strptime(str(target_date), '%Y%m%d')\n",
"    periods = name_change_dict.get(stock_code)\n",
"    if periods is None:\n",
"        return False\n",
"    now = datetime.now()\n",
"    # Columns are read by name so the result does not depend on column order.\n",
"    for start, end in zip(periods['start_date'], periods['end_date']):\n",
"        # pd.isna covers None, pd.NaT, numpy NaT and NaN; an open-ended period\n",
"        # is treated as running up to the current time.\n",
"        if pd.isna(end):\n",
"            end = now\n",
"        if (target_date - start).days >= 0 and (target_date - end).days <= 0:\n",
"            return True\n",
"    return False\n",
"\n",
2025-05-06 23:42:40 +08:00
"# Load the name-change history and drop rows that are exact duplicates.\n",
"name_change_df = pd.read_hdf('../../../data/name_change.h5', key='name_change')\n",
"name_change_df = name_change_df.drop_duplicates(keep='first')\n",
"\n",
"# Parse the YYYYMMDD columns; an end_date that is missing or unparseable becomes NaT.\n",
"name_change_df['start_date'] = pd.to_datetime(name_change_df['start_date'], format='%Y%m%d')\n",
"name_change_df['end_date'] = pd.to_datetime(name_change_df['end_date'], format='%Y%m%d', errors='coerce')\n",
"\n",
"# Keep only periods whose name contains 'ST' (this also matches '*ST').\n",
"# na=False counts a missing name as 'not ST' instead of producing an NA mask that cannot be used for indexing.\n",
"name_change_df = name_change_df[name_change_df['name'].str.contains('ST', na=False)]\n",
"\n",
"# ts_code -> de-duplicated ST periods of that code. The frame is already\n",
"# restricted to ST names, so every group is non-empty.\n",
"name_change_dict = {\n",
"    ts_code: filter_rows(group)\n",
"    for ts_code, group in name_change_df.groupby('ts_code')\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "41bc125d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ts_code trade_date close turnover_rate turnover_rate_f \\\n",
"0 603848.SH 20250430 14.36 0.5401 4.6897 \n",
"1 300290.SZ 20250430 16.30 2.8540 3.5686 \n",
"2 603877.SH 20250430 15.90 0.3794 1.2707 \n",
"3 834639.BJ 20250430 8.37 6.1158 7.8866 \n",
"4 000909.SZ 20250430 5.72 0.6104 1.0424 \n",
"... ... ... ... ... ... \n",
"8594006 600708.SH 20170103 9.03 0.7694 1.0169 \n",
"8594007 600712.SH 20170103 10.29 0.5859 0.8028 \n",
"8594008 001872.SZ 20170103 19.33 1.0970 5.4258 \n",
"8594009 001914.SZ 20170103 12.37 3.2627 6.6991 \n",
"8594010 302132.SZ 20170103 23.28 0.4912 1.5149 \n",
"\n",
" volume_ratio pe pe_ttm pb ps ps_ttm \\\n",
"0 1.31 23.3421 25.6176 2.3433 3.7254 3.8065 \n",
"1 1.00 NaN NaN 13.1076 13.5867 13.5756 \n",
"2 0.98 29.1494 33.6975 1.6522 1.1075 1.1304 \n",
"3 0.87 70.0984 215.1863 2.0171 0.8405 0.8329 \n",
"4 0.55 NaN NaN 2.3539 7.7727 8.2925 \n",
"... ... ... ... ... ... ... \n",
"8594006 0.85 23.3367 22.2458 1.4847 0.9613 0.9248 \n",
"8594007 0.67 202.4855 287.1454 5.1852 2.3682 2.5386 \n",
"8594008 0.77 23.6158 23.1883 2.7052 6.6556 6.5584 \n",
"8594009 1.02 20.5631 15.1595 2.1186 1.4950 1.2600 \n",
"8594010 0.74 91.3908 84.6980 6.9391 8.9531 8.8570 \n",
"\n",
" dv_ratio dv_ttm total_share float_share free_share total_mv \\\n",
"0 2.0904 2.0904 40391.1511 40240.6511 4634.6511 5.800169e+05 \n",
"1 0.0000 NaN 63973.2569 63922.1969 51122.1969 1.042764e+06 \n",
"2 3.7471 3.7471 47382.5333 46932.3226 14014.3219 7.533823e+05 \n",
"3 NaN NaN 20160.0000 11721.5883 9089.7537 1.687392e+05 \n",
"4 0.0000 NaN 43771.4245 43771.0570 25634.2299 2.503725e+05 \n",
"... ... ... ... ... ... ... \n",
"8594006 1.1074 1.1074 131871.9966 75088.9215 56812.2811 1.190804e+06 \n",
"8594007 0.1555 0.1555 54465.5360 53795.9475 39266.3119 5.604504e+05 \n",
"8594008 2.1211 2.1211 64476.3730 46486.6050 9398.8050 1.246328e+06 \n",
"8594009 0.4042 0.4042 66696.1416 66678.0666 32475.1786 8.250313e+05 \n",
"8594010 0.2291 0.2291 39384.0333 30419.3588 9862.3809 9.168603e+05 \n",
"\n",
" circ_mv is_st \n",
"0 5.778557e+05 False \n",
"1 1.041932e+06 False \n",
"2 7.462239e+05 False \n",
"3 9.810969e+04 False \n",
"4 2.503704e+05 True \n",
"... ... ... \n",
"8594006 6.780530e+05 False \n",
"8594007 5.535603e+05 False \n",
"8594008 8.985861e+05 False \n",
"8594009 8.248077e+05 False \n",
"8594010 7.081627e+05 False \n",
"\n",
"[8594011 rows x 19 columns]\n"
]
}
2025-02-12 00:21:33 +08:00
],
2025-05-06 23:42:40 +08:00
"source": [
"# Quick look at the loaded frame; pandas abbreviates the printed rows and reports the full shape at the end.\n",
"print(daily_basic)"
]
},
{
"cell_type": "code",
"execution_count": 10,
2025-02-12 00:21:33 +08:00
"id": "initial_id",
2025-05-06 23:42:40 +08:00
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-08T16:59:20.537632Z",
"start_time": "2025-02-08T16:55:27.971219Z"
},
"collapsed": true
},
2025-02-12 00:21:33 +08:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"is st...\n",
" ts_code trade_date is_st\n",
2025-05-06 23:42:40 +08:00
"0 603848.SH 20250430 False\n",
"1 300290.SZ 20250430 False\n",
"2 603877.SH 20250430 False\n",
"3 834639.BJ 20250430 False\n",
"4 000909.SZ 20250430 True\n"
2025-02-12 00:21:33 +08:00
]
}
],
2025-05-06 23:42:40 +08:00
"source": [
"from datetime import datetime\n",
"import pandas as pd\n",
"\n",
"print('is st...')\n",
"# Add the is_st column: True when the row's ts_code has an ST period covering its trade_date.\n",
"# Iterating the two columns with zip avoids building a Series per row as\n",
"# DataFrame.apply(axis=1) does, and codes without any ST history are\n",
"# answered by the set lookup without calling is_st at all.\n",
"st_codes = set(name_change_dict)\n",
"daily_basic['is_st'] = [\n",
"    code in st_codes and is_st(name_change_dict, code, trade_date)\n",
"    for code, trade_date in zip(daily_basic['ts_code'], daily_basic['trade_date'])\n",
"]\n",
"\n",
"# Write the result back to the same HDF5 file it was loaded from.\n",
"# mode='w' recreates the file, so its previous contents are replaced.\n",
"daily_basic.to_hdf('../../../data/daily_basic.h5', key='daily_basic', mode='w', format='table')\n",
"\n",
"# Show the first few rows of the result.\n",
"print(daily_basic[['ts_code', 'trade_date', 'is_st']].head())\n"
]
2025-02-12 00:21:33 +08:00
}
],
"metadata": {
"kernelspec": {
2025-05-06 23:42:40 +08:00
"display_name": "new_trader",
2025-02-12 00:21:33 +08:00
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
2025-05-06 23:42:40 +08:00
"version": 3
2025-02-12 00:21:33 +08:00
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
2025-05-06 23:42:40 +08:00
"pygments_lexer": "ipython3",
"version": "3.11.11"
2025-02-12 00:21:33 +08:00
}
},
"nbformat": 4,
"nbformat_minor": 5
}