init
This commit is contained in:
441
code/data/cyq_perf.ipynb
Normal file
441
code/data/cyq_perf.ipynb
Normal file
@@ -0,0 +1,441 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "initial_id",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-12T15:31:25.004019Z",
|
||||
"start_time": "2025-03-12T15:31:24.322440Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from operator import index\n",
|
||||
"\n",
|
||||
"import tushare as ts\n",
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||||
"pro = ts.pro_api()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "972a5ac9f79fe373",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-12T15:31:40.917015Z",
|
||||
"start_time": "2025-03-12T15:31:35.958771Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ts_code trade_date his_low his_high cost_5pct cost_15pct \\\n",
|
||||
"0 000001.SZ 20180104 0.2 12.7 7.2 7.9 \n",
|
||||
"1 000002.SZ 20180104 0.3 31.8 14.1 15.6 \n",
|
||||
"2 000004.SZ 20180104 0.8 53.2 21.6 22.0 \n",
|
||||
"3 000008.SZ 20180104 0.1 13.9 7.2 7.8 \n",
|
||||
"4 000009.SZ 20180104 0.3 15.0 5.9 5.9 \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"3091 603991.SH 20180104 12.0 67.8 26.4 27.0 \n",
|
||||
"3092 603993.SH 20180104 1.5 8.1 5.6 5.8 \n",
|
||||
"3093 603997.SH 20180104 5.4 31.5 9.9 10.2 \n",
|
||||
"3094 603998.SH 20180104 3.9 18.9 9.8 10.1 \n",
|
||||
"3095 603999.SH 20180104 5.4 30.9 6.9 7.2 \n",
|
||||
"\n",
|
||||
" cost_50pct cost_85pct cost_95pct weight_avg winner_rate \n",
|
||||
"0 10.6 11.3 11.9 9.93 71.97 \n",
|
||||
"1 20.1 23.1 24.3 19.62 99.34 \n",
|
||||
"2 23.6 27.6 29.6 24.71 45.41 \n",
|
||||
"3 8.6 9.2 10.5 8.64 47.04 \n",
|
||||
"4 6.6 7.6 7.9 6.76 38.14 \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"3091 27.6 30.6 34.2 28.54 57.36 \n",
|
||||
"3092 6.3 7.1 7.6 6.34 73.50 \n",
|
||||
"3093 10.5 11.7 11.7 10.84 11.28 \n",
|
||||
"3094 11.9 13.5 15.7 12.13 17.93 \n",
|
||||
"3095 7.8 9.6 9.9 8.17 21.83 \n",
|
||||
"\n",
|
||||
"[3096 rows x 11 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"df = pro.cyq_perf(trade_date='20180104')\n",
|
||||
"print(df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "1b5a82fbf4e380de",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-12T15:30:20.421604Z",
|
||||
"start_time": "2025-03-12T15:30:20.224851Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"h5_filename = '../../../data/sw_daily.h5'\n",
|
||||
"\n",
|
||||
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250420')\n",
|
||||
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
|
||||
"trade_dates = trade_cal['cal_date'].tolist()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f448da220816bf98",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"start_time": "2025-03-12T15:30:20.436796Z"
|
||||
},
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"任务 20250418 完成\n",
|
||||
"任务 20250417 完成\n",
|
||||
"任务 20250416 完成\n",
|
||||
"任务 20250415 完成\n",
|
||||
"任务 20250414 完成\n",
|
||||
"任务 20250411 完成\n",
|
||||
"任务 20250410 完成\n",
|
||||
"任务 20250409 完成\n",
|
||||
"任务 20250408 完成\n",
|
||||
"任务 20250407 完成\n",
|
||||
"任务 20250403 完成\n",
|
||||
"任务 20250402 完成\n",
|
||||
"任务 20250401 完成\n",
|
||||
"任务 20250331 完成\n",
|
||||
"任务 20250328 完成\n",
|
||||
"任务 20250327 完成\n",
|
||||
"任务 20250326 完成\n",
|
||||
"任务 20250325 完成\n",
|
||||
"任务 20250324 完成\n",
|
||||
"任务 20250321 完成\n",
|
||||
"任务 20250320 完成\n",
|
||||
"任务 20250319 完成\n",
|
||||
"任务 20250318 完成\n",
|
||||
"任务 20250317 完成\n",
|
||||
"任务 20250314 完成\n",
|
||||
"任务 20250313 完成\n",
|
||||
"任务 20250312 完成\n",
|
||||
"任务 20250311 完成\n",
|
||||
"任务 20250310 完成\n",
|
||||
"任务 20250307 完成\n",
|
||||
"任务 20250306 完成\n",
|
||||
"任务 20250305 完成\n",
|
||||
"任务 20250304 完成\n",
|
||||
"任务 20250303 完成\n",
|
||||
"任务 20250228 完成\n",
|
||||
"任务 20250227 完成\n",
|
||||
"任务 20250226 完成\n",
|
||||
"任务 20250225 完成\n",
|
||||
"任务 20250224 完成\n",
|
||||
"任务 20250221 完成\n",
|
||||
"任务 20250220 完成\n",
|
||||
"任务 20250219 完成\n",
|
||||
"任务 20250218 完成\n",
|
||||
"任务 20250217 完成\n",
|
||||
"任务 20250214 完成\n",
|
||||
"任务 20250213 完成\n",
|
||||
"任务 20250212 完成\n",
|
||||
"任务 20250211 完成\n",
|
||||
"任务 20250210 完成\n",
|
||||
"任务 20250207 完成\n",
|
||||
"任务 20250206 完成\n",
|
||||
"任务 20250205 完成\n",
|
||||
"任务 20250127 完成\n",
|
||||
"任务 20250124 完成\n",
|
||||
"任务 20250123 完成\n",
|
||||
"任务 20250122 完成\n",
|
||||
"任务 20250121 完成\n",
|
||||
"任务 20250120 完成\n",
|
||||
"任务 20250117 完成\n",
|
||||
"任务 20250116 完成\n",
|
||||
"任务 20250115 完成\n",
|
||||
"任务 20250114 完成\n",
|
||||
"任务 20250113 完成\n",
|
||||
"任务 20250110 完成\n",
|
||||
"任务 20250109 完成\n",
|
||||
"任务 20250108 完成\n",
|
||||
"任务 20250107 完成\n",
|
||||
"任务 20250106 完成\n",
|
||||
"任务 20250103 完成\n",
|
||||
"任务 20250102 完成\n",
|
||||
"任务 20241231 完成\n",
|
||||
"任务 20241230 完成\n",
|
||||
"任务 20241227 完成\n",
|
||||
"任务 20241226 完成\n",
|
||||
"任务 20241225 完成\n",
|
||||
"任务 20241224 完成\n",
|
||||
"任务 20241223 完成\n",
|
||||
"任务 20241220 完成\n",
|
||||
"任务 20241219 完成\n",
|
||||
"任务 20241218 完成\n",
|
||||
"任务 20241217 完成\n",
|
||||
"任务 20241216 完成\n",
|
||||
"任务 20241213 完成\n",
|
||||
"任务 20241212 完成\n",
|
||||
"任务 20241211 完成\n",
|
||||
"任务 20241210 完成\n",
|
||||
"任务 20241209 完成\n",
|
||||
"任务 20241206 完成\n",
|
||||
"任务 20241205 完成\n",
|
||||
"任务 20241204 完成\n",
|
||||
"任务 20241203 完成\n",
|
||||
"任务 20241202 完成\n",
|
||||
"任务 20241129 完成\n",
|
||||
"任务 20241128 完成\n",
|
||||
"任务 20241127 完成\n",
|
||||
"任务 20241126 完成\n",
|
||||
"任务 20241125 完成\n",
|
||||
"任务 20241122 完成\n",
|
||||
"任务 20241121 完成\n",
|
||||
"任务 20241120 完成\n",
|
||||
"任务 20241119 完成\n",
|
||||
"任务 20241118 完成\n",
|
||||
"任务 20241115 完成\n",
|
||||
"任务 20241114 完成\n",
|
||||
"任务 20241113 完成\n",
|
||||
"任务 20241112 完成\n",
|
||||
"任务 20241111 完成\n",
|
||||
"任务 20241108 完成\n",
|
||||
"任务 20241107 完成\n",
|
||||
"任务 20241106 完成\n",
|
||||
"任务 20241105 完成\n",
|
||||
"任务 20241104 完成\n",
|
||||
"任务 20241101 完成\n",
|
||||
"任务 20241031 完成\n",
|
||||
"任务 20241030 完成\n",
|
||||
"任务 20241029 完成\n",
|
||||
"任务 20241028 完成\n",
|
||||
"任务 20241025 完成\n",
|
||||
"任务 20241024 完成\n",
|
||||
"任务 20241022 完成\n",
|
||||
"任务 20241023 完成\n",
|
||||
"任务 20241021 完成\n",
|
||||
"任务 20241018 完成\n",
|
||||
"任务 20241017 完成\n",
|
||||
"任务 20241016 完成\n",
|
||||
"任务 20241015 完成\n",
|
||||
"任务 20241014 完成\n",
|
||||
"任务 20241010 完成\n",
|
||||
"任务 20241011 完成\n",
|
||||
"任务 20241009 完成\n",
|
||||
"任务 20241008 完成\n",
|
||||
"任务 20240930 完成\n",
|
||||
"任务 20240927 完成\n",
|
||||
"任务 20240926 完成\n",
|
||||
"任务 20240925 完成\n",
|
||||
"任务 20240924 完成\n",
|
||||
"任务 20240923 完成\n",
|
||||
"任务 20240919 完成\n",
|
||||
"任务 20240920 完成\n",
|
||||
"任务 20240913 完成\n",
|
||||
"任务 20240918 完成\n",
|
||||
"任务 20240911 完成\n",
|
||||
"任务 20240912 完成\n",
|
||||
"任务 20240910 完成\n",
|
||||
"任务 20240909 完成\n",
|
||||
"任务 20240905 完成\n",
|
||||
"任务 20240906 完成\n",
|
||||
"任务 20240904 完成\n",
|
||||
"任务 20240903 完成\n",
|
||||
"任务 20240902 完成\n",
|
||||
"任务 20240830 完成\n",
|
||||
"任务 20240829 完成\n",
|
||||
"任务 20240828 完成\n",
|
||||
"任务 20240827 完成\n",
|
||||
"任务 20240826 完成\n",
|
||||
"任务 20240823 完成\n",
|
||||
"任务 20240822 完成\n",
|
||||
"任务 20240821 完成\n",
|
||||
"任务 20240820 完成\n",
|
||||
"任务 20240819 完成\n",
|
||||
"任务 20240816 完成\n",
|
||||
"任务 20240815 完成\n",
|
||||
"任务 20240814 完成\n",
|
||||
"任务 20240813 完成\n",
|
||||
"任务 20240812 完成\n",
|
||||
"任务 20240809 完成\n",
|
||||
"任务 20240808 完成\n",
|
||||
"任务 20240807 完成\n",
|
||||
"任务 20240806 完成\n",
|
||||
"任务 20240805 完成\n",
|
||||
"任务 20240802 完成\n",
|
||||
"任务 20240801 完成\n",
|
||||
"任务 20240731 完成\n",
|
||||
"任务 20240730 完成\n",
|
||||
"任务 20240729 完成\n",
|
||||
"任务 20240726 完成\n",
|
||||
"任务 20240725 完成\n",
|
||||
"任务 20240724 完成\n",
|
||||
"任务 20240723 完成\n",
|
||||
"任务 20240722 完成\n",
|
||||
"任务 20240719 完成\n",
|
||||
"任务 20240718 完成\n",
|
||||
"任务 20240717 完成\n",
|
||||
"任务 20240716 完成\n",
|
||||
"任务 20240715 完成\n",
|
||||
"任务 20240712 完成\n",
|
||||
"任务 20240711 完成\n",
|
||||
"任务 20240710 完成\n",
|
||||
"任务 20240709 完成\n",
|
||||
"任务 20240708 完成\n",
|
||||
"任务 20240705 完成\n",
|
||||
"任务 20240704 完成\n",
|
||||
"任务 20240703 完成\n",
|
||||
"任务 20240702 完成\n",
|
||||
"任务 20240701 完成\n",
|
||||
"任务 20240628 完成\n",
|
||||
"任务 20240627 完成\n",
|
||||
"任务 20240626 完成\n",
|
||||
"任务 20240625 完成\n",
|
||||
"任务 20240624 完成\n",
|
||||
"任务 20240621 完成\n",
|
||||
"任务 20240620 完成\n",
|
||||
"任务 20240619 完成\n",
|
||||
"任务 20240618 完成\n",
|
||||
"任务 20240617 完成\n",
|
||||
"任务 20240614 完成\n",
|
||||
"任务 20240613 完成\n",
|
||||
"任务 20240612 完成\n",
|
||||
"任务 20240611 完成\n",
|
||||
"任务 20240607 完成\n",
|
||||
"任务 20240606 完成\n",
|
||||
"任务 20240605 完成\n",
|
||||
"任务 20240604 完成\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||||
"\n",
|
||||
"all_daily_data = []\n",
|
||||
"\n",
|
||||
"# API 调用计数和时间控制变量\n",
|
||||
"api_call_count = 0\n",
|
||||
"batch_start_time = time.time()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_data(trade_date):\n",
|
||||
" time.sleep(0.1)\n",
|
||||
" data = pro.cyq_perf(trade_date=trade_date)\n",
|
||||
" if data is not None and not data.empty:\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
|
||||
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
|
||||
"\n",
|
||||
" for future in as_completed(future_to_date):\n",
|
||||
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
|
||||
" try:\n",
|
||||
" result = future.result() # 获取任务执行的结果\n",
|
||||
" all_daily_data.append(result)\n",
|
||||
" print(f\"任务 {trade_date} 完成\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "907f732d3c397bf",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-12T15:31:10.381348500Z",
|
||||
"start_time": "2025-03-12T15:23:41.345460Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# 将所有数据合并为一个 DataFrame\n",
|
||||
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
|
||||
"\n",
|
||||
"# 将数据保存为 HDF5 文件(table 格式)\n",
|
||||
"all_daily_data_df.to_hdf('../../data/cyq_perf.h5', key='cyq_perf', mode='w', format='table', data_columns=True)\n",
|
||||
"\n",
|
||||
"print(\"所有每日基础数据获取并保存完毕!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "73e829ac-ff3d-408e-beb3-0b87f5b00b19",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ts_code trade_date\n",
|
||||
"0 000001.SZ 20250312\n",
|
||||
"1 000002.SZ 20250312\n",
|
||||
"2 000004.SZ 20250312\n",
|
||||
"3 000006.SZ 20250312\n",
|
||||
"4 000007.SZ 20250312\n",
|
||||
"... ... ...\n",
|
||||
"7465732 603991.SH 20180102\n",
|
||||
"7465733 603993.SH 20180102\n",
|
||||
"7465734 603997.SH 20180102\n",
|
||||
"7465735 603998.SH 20180102\n",
|
||||
"7465736 603999.SH 20180102\n",
|
||||
"\n",
|
||||
"[7465737 rows x 2 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"h5_filename = '../../data/cyq_perf.h5'\n",
|
||||
"key = '/cyq_perf'\n",
|
||||
"max_date = None\n",
|
||||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||||
" df = store[key][['ts_code', 'trade_date']]\n",
|
||||
" print(df)\n",
|
||||
" max_date = df['trade_date'].min()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
148
code/data/index_and_industry.ipynb
Normal file
148
code/data/index_and_industry.ipynb
Normal file
@@ -0,0 +1,148 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "initial_id",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:23.864275Z",
|
||||
"start_time": "2025-03-30T16:42:22.963221Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"from operator import index\n",
|
||||
"\n",
|
||||
"import tushare as ts\n",
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||||
"pro = ts.pro_api()"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 1
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "f448da220816bf98",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:25.559047Z",
|
||||
"start_time": "2025-03-30T16:42:23.868783Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# 定义四个指数\n",
|
||||
"index_list = ['399300.SH', '000905.SH', '000852.SH', '399006.SZ']\n",
|
||||
"\n",
|
||||
"# 获取并存储数据\n",
|
||||
"all_data = []\n",
|
||||
"\n",
|
||||
"for ts_code in index_list:\n",
|
||||
" df = pro.index_daily(ts_code=ts_code) # 可根据需要设置日期\n",
|
||||
" df['ts_code'] = ts_code # 添加ts_code列来区分数据\n",
|
||||
" all_data.append(df)\n",
|
||||
"\n",
|
||||
"# 合并所有数据\n",
|
||||
"final_df = pd.concat(all_data, ignore_index=True)\n",
|
||||
"\n",
|
||||
"# 存储到H5文件\n",
|
||||
"final_df.to_hdf('../../data/index_data.h5', key='index_data', mode='w')\n",
|
||||
"\n",
|
||||
"print(\"数据已经成功存储到index_data.h5文件中\")"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"数据已经成功存储到index_data.h5文件中\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\liaozhaorun\\AppData\\Local\\Temp\\ipykernel_6192\\3209233630.py:13: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
||||
" final_df = pd.concat(all_data, ignore_index=True)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 2
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "907f732d3c397bf",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:25.802535Z",
|
||||
"start_time": "2025-03-30T16:42:25.766399Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"h5_filename = '../../data/index_data.h5'\n",
|
||||
"key = '/index_data'\n",
|
||||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||||
" df = store[key]\n",
|
||||
" print(df)\n"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ts_code trade_date close open high low \\\n",
|
||||
"0 000905.SH 20250328 5916.0314 5954.7297 5973.8015 5904.9159 \n",
|
||||
"1 000905.SH 20250327 5957.6017 5932.5165 6000.6615 5891.7664 \n",
|
||||
"2 000905.SH 20250326 5948.4986 5935.8537 5983.4739 5935.8537 \n",
|
||||
"3 000905.SH 20250325 5946.9510 5969.4164 5993.9312 5929.6734 \n",
|
||||
"4 000905.SH 20250324 5969.0789 5973.0466 5987.0606 5882.8780 \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"13423 399006.SZ 20100607 1069.4680 1005.0280 1075.2250 1001.7020 \n",
|
||||
"13424 399006.SZ 20100604 1027.6810 989.6810 1027.6810 986.5040 \n",
|
||||
"13425 399006.SZ 20100603 998.3940 1002.3550 1026.7020 997.7750 \n",
|
||||
"13426 399006.SZ 20100602 997.1190 967.6090 997.1190 952.6110 \n",
|
||||
"13427 399006.SZ 20100601 973.2330 986.0150 994.7930 948.1180 \n",
|
||||
"\n",
|
||||
" pre_close change pct_chg vol amount \n",
|
||||
"0 5957.6017 -41.5703 -0.6978 1.342619e+08 1.688995e+08 \n",
|
||||
"1 5948.4986 9.1031 0.1530 1.347089e+08 1.765905e+08 \n",
|
||||
"2 5946.9510 1.5476 0.0260 1.367021e+08 1.716958e+08 \n",
|
||||
"3 5969.0789 -22.1279 -0.3707 1.474839e+08 1.922270e+08 \n",
|
||||
"4 5971.9302 -2.8513 -0.0477 1.691924e+08 2.200943e+08 \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"13423 1027.6810 41.7870 4.0661 2.655275e+06 9.106095e+06 \n",
|
||||
"13424 998.3940 29.2870 2.9334 1.500295e+06 5.269441e+06 \n",
|
||||
"13425 997.1190 1.2750 0.1279 1.616805e+06 6.240835e+06 \n",
|
||||
"13426 973.2330 23.8860 2.4543 1.074628e+06 4.001206e+06 \n",
|
||||
"13427 1000.0000 -26.7670 -2.6767 1.356285e+06 4.924177e+06 \n",
|
||||
"\n",
|
||||
"[13428 rows x 11 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 3
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
2167
code/data/industry_daily.ipynb
Normal file
2167
code/data/industry_daily.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
5894
code/data/industry_data.ipynb
Normal file
5894
code/data/industry_data.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
273
code/data/kpl_concept.ipynb
Normal file
273
code/data/kpl_concept.ipynb
Normal file
@@ -0,0 +1,273 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "initial_id",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-12T15:28:49.275220Z",
|
||||
"start_time": "2025-03-12T15:28:48.624632Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from operator import index\n",
|
||||
"\n",
|
||||
"import tushare as ts\n",
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||||
"pro = ts.pro_api()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "972a5ac9f79fe373",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-12T15:28:49.280632Z",
|
||||
"start_time": "2025-03-12T15:28:49.275220Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# df = pro.cyq_perf(start_date='20220101', end_date='20220429')\n",
|
||||
"# print(df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "f448da220816bf98",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-12T15:39:50.128089Z",
|
||||
"start_time": "2025-03-12T15:28:49.437760Z"
|
||||
},
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"成功获取并保存 20250228 的每日基础数据\n",
|
||||
"成功获取并保存 20250227 的每日基础数据\n",
|
||||
"成功获取并保存 20250226 的每日基础数据\n",
|
||||
"成功获取并保存 20250225 的每日基础数据\n",
|
||||
"成功获取并保存 20250224 的每日基础数据\n",
|
||||
"成功获取并保存 20250221 的每日基础数据\n",
|
||||
"成功获取并保存 20250220 的每日基础数据\n",
|
||||
"成功获取并保存 20250219 的每日基础数据\n",
|
||||
"成功获取并保存 20250218 的每日基础数据\n",
|
||||
"成功获取并保存 20250217 的每日基础数据\n",
|
||||
"成功获取并保存 20250214 的每日基础数据\n",
|
||||
"成功获取并保存 20250213 的每日基础数据\n",
|
||||
"成功获取并保存 20250212 的每日基础数据\n",
|
||||
"成功获取并保存 20250211 的每日基础数据\n",
|
||||
"成功获取并保存 20250210 的每日基础数据\n",
|
||||
"成功获取并保存 20250207 的每日基础数据\n",
|
||||
"成功获取并保存 20250206 的每日基础数据\n",
|
||||
"成功获取并保存 20250205 的每日基础数据\n",
|
||||
"成功获取并保存 20250127 的每日基础数据\n",
|
||||
"成功获取并保存 20250124 的每日基础数据\n",
|
||||
"成功获取并保存 20250123 的每日基础数据\n",
|
||||
"成功获取并保存 20250122 的每日基础数据\n",
|
||||
"成功获取并保存 20250121 的每日基础数据\n",
|
||||
"成功获取并保存 20250120 的每日基础数据\n",
|
||||
"成功获取并保存 20250117 的每日基础数据\n",
|
||||
"成功获取并保存 20250116 的每日基础数据\n",
|
||||
"成功获取并保存 20250115 的每日基础数据\n",
|
||||
"成功获取并保存 20250114 的每日基础数据\n",
|
||||
"成功获取并保存 20250113 的每日基础数据\n",
|
||||
"成功获取并保存 20250110 的每日基础数据\n",
|
||||
"成功获取并保存 20250109 的每日基础数据\n",
|
||||
"成功获取并保存 20250108 的每日基础数据\n",
|
||||
"成功获取并保存 20250107 的每日基础数据\n",
|
||||
"成功获取并保存 20250106 的每日基础数据\n",
|
||||
"成功获取并保存 20250103 的每日基础数据\n",
|
||||
"成功获取并保存 20250102 的每日基础数据\n",
|
||||
"成功获取并保存 20241231 的每日基础数据\n",
|
||||
"成功获取并保存 20241230 的每日基础数据\n",
|
||||
"成功获取并保存 20241227 的每日基础数据\n",
|
||||
"成功获取并保存 20241226 的每日基础数据\n",
|
||||
"成功获取并保存 20241225 的每日基础数据\n",
|
||||
"成功获取并保存 20241224 的每日基础数据\n",
|
||||
"成功获取并保存 20241223 的每日基础数据\n",
|
||||
"成功获取并保存 20241220 的每日基础数据\n",
|
||||
"成功获取并保存 20241219 的每日基础数据\n",
|
||||
"成功获取并保存 20241218 的每日基础数据\n",
|
||||
"成功获取并保存 20241217 的每日基础数据\n",
|
||||
"成功获取并保存 20241216 的每日基础数据\n",
|
||||
"成功获取并保存 20241213 的每日基础数据\n",
|
||||
"成功获取并保存 20241212 的每日基础数据\n",
|
||||
"成功获取并保存 20241211 的每日基础数据\n",
|
||||
"成功获取并保存 20241210 的每日基础数据\n",
|
||||
"成功获取并保存 20241209 的每日基础数据\n",
|
||||
"成功获取并保存 20241206 的每日基础数据\n",
|
||||
"成功获取并保存 20241205 的每日基础数据\n",
|
||||
"成功获取并保存 20241204 的每日基础数据\n",
|
||||
"成功获取并保存 20241203 的每日基础数据\n",
|
||||
"成功获取并保存 20241202 的每日基础数据\n",
|
||||
"成功获取并保存 20241129 的每日基础数据\n",
|
||||
"成功获取并保存 20241128 的每日基础数据\n",
|
||||
"成功获取并保存 20241127 的每日基础数据\n",
|
||||
"成功获取并保存 20241126 的每日基础数据\n",
|
||||
"成功获取并保存 20241125 的每日基础数据\n",
|
||||
"成功获取并保存 20241122 的每日基础数据\n",
|
||||
"成功获取并保存 20241121 的每日基础数据\n",
|
||||
"成功获取并保存 20241120 的每日基础数据\n",
|
||||
"成功获取并保存 20241119 的每日基础数据\n",
|
||||
"成功获取并保存 20241118 的每日基础数据\n",
|
||||
"成功获取并保存 20241115 的每日基础数据\n",
|
||||
"成功获取并保存 20241114 的每日基础数据\n",
|
||||
"成功获取并保存 20241113 的每日基础数据\n",
|
||||
"成功获取并保存 20241112 的每日基础数据\n",
|
||||
"成功获取并保存 20241111 的每日基础数据\n",
|
||||
"成功获取并保存 20241108 的每日基础数据\n",
|
||||
"成功获取并保存 20241107 的每日基础数据\n",
|
||||
"成功获取并保存 20241106 的每日基础数据\n",
|
||||
"成功获取并保存 20241105 的每日基础数据\n",
|
||||
"成功获取并保存 20241104 的每日基础数据\n",
|
||||
"成功获取并保存 20241101 的每日基础数据\n",
|
||||
"成功获取并保存 20241031 的每日基础数据\n",
|
||||
"成功获取并保存 20241030 的每日基础数据\n",
|
||||
"成功获取并保存 20241029 的每日基础数据\n",
|
||||
"成功获取并保存 20241028 的每日基础数据\n",
|
||||
"成功获取并保存 20241025 的每日基础数据\n",
|
||||
"成功获取并保存 20241024 的每日基础数据\n",
|
||||
"成功获取并保存 20241023 的每日基础数据\n",
|
||||
"成功获取并保存 20241022 的每日基础数据\n",
|
||||
"成功获取并保存 20241021 的每日基础数据\n",
|
||||
"成功获取并保存 20241014 的每日基础数据\n",
|
||||
"150 1741835004.3988936 1741834982.2357981\n",
|
||||
"已调用 150 次 API,等待 37.84 秒以满足速率限制...\n",
|
||||
"300 1741835064.0700593 1741835042.2372077\n",
|
||||
"已调用 150 次 API,等待 38.17 秒以满足速率限制...\n",
|
||||
"450 1741835124.4976892 1741835102.2381623\n",
|
||||
"已调用 150 次 API,等待 37.74 秒以满足速率限制...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[4], line 22\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m trade_date \u001b[38;5;129;01min\u001b[39;00m trade_dates:\n\u001b[0;32m 20\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# 获取每日基础数据\u001b[39;00m\n\u001b[1;32m---> 22\u001b[0m kpl_concept \u001b[38;5;241m=\u001b[39m pro\u001b[38;5;241m.\u001b[39mkpl_concept(trade_date\u001b[38;5;241m=\u001b[39mtrade_date)\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kpl_concept \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kpl_concept\u001b[38;5;241m.\u001b[39mempty:\n\u001b[0;32m 24\u001b[0m all_daily_data\u001b[38;5;241m.\u001b[39mappend(kpl_concept)\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\tushare\\pro\\client.py:41\u001b[0m, in \u001b[0;36mDataApi.query\u001b[1;34m(self, api_name, fields, **kwargs)\u001b[0m\n\u001b[0;32m 33\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mquery\u001b[39m(\u001b[38;5;28mself\u001b[39m, api_name, fields\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m 34\u001b[0m req_params \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m 35\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mapi_name\u001b[39m\u001b[38;5;124m'\u001b[39m: api_name,\n\u001b[0;32m 36\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtoken\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__token,\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mparams\u001b[39m\u001b[38;5;124m'\u001b[39m: kwargs,\n\u001b[0;32m 38\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfields\u001b[39m\u001b[38;5;124m'\u001b[39m: fields\n\u001b[0;32m 39\u001b[0m }\n\u001b[1;32m---> 41\u001b[0m res \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mpost(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__http_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mapi_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, json\u001b[38;5;241m=\u001b[39mreq_params, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__timeout)\n\u001b[0;32m 42\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m res:\n\u001b[0;32m 43\u001b[0m result \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mloads(res\u001b[38;5;241m.\u001b[39mtext)\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\requests\\api.py:115\u001b[0m, in \u001b[0;36mpost\u001b[1;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[0;32m 103\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpost\u001b[39m(url, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, json\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m 104\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Sends a POST request.\u001b[39;00m\n\u001b[0;32m 105\u001b[0m \n\u001b[0;32m 106\u001b[0m \u001b[38;5;124;03m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 112\u001b[0m \u001b[38;5;124;03m :rtype: requests.Response\u001b[39;00m\n\u001b[0;32m 113\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m request(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpost\u001b[39m\u001b[38;5;124m\"\u001b[39m, url, data\u001b[38;5;241m=\u001b[39mdata, json\u001b[38;5;241m=\u001b[39mjson, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\requests\\api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[1;34m(method, url, **kwargs)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;66;03m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m sessions\u001b[38;5;241m.\u001b[39mSession() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[1;32m---> 59\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m session\u001b[38;5;241m.\u001b[39mrequest(method\u001b[38;5;241m=\u001b[39mmethod, url\u001b[38;5;241m=\u001b[39murl, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\requests\\sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[1;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[0;32m 584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m 585\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[0;32m 586\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[0;32m 587\u001b[0m }\n\u001b[0;32m 588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[1;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msend(prep, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39msend_kwargs)\n\u001b[0;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\requests\\sessions.py:724\u001b[0m, in \u001b[0;36mSession.send\u001b[1;34m(self, request, **kwargs)\u001b[0m\n\u001b[0;32m 721\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m allow_redirects:\n\u001b[0;32m 722\u001b[0m \u001b[38;5;66;03m# Redirect resolving generator.\u001b[39;00m\n\u001b[0;32m 723\u001b[0m gen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresolve_redirects(r, request, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m--> 724\u001b[0m history \u001b[38;5;241m=\u001b[39m [resp \u001b[38;5;28;01mfor\u001b[39;00m resp \u001b[38;5;129;01min\u001b[39;00m gen]\n\u001b[0;32m 725\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 726\u001b[0m history \u001b[38;5;241m=\u001b[39m []\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\requests\\sessions.py:724\u001b[0m, in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 721\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m allow_redirects:\n\u001b[0;32m 722\u001b[0m \u001b[38;5;66;03m# Redirect resolving generator.\u001b[39;00m\n\u001b[0;32m 723\u001b[0m gen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresolve_redirects(r, request, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m--> 724\u001b[0m history \u001b[38;5;241m=\u001b[39m [resp \u001b[38;5;28;01mfor\u001b[39;00m resp \u001b[38;5;129;01min\u001b[39;00m gen]\n\u001b[0;32m 725\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 726\u001b[0m history \u001b[38;5;241m=\u001b[39m []\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\requests\\sessions.py:265\u001b[0m, in \u001b[0;36mSessionRedirectMixin.resolve_redirects\u001b[1;34m(self, resp, req, stream, timeout, verify, cert, proxies, yield_requests, **adapter_kwargs)\u001b[0m\n\u001b[0;32m 263\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m req\n\u001b[0;32m 264\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 265\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msend(\n\u001b[0;32m 266\u001b[0m req,\n\u001b[0;32m 267\u001b[0m stream\u001b[38;5;241m=\u001b[39mstream,\n\u001b[0;32m 268\u001b[0m timeout\u001b[38;5;241m=\u001b[39mtimeout,\n\u001b[0;32m 269\u001b[0m verify\u001b[38;5;241m=\u001b[39mverify,\n\u001b[0;32m 270\u001b[0m cert\u001b[38;5;241m=\u001b[39mcert,\n\u001b[0;32m 271\u001b[0m proxies\u001b[38;5;241m=\u001b[39mproxies,\n\u001b[0;32m 272\u001b[0m allow_redirects\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 273\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39madapter_kwargs,\n\u001b[0;32m 274\u001b[0m )\n\u001b[0;32m 276\u001b[0m extract_cookies_to_jar(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcookies, prepared_request, resp\u001b[38;5;241m.\u001b[39mraw)\n\u001b[0;32m 278\u001b[0m \u001b[38;5;66;03m# extract redirect url, if any, for the next loop\u001b[39;00m\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\requests\\sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[1;34m(self, request, **kwargs)\u001b[0m\n\u001b[0;32m 700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[0;32m 702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m adapter\u001b[38;5;241m.\u001b[39msend(request, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[0;32m 706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\requests\\adapters.py:667\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m 664\u001b[0m timeout \u001b[38;5;241m=\u001b[39m TimeoutSauce(connect\u001b[38;5;241m=\u001b[39mtimeout, read\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[0;32m 666\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 667\u001b[0m resp \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39murlopen(\n\u001b[0;32m 668\u001b[0m method\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mmethod,\n\u001b[0;32m 669\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m 670\u001b[0m body\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mbody,\n\u001b[0;32m 671\u001b[0m headers\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mheaders,\n\u001b[0;32m 672\u001b[0m redirect\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 673\u001b[0m assert_same_host\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 674\u001b[0m preload_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 675\u001b[0m decode_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 676\u001b[0m retries\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_retries,\n\u001b[0;32m 677\u001b[0m timeout\u001b[38;5;241m=\u001b[39mtimeout,\n\u001b[0;32m 678\u001b[0m chunked\u001b[38;5;241m=\u001b[39mchunked,\n\u001b[0;32m 679\u001b[0m )\n\u001b[0;32m 681\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m 682\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(err, request\u001b[38;5;241m=\u001b[39mrequest)\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\urllib3\\connectionpool.py:787\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[0m\n\u001b[0;32m 784\u001b[0m response_conn \u001b[38;5;241m=\u001b[39m conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 786\u001b[0m \u001b[38;5;66;03m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[1;32m--> 787\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_request(\n\u001b[0;32m 788\u001b[0m conn,\n\u001b[0;32m 789\u001b[0m method,\n\u001b[0;32m 790\u001b[0m url,\n\u001b[0;32m 791\u001b[0m timeout\u001b[38;5;241m=\u001b[39mtimeout_obj,\n\u001b[0;32m 792\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[0;32m 793\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[0;32m 794\u001b[0m chunked\u001b[38;5;241m=\u001b[39mchunked,\n\u001b[0;32m 795\u001b[0m retries\u001b[38;5;241m=\u001b[39mretries,\n\u001b[0;32m 796\u001b[0m response_conn\u001b[38;5;241m=\u001b[39mresponse_conn,\n\u001b[0;32m 797\u001b[0m preload_content\u001b[38;5;241m=\u001b[39mpreload_content,\n\u001b[0;32m 798\u001b[0m decode_content\u001b[38;5;241m=\u001b[39mdecode_content,\n\u001b[0;32m 799\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mresponse_kw,\n\u001b[0;32m 800\u001b[0m )\n\u001b[0;32m 802\u001b[0m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n\u001b[0;32m 803\u001b[0m clean_exit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\urllib3\\connectionpool.py:534\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[0m\n\u001b[0;32m 532\u001b[0m \u001b[38;5;66;03m# Receive the response from the server\u001b[39;00m\n\u001b[0;32m 533\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 534\u001b[0m response \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39mgetresponse()\n\u001b[0;32m 535\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (BaseSSLError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 536\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_timeout(err\u001b[38;5;241m=\u001b[39me, url\u001b[38;5;241m=\u001b[39murl, timeout_value\u001b[38;5;241m=\u001b[39mread_timeout)\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\site-packages\\urllib3\\connection.py:516\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 513\u001b[0m _shutdown \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msock, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mshutdown\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m 515\u001b[0m \u001b[38;5;66;03m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[1;32m--> 516\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mgetresponse()\n\u001b[0;32m 518\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 519\u001b[0m assert_header_parsing(httplib_response\u001b[38;5;241m.\u001b[39mmsg)\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\http\\client.py:1395\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1393\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1394\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 1395\u001b[0m response\u001b[38;5;241m.\u001b[39mbegin()\n\u001b[0;32m 1396\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[0;32m 1397\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\http\\client.py:325\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 323\u001b[0m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[0;32m 324\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m--> 325\u001b[0m version, status, reason \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_read_status()\n\u001b[0;32m 326\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m!=\u001b[39m CONTINUE:\n\u001b[0;32m 327\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\http\\client.py:286\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 285\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m--> 286\u001b[0m line \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfp\u001b[38;5;241m.\u001b[39mreadline(_MAXLINE \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miso-8859-1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) \u001b[38;5;241m>\u001b[39m _MAXLINE:\n\u001b[0;32m 288\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus line\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"File \u001b[1;32mE:\\Python\\anaconda\\envs\\new_trader\\Lib\\socket.py:718\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 716\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m 717\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 718\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sock\u001b[38;5;241m.\u001b[39mrecv_into(b)\n\u001b[0;32m 719\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[0;32m 720\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
|
||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import tushare as ts\n",
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# 获取交易日历\n",
|
||||
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250301')\n",
|
||||
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
|
||||
"trade_dates = trade_cal['cal_date'].tolist() # 获取所有交易日期列表\n",
|
||||
"\n",
|
||||
"# 使用 HDFStore 存储数据\n",
|
||||
"all_daily_data = []\n",
|
||||
"\n",
|
||||
"# API 调用计数和时间控制变量\n",
|
||||
"api_call_count = 0\n",
|
||||
"batch_start_time = time.time()\n",
|
||||
"\n",
|
||||
"# 遍历每个交易日期并获取数据\n",
|
||||
"for trade_date in trade_dates:\n",
|
||||
" try:\n",
|
||||
" # 获取每日基础数据\n",
|
||||
" kpl_concept = pro.kpl_concept(trade_date=trade_date)\n",
|
||||
" if kpl_concept is not None and not kpl_concept.empty:\n",
|
||||
" all_daily_data.append(kpl_concept)\n",
|
||||
" print(f\"成功获取并保存 {trade_date} 的每日基础数据\")\n",
|
||||
"\n",
|
||||
" # 计数一次 API 调用\n",
|
||||
" api_call_count += 1\n",
|
||||
"\n",
|
||||
" # 每调用 150 次,检查时间是否少于 1 分钟,如果少于则等待剩余时间\n",
|
||||
" if api_call_count % 150 == 0:\n",
|
||||
" print(api_call_count,time.time(), batch_start_time)\n",
|
||||
" elapsed = time.time() - batch_start_time\n",
|
||||
" if elapsed < 60:\n",
|
||||
" sleep_time = 60 - elapsed\n",
|
||||
" print(f\"已调用 150 次 API,等待 {sleep_time:.2f} 秒以满足速率限制...\")\n",
|
||||
" time.sleep(sleep_time)\n",
|
||||
" # 重置批次起始时间\n",
|
||||
" batch_start_time = time.time()\n",
|
||||
"\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "907f732d3c397bf",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-12T15:39:50.141920800Z",
|
||||
"start_time": "2025-03-12T15:23:41.345460Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# 将所有数据合并为一个 DataFrame\n",
|
||||
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
|
||||
"\n",
|
||||
"# 将数据保存为 HDF5 文件(table 格式)\n",
|
||||
"all_daily_data_df.to_hdf('../../data/kpl_concept.h5', key='kpl_concept', mode='w', format='table', data_columns=True)\n",
|
||||
"\n",
|
||||
"print(\"所有每日基础数据获取并保存完毕!\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
194
code/data/update/cyq-perf.ipynb
Normal file
194
code/data/update/cyq-perf.ipynb
Normal file
@@ -0,0 +1,194 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "f74ce078-f7e8-4733-a14c-14d8815a3626",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:31.596637Z",
|
||||
"start_time": "2025-03-30T16:42:30.883319Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"import tushare as ts\n",
|
||||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||||
"pro = ts.pro_api()"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 1
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "44dd8d87-e60b-49e5-aed9-efaa7f92d4fe",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:37.590148Z",
|
||||
"start_time": "2025-03-30T16:42:31.596637Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"h5_filename = '../../../data/cyq_perf.h5'\n",
|
||||
"key = '/cyq_perf'\n",
|
||||
"max_date = None\n",
|
||||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||||
" df = store[key][['ts_code', 'trade_date']]\n",
|
||||
" print(df)\n",
|
||||
" max_date = df['trade_date'].max()\n",
|
||||
"\n",
|
||||
"print(max_date)\n",
|
||||
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250420')\n",
|
||||
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
|
||||
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
|
||||
"start_date = min(trade_dates)\n",
|
||||
"print(f'start_date: {start_date}')"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ts_code trade_date\n",
|
||||
"0 000001.SZ 20250312\n",
|
||||
"1 000002.SZ 20250312\n",
|
||||
"2 000004.SZ 20250312\n",
|
||||
"3 000006.SZ 20250312\n",
|
||||
"4 000007.SZ 20250312\n",
|
||||
"... ... ...\n",
|
||||
"32304 920108.BJ 20250314\n",
|
||||
"32305 920111.BJ 20250314\n",
|
||||
"32306 920116.BJ 20250314\n",
|
||||
"32307 920118.BJ 20250314\n",
|
||||
"32308 920128.BJ 20250314\n",
|
||||
"\n",
|
||||
"[7503415 rows x 2 columns]\n",
|
||||
"20250321\n",
|
||||
"start_date: 20250324\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 2
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "747acc47-0884-4f76-90fb-276f6494e31d",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:43:29.275885Z",
|
||||
"start_time": "2025-03-30T16:42:37.858763Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||||
"\n",
|
||||
"all_daily_data = []\n",
|
||||
"\n",
|
||||
"# API 调用计数和时间控制变量\n",
|
||||
"api_call_count = 0\n",
|
||||
"batch_start_time = time.time()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_data(trade_date):\n",
|
||||
" time.sleep(0.1)\n",
|
||||
" data = pro.cyq_perf(trade_date=trade_date)\n",
|
||||
" if data is not None and not data.empty:\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
|
||||
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
|
||||
"\n",
|
||||
" for future in as_completed(future_to_date):\n",
|
||||
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
|
||||
" try:\n",
|
||||
" result = future.result() # 获取任务执行的结果\n",
|
||||
" all_daily_data.append(result)\n",
|
||||
" print(f\"任务 {trade_date} 完成\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
|
||||
"\n"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"任务 20250418 完成\n",
|
||||
"任务 20250417 完成\n",
|
||||
"任务 20250415 完成\n",
|
||||
"任务 20250416 完成\n",
|
||||
"任务 20250411 完成\n",
|
||||
"任务 20250414 完成\n",
|
||||
"任务 20250409 完成\n",
|
||||
"任务 20250410 完成\n",
|
||||
"任务 20250408 完成\n",
|
||||
"任务 20250407 完成\n",
|
||||
"任务 20250403 完成\n",
|
||||
"任务 20250402 完成\n",
|
||||
"任务 20250401 完成\n",
|
||||
"任务 20250331 完成\n",
|
||||
"任务 20250328 完成\n",
|
||||
"任务 20250327 完成\n",
|
||||
"任务 20250326 完成\n",
|
||||
"任务 20250325 完成\n",
|
||||
"任务 20250324 完成\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 3
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "c6765638-481f-40d8-a259-2e7b25362618",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:43:30.100678Z",
|
||||
"start_time": "2025-03-30T16:43:29.311710Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
|
||||
"\n",
|
||||
"# 将所有数据合并为一个 DataFrame\n",
|
||||
"\n",
|
||||
"# 将数据保存为 HDF5 文件(table 格式)\n",
|
||||
"all_daily_data_df.to_hdf(h5_filename, key=key, mode='a', format='table', append=True, data_columns=True)\n",
|
||||
"\n",
|
||||
"print(\"所有每日基础数据获取并保存完毕!\")"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"所有每日基础数据获取并保存完毕!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 4
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
194
code/data/update/index_data.ipynb
Normal file
194
code/data/update/index_data.ipynb
Normal file
@@ -0,0 +1,194 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "f74ce078-f7e8-4733-a14c-14d8815a3626",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tushare as ts\n",
|
||||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||||
"pro = ts.pro_api()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "44dd8d87-e60b-49e5-aed9-efaa7f92d4fe",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ts_code trade_date\n",
|
||||
"0 801001.SI 20250221\n",
|
||||
"1 801002.SI 20250221\n",
|
||||
"2 801003.SI 20250221\n",
|
||||
"3 801005.SI 20250221\n",
|
||||
"4 801010.SI 20250221\n",
|
||||
"... ... ...\n",
|
||||
"1044388 857344.SI 20170103\n",
|
||||
"1044389 857411.SI 20170103\n",
|
||||
"1044390 857421.SI 20170103\n",
|
||||
"1044391 857431.SI 20170103\n",
|
||||
"1044392 858811.SI 20170103\n",
|
||||
"\n",
|
||||
"[1044393 rows x 2 columns]\n",
|
||||
"20250221\n",
|
||||
"start_date: 20250224\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"h5_filename = '../../../data/sw_daily.h5'\n",
|
||||
"key = '/sw_daily'\n",
|
||||
"max_date = None\n",
|
||||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||||
" df = store[key][['ts_code', 'trade_date']]\n",
|
||||
" print(df)\n",
|
||||
" max_date = df['trade_date'].max()\n",
|
||||
"\n",
|
||||
"print(max_date)\n",
|
||||
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250420')\n",
|
||||
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
|
||||
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
|
||||
"start_date = min(trade_dates)\n",
|
||||
"print(f'start_date: {start_date}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "747acc47-0884-4f76-90fb-276f6494e31d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"任务 20250417 完成\n",
|
||||
"任务 20250418 完成\n",
|
||||
"任务 20250416 完成\n",
|
||||
"任务 20250415 完成\n",
|
||||
"任务 20250411 完成\n",
|
||||
"任务 20250414 完成\n",
|
||||
"任务 20250410 完成\n",
|
||||
"任务 20250409 完成\n",
|
||||
"任务 20250408 完成\n",
|
||||
"任务 20250403 完成\n",
|
||||
"任务 20250407 完成\n",
|
||||
"任务 20250402 完成\n",
|
||||
"任务 20250401 完成\n",
|
||||
"任务 20250331 完成\n",
|
||||
"任务 20250328 完成\n",
|
||||
"任务 20250327 完成\n",
|
||||
"任务 20250326 完成\n",
|
||||
"任务 20250325 完成\n",
|
||||
"任务 20250324 完成\n",
|
||||
"任务 20250321 完成\n",
|
||||
"任务 20250320 完成\n",
|
||||
"任务 20250319 完成\n",
|
||||
"任务 20250317 完成\n",
|
||||
"任务 20250314 完成\n",
|
||||
"任务 20250318 完成\n",
|
||||
"任务 20250313 完成\n",
|
||||
"任务 20250312 完成\n",
|
||||
"任务 20250311 完成\n",
|
||||
"任务 20250310 完成\n",
|
||||
"任务 20250307 完成\n",
|
||||
"任务 20250306 完成\n",
|
||||
"任务 20250305 完成\n",
|
||||
"任务 20250304 完成\n",
|
||||
"任务 20250303 完成\n",
|
||||
"任务 20250228 完成\n",
|
||||
"任务 20250227 完成\n",
|
||||
"任务 20250226 完成\n",
|
||||
"任务 20250225 完成\n",
|
||||
"任务 20250224 完成\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||||
"\n",
|
||||
"all_daily_data = []\n",
|
||||
"\n",
|
||||
"# API 调用计数和时间控制变量\n",
|
||||
"api_call_count = 0\n",
|
||||
"batch_start_time = time.time()\n",
|
||||
"\n",
|
||||
"index_list = ['399300.SH', '000905.SH', '000852.SH', '399006.SZ']\n",
|
||||
"def get_data(trade_date):\n",
|
||||
" time.sleep(0.1)\n",
|
||||
" data = pro.sw_daily(trade_date=trade_date)\n",
|
||||
" if data is not None and not data.empty:\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
|
||||
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
|
||||
"\n",
|
||||
" for future in as_completed(future_to_date):\n",
|
||||
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
|
||||
" try:\n",
|
||||
" result = future.result() # 获取任务执行的结果\n",
|
||||
" all_daily_data.append(result)\n",
|
||||
" print(f\"任务 {trade_date} 完成\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "c6765638-481f-40d8-a259-2e7b25362618",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"所有每日基础数据获取并保存完毕!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
|
||||
"\n",
|
||||
"# 将所有数据合并为一个 DataFrame\n",
|
||||
"\n",
|
||||
"# 将数据保存为 HDF5 文件(table 格式)\n",
|
||||
"all_daily_data_df.to_hdf(h5_filename, key=key, mode='a', format='table', append=True, data_columns=True)\n",
|
||||
"\n",
|
||||
"print(\"所有每日基础数据获取并保存完毕!\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
194
code/data/update/sw_daily.ipynb
Normal file
194
code/data/update/sw_daily.ipynb
Normal file
@@ -0,0 +1,194 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "f74ce078-f7e8-4733-a14c-14d8815a3626",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:32.996500Z",
|
||||
"start_time": "2025-03-30T16:42:32.209631Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"import tushare as ts\n",
|
||||
"ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')\n",
|
||||
"pro = ts.pro_api()"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 1
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "44dd8d87-e60b-49e5-aed9-efaa7f92d4fe",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:34.591433Z",
|
||||
"start_time": "2025-03-30T16:42:32.996500Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"h5_filename = '../../../data/sw_daily.h5'\n",
|
||||
"key = '/sw_daily'\n",
|
||||
"max_date = None\n",
|
||||
"with pd.HDFStore(h5_filename, mode='r') as store:\n",
|
||||
" df = store[key][['ts_code', 'trade_date']]\n",
|
||||
" print(df)\n",
|
||||
" max_date = df['trade_date'].max()\n",
|
||||
"\n",
|
||||
"print(max_date)\n",
|
||||
"trade_cal = pro.trade_cal(exchange='', start_date='20170101', end_date='20250420')\n",
|
||||
"trade_cal = trade_cal[trade_cal['is_open'] == 1] # 只保留交易日\n",
|
||||
"trade_dates = trade_cal[trade_cal['cal_date'] > max_date]['cal_date'].tolist()\n",
|
||||
"start_date = min(trade_dates)\n",
|
||||
"print(f'start_date: {start_date}')"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ts_code trade_date\n",
|
||||
"0 801001.SI 20250221\n",
|
||||
"1 801002.SI 20250221\n",
|
||||
"2 801003.SI 20250221\n",
|
||||
"3 801005.SI 20250221\n",
|
||||
"4 801010.SI 20250221\n",
|
||||
"... ... ...\n",
|
||||
"2629 859811.SI 20250314\n",
|
||||
"2630 859821.SI 20250314\n",
|
||||
"2631 859822.SI 20250314\n",
|
||||
"2632 859852.SI 20250314\n",
|
||||
"2633 859951.SI 20250314\n",
|
||||
"\n",
|
||||
"[1053173 rows x 2 columns]\n",
|
||||
"20250321\n",
|
||||
"start_date: 20250324\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 2
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "747acc47-0884-4f76-90fb-276f6494e31d",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:37.718270Z",
|
||||
"start_time": "2025-03-30T16:42:34.817305Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
||||
"\n",
|
||||
"all_daily_data = []\n",
|
||||
"\n",
|
||||
"# API 调用计数和时间控制变量\n",
|
||||
"api_call_count = 0\n",
|
||||
"batch_start_time = time.time()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_data(trade_date):\n",
|
||||
" time.sleep(0.1)\n",
|
||||
" data = pro.sw_daily(trade_date=trade_date)\n",
|
||||
" if data is not None and not data.empty:\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with ThreadPoolExecutor(max_workers=2) as executor:\n",
|
||||
" future_to_date = {executor.submit(get_data, td): td for td in trade_dates}\n",
|
||||
"\n",
|
||||
" for future in as_completed(future_to_date):\n",
|
||||
" trade_date = future_to_date[future] # 获取对应的交易日期\n",
|
||||
" try:\n",
|
||||
" result = future.result() # 获取任务执行的结果\n",
|
||||
" all_daily_data.append(result)\n",
|
||||
" print(f\"任务 {trade_date} 完成\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"获取 {trade_date} 数据时出错: {e}\")\n",
|
||||
"\n"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"任务 20250417 完成\n",
|
||||
"任务 20250418 完成\n",
|
||||
"任务 20250416 完成\n",
|
||||
"任务 20250415 完成\n",
|
||||
"任务 20250414 完成\n",
|
||||
"任务 20250411 完成\n",
|
||||
"任务 20250410 完成\n",
|
||||
"任务 20250409 完成\n",
|
||||
"任务 20250408 完成\n",
|
||||
"任务 20250407 完成\n",
|
||||
"任务 20250403 完成\n",
|
||||
"任务 20250402 完成\n",
|
||||
"任务 20250401 完成\n",
|
||||
"任务 20250331 完成\n",
|
||||
"任务 20250328 完成\n",
|
||||
"任务 20250327 完成\n",
|
||||
"任务 20250326 完成\n",
|
||||
"任务 20250325 完成\n",
|
||||
"任务 20250324 完成\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 3
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "c6765638-481f-40d8-a259-2e7b25362618",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-03-30T16:42:37.922827Z",
|
||||
"start_time": "2025-03-30T16:42:37.739040Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"all_daily_data_df = pd.concat(all_daily_data, ignore_index=True)\n",
|
||||
"\n",
|
||||
"# 将所有数据合并为一个 DataFrame\n",
|
||||
"\n",
|
||||
"# 将数据保存为 HDF5 文件(table 格式)\n",
|
||||
"all_daily_data_df.to_hdf(h5_filename, key=key, mode='a', format='table', append=True, data_columns=True)\n",
|
||||
"\n",
|
||||
"print(\"所有每日基础数据获取并保存完毕!\")"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"所有每日基础数据获取并保存完毕!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 4
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
BIN
code/model/lightgbm_model_UpdateRegression_2025-2-25.pkl
Normal file
BIN
code/model/lightgbm_model_UpdateRegression_2025-2-25.pkl
Normal file
Binary file not shown.
5
code/test.py
Normal file
5
code/test.py
Normal file
@@ -0,0 +1,5 @@
|
||||
import torch
|
||||
print(torch.__version__)
|
||||
|
||||
print(torch.version.cuda)
|
||||
print(torch.backends.cudnn.version())
|
||||
1317
code/train/ClassifyLR.ipynb
Normal file
1317
code/train/ClassifyLR.ipynb
Normal file
File diff suppressed because one or more lines are too long
967
code/train/DoubleQuntile.ipynb
Normal file
967
code/train/DoubleQuntile.ipynb
Normal file
File diff suppressed because one or more lines are too long
1385
code/train/DoubleRank.ipynb
Normal file
1385
code/train/DoubleRank.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
1016
code/train/PlUpdateClassify.ipynb
Normal file
1016
code/train/PlUpdateClassify.ipynb
Normal file
File diff suppressed because one or more lines are too long
1250
code/train/RollingRank.py
Normal file
1250
code/train/RollingRank.py
Normal file
File diff suppressed because it is too large
Load Diff
1723
code/train/TRank.ipynb
Normal file
1723
code/train/TRank.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
1362
code/train/Transformer.ipynb
Normal file
1362
code/train/Transformer.ipynb
Normal file
File diff suppressed because one or more lines are too long
1570
code/train/UpdateRank.ipynb
Normal file
1570
code/train/UpdateRank.ipynb
Normal file
File diff suppressed because one or more lines are too long
896
code/train/V1-copy.ipynb
Normal file
896
code/train/V1-copy.ipynb
Normal file
@@ -0,0 +1,896 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:52:54.170824Z",
|
||||
"start_time": "2025-02-09T14:52:53.544850Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"\n",
|
||||
"from utils.utils import read_and_merge_h5_data"
|
||||
],
|
||||
"id": "79a7758178bafdd3",
|
||||
"outputs": [],
|
||||
"execution_count": 1
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:53:36.873700Z",
|
||||
"start_time": "2025-02-09T14:52:54.170824Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"print('daily data')\n",
|
||||
"df = read_and_merge_h5_data('../../data/daily_data.h5', key='daily_data',\n",
|
||||
" columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol'],\n",
|
||||
" df=None)\n",
|
||||
"\n",
|
||||
"print('daily basic')\n",
|
||||
"df = read_and_merge_h5_data('../../data/daily_basic.h5', key='daily_basic_with_st',\n",
|
||||
" columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',\n",
|
||||
" 'is_st'], df=df)\n",
|
||||
"\n",
|
||||
"print('stk limit')\n",
|
||||
"df = read_and_merge_h5_data('../../data/stk_limit.h5', key='stk_limit',\n",
|
||||
" columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],\n",
|
||||
" df=df)\n",
|
||||
"print('money flow')\n",
|
||||
"df = read_and_merge_h5_data('../../data/money_flow.h5', key='money_flow',\n",
|
||||
" columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'sell_lg_vol',\n",
|
||||
" 'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],\n",
|
||||
" df=df)"
|
||||
],
|
||||
"id": "a79cafb06a7e0e43",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"daily data\n",
|
||||
"daily basic\n",
|
||||
"stk limit\n",
|
||||
"money flow\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 2
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:53:37.426404Z",
|
||||
"start_time": "2025-02-09T14:53:36.955552Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "origin_columns = df.columns.tolist()",
|
||||
"id": "c4e9e1d31da6dba6",
|
||||
"outputs": [],
|
||||
"execution_count": 3
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:53:38.164112Z",
|
||||
"start_time": "2025-02-09T14:53:38.070007Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import talib\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_technical_factor(df):\n",
|
||||
" df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']\n",
|
||||
" df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']\n",
|
||||
"\n",
|
||||
" df['atr_14'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=14)\n",
|
||||
" df['atr_6'] = talib.ATR(df['high'], df['low'], df['close'], timeperiod=6)\n",
|
||||
"\n",
|
||||
" df['obv'] = talib.OBV(df['close'], df['vol'])\n",
|
||||
" df['maobv_6'] = talib.SMA(df['obv'], timeperiod=6)\n",
|
||||
" df['obv-maobv_6'] = df['obv'] - df['maobv_6']\n",
|
||||
"\n",
|
||||
" df['rsi_3'] = talib.RSI(df['close'], timeperiod=3)\n",
|
||||
" df['rsi_6'] = talib.RSI(df['close'], timeperiod=6)\n",
|
||||
" df['rsi_9'] = talib.RSI(df['close'], timeperiod=9)\n",
|
||||
"\n",
|
||||
" df['return_10'] = df['close'] / df['close'].shift(10) - 1\n",
|
||||
" df['return_20'] = df['close'] / df['close'].shift(20) - 1\n",
|
||||
"\n",
|
||||
" # # 计算 _rank_return_10 和 _rank_return_20\n",
|
||||
" # df['_rank_return_10'] = df['return_10'].rank(pct=True)\n",
|
||||
" # df['_rank_return_20'] = df['return_20'].rank(pct=True)\n",
|
||||
"\n",
|
||||
" # 计算 avg_close_5\n",
|
||||
" df['avg_close_5'] = df['close'].rolling(window=5).mean() / df['close']\n",
|
||||
"\n",
|
||||
" # 计算 std_return_5, std_return_15, std_return_25, std_return_252, std_return_2522\n",
|
||||
" df['std_return_5'] = df['close'].pct_change().shift(-1).rolling(window=5).std()\n",
|
||||
" df['std_return_15'] = df['close'].pct_change().shift(-1).rolling(window=15).std()\n",
|
||||
" df['std_return_25'] = df['close'].pct_change().shift(-1).rolling(window=25).std()\n",
|
||||
" df['std_return_90'] = df['close'].pct_change().shift(-1).rolling(window=90).std()\n",
|
||||
" df['std_return_90_2'] = df['close'].shift(10).pct_change().shift(-1).rolling(window=90).std()\n",
|
||||
"\n",
|
||||
" # 计算 std_return_5 / std_return_252 和 std_return_5 / std_return_25\n",
|
||||
" df['std_return_5 / std_return_90'] = df['std_return_5'] / df['std_return_90']\n",
|
||||
" df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']\n",
|
||||
"\n",
|
||||
" # 计算 std_return_252 - std_return_2522\n",
|
||||
" df['std_return_90 - std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_act_factor(df):\n",
|
||||
" # 计算 m_ta_ema(close, 5), m_ta_ema(close, 13), m_ta_ema(close, 20), m_ta_ema(close, 60)\n",
|
||||
" df['ema_5'] = talib.EMA(df['close'], timeperiod=5)\n",
|
||||
" df['ema_13'] = talib.EMA(df['close'], timeperiod=13)\n",
|
||||
" df['ema_20'] = talib.EMA(df['close'], timeperiod=20)\n",
|
||||
" df['ema_60'] = talib.EMA(df['close'], timeperiod=60)\n",
|
||||
"\n",
|
||||
" # 计算 act_factor1, act_factor2, act_factor3, act_factor4\n",
|
||||
" df['act_factor1'] = np.arctan((df['ema_5'] / df['ema_5'].shift(1) - 1) * 100) * 57.3 / 50\n",
|
||||
" df['act_factor2'] = np.arctan((df['ema_13'] / df['ema_13'].shift(1) - 1) * 100) * 57.3 / 40\n",
|
||||
" df['act_factor3'] = np.arctan((df['ema_20'] / df['ema_20'].shift(1) - 1) * 100) * 57.3 / 21\n",
|
||||
" df['act_factor4'] = np.arctan((df['ema_60'] / df['ema_60'].shift(1) - 1) * 100) * 57.3 / 10\n",
|
||||
"\n",
|
||||
" # 计算 act_factor5 和 act_factor6\n",
|
||||
" df['act_factor5'] = df['act_factor1'] + df['act_factor2'] + df['act_factor3'] + df['act_factor4']\n",
|
||||
" df['act_factor6'] = (df['act_factor1'] - df['act_factor2']) / np.sqrt(\n",
|
||||
" df['act_factor1'] ** 2 + df['act_factor2'] ** 2)\n",
|
||||
"\n",
|
||||
" # 根据 'trade_date' 进行分组,在每个组内分别计算 'act_factor1', 'act_factor2', 'act_factor3' 的排名\n",
|
||||
" df['rank_act_factor1'] = df.groupby('trade_date')['act_factor1'].rank(ascending=False, pct=True)\n",
|
||||
" df['rank_act_factor2'] = df.groupby('trade_date')['act_factor2'].rank(ascending=False, pct=True)\n",
|
||||
" df['rank_act_factor3'] = df.groupby('trade_date')['act_factor3'].rank(ascending=False, pct=True)\n",
|
||||
"\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_money_flow_factor(df):\n",
|
||||
" df['active_buy_volume_large'] = df['buy_lg_vol'] / df['net_mf_vol']\n",
|
||||
" df['active_buy_volume_big'] = df['buy_elg_vol'] / df['net_mf_vol']\n",
|
||||
" df['active_buy_volume_small'] = df['buy_sm_vol'] / df['net_mf_vol']\n",
|
||||
"\n",
|
||||
" df['buy_lg_vol - sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / df['net_mf_vol']\n",
|
||||
" df['buy_elg_vol - sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / df['net_mf_vol']\n",
|
||||
"\n",
|
||||
" # # 你还提到了一些其他字段:\n",
|
||||
" # df['net_active_buy_volume_main'] = df['net_mf_vol'] / df['buy_sm_vol']\n",
|
||||
" # df['netflow_amount_main'] = df['net_mf_vol'] / df['buy_sm_vol'] # 这里假设 'net_mf_vol' 是主流资金流\n",
|
||||
"\n",
|
||||
" # df['active_sell_volume_large'] = df['sell_lg_vol'] / df['sell_sm_vol']\n",
|
||||
" # df['active_sell_volume_big'] = df['sell_elg_vol'] / df['sell_sm_vol']\n",
|
||||
" # df['active_sell_volume_small'] = df['sell_sm_vol'] / df['sell_sm_vol']\n",
|
||||
"\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_alpha_factor(df):\n",
|
||||
" df['alpha_022'] = df['close'] - df['close'].shift(5)\n",
|
||||
"\n",
|
||||
" # alpha_003: (close - open) / (high - low)\n",
|
||||
" df['alpha_003'] = (df['close'] - df['open']) / (df['high'] - df['low'])\n",
|
||||
"\n",
|
||||
" # alpha_007: rank(correlation(close, volume, 5))\n",
|
||||
" df['alpha_007'] = df['close'].rolling(5).corr(df['vol']).rank(axis=1)\n",
|
||||
"\n",
|
||||
" # alpha_013: rank(sum(close, 5) - sum(close, 20))\n",
|
||||
" df['alpha_013'] = (df['close'].rolling(5).sum() - df['close'].rolling(20).sum()).rank(axis=1)\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_future_data(df):\n",
|
||||
" df['future_return1'] = (df['close'].shift(-1) - df['close']) / df['close']\n",
|
||||
" df['future_return2'] = (df['open'].shift(-2) - df['open'].shift(-1)) / df['open'].shift(-1)\n",
|
||||
" df['future_return3'] = (df['close'].shift(-2) - df['close'].shift(-1)) / df['close'].shift(-1)\n",
|
||||
" df['future_return4'] = (df['close'].shift(-2) - df['open'].shift(-1)) / df['open'].shift(-1)\n",
|
||||
" df['future_return5'] = (df['close'].shift(-5) - df['open'].shift(-1)) / df['open'].shift(-1)\n",
|
||||
" df['future_return6'] = (df['close'].shift(-10) - df['open'].shift(-1)) / df['open'].shift(-1)\n",
|
||||
" df['future_return7'] = (df['close'].shift(-20) - df['open'].shift(-1)) / df['open'].shift(-1)\n",
|
||||
" df['future_close1'] = (df['close'].shift(-1) - df['close']) / df['close']\n",
|
||||
" df['future_close2'] = (df['close'].shift(-2) - df['close']) / df['close']\n",
|
||||
" df['future_close3'] = (df['close'].shift(-3) - df['close']) / df['close']\n",
|
||||
" df['future_close4'] = (df['close'].shift(-4) - df['close']) / df['close']\n",
|
||||
" df['future_close5'] = (df['close'].shift(-5) - df['close']) / df['close']\n",
|
||||
" df['future_af11'] = df['act_factor1'].shift(-1)\n",
|
||||
" df['future_af12'] = df['act_factor1'].shift(-2)\n",
|
||||
" df['future_af13'] = df['act_factor1'].shift(-3)\n",
|
||||
" df['future_af14'] = df['act_factor1'].shift(-4)\n",
|
||||
" df['future_af15'] = df['act_factor1'].shift(-5)\n",
|
||||
" df['future_af21'] = df['act_factor2'].shift(-1)\n",
|
||||
" df['future_af22'] = df['act_factor2'].shift(-2)\n",
|
||||
" df['future_af23'] = df['act_factor2'].shift(-3)\n",
|
||||
" df['future_af24'] = df['act_factor2'].shift(-4)\n",
|
||||
" df['future_af25'] = df['act_factor2'].shift(-5)\n",
|
||||
" df['future_af31'] = df['act_factor3'].shift(-1)\n",
|
||||
" df['future_af32'] = df['act_factor3'].shift(-2)\n",
|
||||
" df['future_af33'] = df['act_factor3'].shift(-3)\n",
|
||||
" df['future_af34'] = df['act_factor3'].shift(-4)\n",
|
||||
" df['future_af35'] = df['act_factor3'].shift(-5)\n",
|
||||
"\n",
|
||||
" return df\n"
|
||||
],
|
||||
"id": "a735bc02ceb4d872",
|
||||
"outputs": [],
|
||||
"execution_count": 4
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:53:49.153376Z",
|
||||
"start_time": "2025-02-09T14:53:38.164112Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"df = get_technical_factor(df)\n",
|
||||
"df = get_act_factor(df)\n",
|
||||
"df = get_money_flow_factor(df)\n",
|
||||
"df = get_future_data(df)\n",
|
||||
"# df = df.drop(columns=origin_columns)\n",
|
||||
"\n",
|
||||
"print(df.info())"
|
||||
],
|
||||
"id": "53f86ddc0677a6d7",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 8364308 entries, 0 to 8364307\n",
|
||||
"Data columns (total 83 columns):\n",
|
||||
" # Column Dtype \n",
|
||||
"--- ------ ----- \n",
|
||||
" 0 ts_code object \n",
|
||||
" 1 trade_date datetime64[ns]\n",
|
||||
" 2 open float64 \n",
|
||||
" 3 close float64 \n",
|
||||
" 4 high float64 \n",
|
||||
" 5 low float64 \n",
|
||||
" 6 vol float64 \n",
|
||||
" 7 is_st object \n",
|
||||
" 8 up_limit float64 \n",
|
||||
" 9 down_limit float64 \n",
|
||||
" 10 buy_sm_vol float64 \n",
|
||||
" 11 sell_sm_vol float64 \n",
|
||||
" 12 buy_lg_vol float64 \n",
|
||||
" 13 sell_lg_vol float64 \n",
|
||||
" 14 buy_elg_vol float64 \n",
|
||||
" 15 sell_elg_vol float64 \n",
|
||||
" 16 net_mf_vol float64 \n",
|
||||
" 17 up float64 \n",
|
||||
" 18 down float64 \n",
|
||||
" 19 atr_14 float64 \n",
|
||||
" 20 atr_6 float64 \n",
|
||||
" 21 obv float64 \n",
|
||||
" 22 maobv_6 float64 \n",
|
||||
" 23 obv-maobv_6 float64 \n",
|
||||
" 24 rsi_3 float64 \n",
|
||||
" 25 rsi_6 float64 \n",
|
||||
" 26 rsi_9 float64 \n",
|
||||
" 27 return_10 float64 \n",
|
||||
" 28 return_20 float64 \n",
|
||||
" 29 avg_close_5 float64 \n",
|
||||
" 30 std_return_5 float64 \n",
|
||||
" 31 std_return_15 float64 \n",
|
||||
" 32 std_return_25 float64 \n",
|
||||
" 33 std_return_90 float64 \n",
|
||||
" 34 std_return_90_2 float64 \n",
|
||||
" 35 std_return_5 / std_return_90 float64 \n",
|
||||
" 36 std_return_5 / std_return_25 float64 \n",
|
||||
" 37 std_return_90 - std_return_90_2 float64 \n",
|
||||
" 38 ema_5 float64 \n",
|
||||
" 39 ema_13 float64 \n",
|
||||
" 40 ema_20 float64 \n",
|
||||
" 41 ema_60 float64 \n",
|
||||
" 42 act_factor1 float64 \n",
|
||||
" 43 act_factor2 float64 \n",
|
||||
" 44 act_factor3 float64 \n",
|
||||
" 45 act_factor4 float64 \n",
|
||||
" 46 act_factor5 float64 \n",
|
||||
" 47 act_factor6 float64 \n",
|
||||
" 48 rank_act_factor1 float64 \n",
|
||||
" 49 rank_act_factor2 float64 \n",
|
||||
" 50 rank_act_factor3 float64 \n",
|
||||
" 51 active_buy_volume_large float64 \n",
|
||||
" 52 active_buy_volume_big float64 \n",
|
||||
" 53 active_buy_volume_small float64 \n",
|
||||
" 54 buy_lg_vol - sell_lg_vol float64 \n",
|
||||
" 55 buy_elg_vol - sell_elg_vol float64 \n",
|
||||
" 56 future_return1 float64 \n",
|
||||
" 57 future_return2 float64 \n",
|
||||
" 58 future_return3 float64 \n",
|
||||
" 59 future_return4 float64 \n",
|
||||
" 60 future_return5 float64 \n",
|
||||
" 61 future_return6 float64 \n",
|
||||
" 62 future_return7 float64 \n",
|
||||
" 63 future_close1 float64 \n",
|
||||
" 64 future_close2 float64 \n",
|
||||
" 65 future_close3 float64 \n",
|
||||
" 66 future_close4 float64 \n",
|
||||
" 67 future_close5 float64 \n",
|
||||
" 68 future_af11 float64 \n",
|
||||
" 69 future_af12 float64 \n",
|
||||
" 70 future_af13 float64 \n",
|
||||
" 71 future_af14 float64 \n",
|
||||
" 72 future_af15 float64 \n",
|
||||
" 73 future_af21 float64 \n",
|
||||
" 74 future_af22 float64 \n",
|
||||
" 75 future_af23 float64 \n",
|
||||
" 76 future_af24 float64 \n",
|
||||
" 77 future_af25 float64 \n",
|
||||
" 78 future_af31 float64 \n",
|
||||
" 79 future_af32 float64 \n",
|
||||
" 80 future_af33 float64 \n",
|
||||
" 81 future_af34 float64 \n",
|
||||
" 82 future_af35 float64 \n",
|
||||
"dtypes: datetime64[ns](1), float64(80), object(2)\n",
|
||||
"memory usage: 5.2+ GB\n",
|
||||
"None\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 5
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:55:28.712343Z",
|
||||
"start_time": "2025-02-09T14:53:49.279168Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def filter_data(df):\n",
|
||||
" df = df.groupby('trade_date').apply(lambda x: x.nlargest(1000, 'act_factor3'))\n",
|
||||
" df = df[df['is_st'] == False]\n",
|
||||
" df = df[df['is_st'] == False]\n",
|
||||
" df = df[~df['ts_code'].str.startswith('30')]\n",
|
||||
" df = df[~df['ts_code'].str.startswith('68')]\n",
|
||||
" df = df[~df['ts_code'].str.startswith('8')]\n",
|
||||
" df = df.reset_index(drop=True)\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"df = filter_data(df)\n",
|
||||
"print(df.info())"
|
||||
],
|
||||
"id": "dbe2fd8021b9417f",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 1136157 entries, 0 to 1136156\n",
|
||||
"Data columns (total 83 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 ts_code 1136157 non-null object \n",
|
||||
" 1 trade_date 1136157 non-null datetime64[ns]\n",
|
||||
" 2 open 1136157 non-null float64 \n",
|
||||
" 3 close 1136157 non-null float64 \n",
|
||||
" 4 high 1136157 non-null float64 \n",
|
||||
" 5 low 1136157 non-null float64 \n",
|
||||
" 6 vol 1136157 non-null float64 \n",
|
||||
" 7 is_st 1136157 non-null object \n",
|
||||
" 8 up_limit 1135878 non-null float64 \n",
|
||||
" 9 down_limit 1135878 non-null float64 \n",
|
||||
" 10 buy_sm_vol 1135663 non-null float64 \n",
|
||||
" 11 sell_sm_vol 1135663 non-null float64 \n",
|
||||
" 12 buy_lg_vol 1135663 non-null float64 \n",
|
||||
" 13 sell_lg_vol 1135663 non-null float64 \n",
|
||||
" 14 buy_elg_vol 1135663 non-null float64 \n",
|
||||
" 15 sell_elg_vol 1135663 non-null float64 \n",
|
||||
" 16 net_mf_vol 1135663 non-null float64 \n",
|
||||
" 17 up 1136157 non-null float64 \n",
|
||||
" 18 down 1136157 non-null float64 \n",
|
||||
" 19 atr_14 1136157 non-null float64 \n",
|
||||
" 20 atr_6 1136157 non-null float64 \n",
|
||||
" 21 obv 1136157 non-null float64 \n",
|
||||
" 22 maobv_6 1136157 non-null float64 \n",
|
||||
" 23 obv-maobv_6 1136157 non-null float64 \n",
|
||||
" 24 rsi_3 1136157 non-null float64 \n",
|
||||
" 25 rsi_6 1136157 non-null float64 \n",
|
||||
" 26 rsi_9 1136157 non-null float64 \n",
|
||||
" 27 return_10 1136157 non-null float64 \n",
|
||||
" 28 return_20 1136157 non-null float64 \n",
|
||||
" 29 avg_close_5 1136157 non-null float64 \n",
|
||||
" 30 std_return_5 1136157 non-null float64 \n",
|
||||
" 31 std_return_15 1136157 non-null float64 \n",
|
||||
" 32 std_return_25 1136157 non-null float64 \n",
|
||||
" 33 std_return_90 1136131 non-null float64 \n",
|
||||
" 34 std_return_90_2 1136129 non-null float64 \n",
|
||||
" 35 std_return_5 / std_return_90 1136131 non-null float64 \n",
|
||||
" 36 std_return_5 / std_return_25 1136157 non-null float64 \n",
|
||||
" 37 std_return_90 - std_return_90_2 1136129 non-null float64 \n",
|
||||
" 38 ema_5 1136157 non-null float64 \n",
|
||||
" 39 ema_13 1136157 non-null float64 \n",
|
||||
" 40 ema_20 1136157 non-null float64 \n",
|
||||
" 41 ema_60 1136153 non-null float64 \n",
|
||||
" 42 act_factor1 1136157 non-null float64 \n",
|
||||
" 43 act_factor2 1136157 non-null float64 \n",
|
||||
" 44 act_factor3 1136157 non-null float64 \n",
|
||||
" 45 act_factor4 1136152 non-null float64 \n",
|
||||
" 46 act_factor5 1136152 non-null float64 \n",
|
||||
" 47 act_factor6 1136157 non-null float64 \n",
|
||||
" 48 rank_act_factor1 1136157 non-null float64 \n",
|
||||
" 49 rank_act_factor2 1136157 non-null float64 \n",
|
||||
" 50 rank_act_factor3 1136157 non-null float64 \n",
|
||||
" 51 active_buy_volume_large 1135659 non-null float64 \n",
|
||||
" 52 active_buy_volume_big 1135636 non-null float64 \n",
|
||||
" 53 active_buy_volume_small 1135663 non-null float64 \n",
|
||||
" 54 buy_lg_vol - sell_lg_vol 1135660 non-null float64 \n",
|
||||
" 55 buy_elg_vol - sell_elg_vol 1135640 non-null float64 \n",
|
||||
" 56 future_return1 1136157 non-null float64 \n",
|
||||
" 57 future_return2 1136157 non-null float64 \n",
|
||||
" 58 future_return3 1136157 non-null float64 \n",
|
||||
" 59 future_return4 1136157 non-null float64 \n",
|
||||
" 60 future_return5 1136157 non-null float64 \n",
|
||||
" 61 future_return6 1136157 non-null float64 \n",
|
||||
" 62 future_return7 1136157 non-null float64 \n",
|
||||
" 63 future_close1 1136157 non-null float64 \n",
|
||||
" 64 future_close2 1136157 non-null float64 \n",
|
||||
" 65 future_close3 1136157 non-null float64 \n",
|
||||
" 66 future_close4 1136157 non-null float64 \n",
|
||||
" 67 future_close5 1136157 non-null float64 \n",
|
||||
" 68 future_af11 1136157 non-null float64 \n",
|
||||
" 69 future_af12 1136157 non-null float64 \n",
|
||||
" 70 future_af13 1136157 non-null float64 \n",
|
||||
" 71 future_af14 1136157 non-null float64 \n",
|
||||
" 72 future_af15 1136157 non-null float64 \n",
|
||||
" 73 future_af21 1136157 non-null float64 \n",
|
||||
" 74 future_af22 1136157 non-null float64 \n",
|
||||
" 75 future_af23 1136157 non-null float64 \n",
|
||||
" 76 future_af24 1136157 non-null float64 \n",
|
||||
" 77 future_af25 1136157 non-null float64 \n",
|
||||
" 78 future_af31 1136157 non-null float64 \n",
|
||||
" 79 future_af32 1136157 non-null float64 \n",
|
||||
" 80 future_af33 1136157 non-null float64 \n",
|
||||
" 81 future_af34 1136157 non-null float64 \n",
|
||||
" 82 future_af35 1136157 non-null float64 \n",
|
||||
"dtypes: datetime64[ns](1), float64(80), object(2)\n",
|
||||
"memory usage: 719.5+ MB\n",
|
||||
"None\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 6
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T15:00:45.828404Z",
|
||||
"start_time": "2025-02-09T15:00:45.294830Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def remove_outliers_iqr(series, lower_quantile=0.05, upper_quantile=0.95, threshold=1.5):\n",
|
||||
" Q1 = series.quantile(lower_quantile)\n",
|
||||
" Q3 = series.quantile(upper_quantile)\n",
|
||||
" IQR = Q3 - Q1\n",
|
||||
" lower_bound = Q1 - threshold * IQR\n",
|
||||
" upper_bound = Q3 + threshold * IQR\n",
|
||||
" # 过滤掉低于下边界或高于上边界的极值\n",
|
||||
" return (series >= lower_bound) & (series <= upper_bound)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def neutralize_labels(labels, features, feature_columns, z_threshold=3, method='regression'):\n",
|
||||
" labels_no_outliers = remove_outliers_iqr(labels)\n",
|
||||
" return labels_no_outliers\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"train_data = df[df['trade_date'] <= '2023-01-01']\n",
|
||||
"test_data = df[df['trade_date'] >= '2023-01-01']\n",
|
||||
"\n",
|
||||
"feature_columns = [col for col in df.columns if col not in ['trade_date',\n",
|
||||
" 'ts_code',\n",
|
||||
" 'label']]\n",
|
||||
"feature_columns = [col for col in feature_columns if 'future' not in col]\n",
|
||||
"feature_columns = [col for col in feature_columns if 'score' not in col]\n",
|
||||
"feature_columns = [col for col in feature_columns if col not in origin_columns]\n",
|
||||
"\n",
|
||||
"# for column in [column for column in train_data.columns if 'future' in column]:\n",
|
||||
"# label_index = neutralize_labels(train_data[column], train_data, feature_columns, z_threshold=3, method='regression')\n",
|
||||
"# train_data = train_data[label_index]\n",
|
||||
"# label_index = neutralize_labels(test_data[column], test_data, feature_columns, z_threshold=3, method='regression')\n",
|
||||
"# test_data = test_data[label_index]\n",
|
||||
"\n",
|
||||
"print(len(train_data))\n",
|
||||
"print(len(test_data))"
|
||||
],
|
||||
"id": "5f3d9aece75318cd",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['up', 'down', 'atr_14', 'atr_6', 'obv', 'maobv_6', 'obv-maobv_6', 'rsi_3', 'rsi_6', 'rsi_9', 'return_10', 'return_20', 'avg_close_5', 'std_return_5', 'std_return_15', 'std_return_25', 'std_return_90', 'std_return_90_2', 'std_return_5 / std_return_90', 'std_return_5 / std_return_25', 'std_return_90 - std_return_90_2', 'ema_5', 'ema_13', 'ema_20', 'ema_60', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4', 'act_factor5', 'act_factor6', 'rank_act_factor1', 'rank_act_factor2', 'rank_act_factor3', 'active_buy_volume_large', 'active_buy_volume_big', 'active_buy_volume_small', 'buy_lg_vol - sell_lg_vol', 'buy_elg_vol - sell_elg_vol']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 19
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:56:05.319915Z",
|
||||
"start_time": "2025-02-09T14:56:03.355725Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def get_qcuts(series, quantiles):\n",
|
||||
" q = pd.qcut(series, q=quantiles, labels=False, duplicates='drop')\n",
|
||||
" return q[-1] # 返回窗口最后一个元素的分位数标签\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"window = 5\n",
|
||||
"quantiles = 20\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_label(df):\n",
|
||||
" labels = df['future_af13'] - df['act_factor1']\n",
|
||||
" # labels = df['future_close3']\n",
|
||||
" return labels\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"train_data['label'], test_data['label'] = get_label(train_data), get_label(test_data)\n",
|
||||
"\n",
|
||||
"train_data, test_data = train_data.dropna(subset=['label']), test_data.dropna(subset=['label'])\n",
|
||||
"train_data, test_data = train_data.replace([np.inf, -np.inf], np.nan).dropna(), test_data.replace([np.inf, -np.inf],\n",
|
||||
" np.nan).dropna()\n",
|
||||
"train_data, test_data = train_data.reset_index(drop=True), test_data.reset_index(drop=True)\n",
|
||||
"\n",
|
||||
"print(len(train_data))\n",
|
||||
"print(len(test_data))"
|
||||
],
|
||||
"id": "f4f16d63ad18d1bc",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"875004\n",
|
||||
"最小日期: 2017-01-03\n",
|
||||
"最大日期: 2022-12-30\n",
|
||||
"260581\n",
|
||||
"最小日期: 2023-01-03\n",
|
||||
"最大日期: 2025-01-27\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 13
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:56:05.480695Z",
|
||||
"start_time": "2025-02-09T14:56:05.367238Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import lightgbm as lgb\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import optuna\n",
|
||||
"from sklearn.model_selection import KFold\n",
|
||||
"from sklearn.metrics import mean_absolute_error\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"import pickle\n",
|
||||
"import hashlib\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def objective(trial, X, y, num_boost_round, params):\n",
|
||||
" # 参数网格\n",
|
||||
" X, y = X.reset_index(drop=True), y.reset_index(drop=True)\n",
|
||||
" param_grid = {\n",
|
||||
" \"n_estimators\": trial.suggest_categorical(\"n_estimators\", [10000]),\n",
|
||||
" \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.01, 0.3),\n",
|
||||
" \"num_leaves\": trial.suggest_int(\"num_leaves\", 20, 3000, step=25),\n",
|
||||
" \"max_depth\": trial.suggest_int(\"max_depth\", 3, 16),\n",
|
||||
" \"min_data_in_leaf\": trial.suggest_int(\"min_data_in_leaf\", 200, 10000, step=100),\n",
|
||||
" \"lambda_l1\": trial.suggest_int(\"lambda_l1\", 0, 100, step=5),\n",
|
||||
" \"lambda_l2\": trial.suggest_int(\"lambda_l2\", 0, 100, step=5),\n",
|
||||
" \"min_gain_to_split\": trial.suggest_float(\"min_gain_to_split\", 0, 15),\n",
|
||||
" \"bagging_fraction\": trial.suggest_float(\"bagging_fraction\", 0.2, 0.95, step=0.1),\n",
|
||||
" \"bagging_freq\": trial.suggest_categorical(\"bagging_freq\", [1]),\n",
|
||||
" \"feature_fraction\": trial.suggest_float(\"feature_fraction\", 0.2, 0.95, step=0.1),\n",
|
||||
" \"random_state\": 1,\n",
|
||||
" \"objective\": 'regression',\n",
|
||||
" 'verbosity': -1\n",
|
||||
" }\n",
|
||||
" # 5折交叉验证\n",
|
||||
" cv = KFold(n_splits=5, shuffle=False)\n",
|
||||
"\n",
|
||||
" cv_scores = np.empty(5)\n",
|
||||
" for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):\n",
|
||||
" X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]\n",
|
||||
" y_train, y_test = y[train_idx], y[test_idx]\n",
|
||||
"\n",
|
||||
" # LGBM建模\n",
|
||||
" model = lgb.LGBMRegressor(**param_grid, num_boost_round=num_boost_round)\n",
|
||||
" model.fit(\n",
|
||||
" X_train,\n",
|
||||
" y_train,\n",
|
||||
" eval_set=[(X_test, y_test)],\n",
|
||||
" eval_metric=\"l2\",\n",
|
||||
" callbacks=[\n",
|
||||
" # LightGBMPruningCallback(trial, \"l2\"),\n",
|
||||
" lgb.early_stopping(50, first_metric_only=True),\n",
|
||||
" lgb.log_evaluation(period=-1)\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" # 模型预测\n",
|
||||
" preds = model.predict(X_test)\n",
|
||||
" # 优化指标logloss最小\n",
|
||||
" cv_scores[idx] = mean_absolute_error(y_test, preds)\n",
|
||||
"\n",
|
||||
" return np.mean(cv_scores)\n",
|
||||
"\n",
|
||||
"def generate_key(params, feature_columns, num_boost_round):\n",
|
||||
" key_data = {\n",
|
||||
" \"params\": params,\n",
|
||||
" \"feature_columns\": feature_columns,\n",
|
||||
" \"num_boost_round\": num_boost_round\n",
|
||||
" }\n",
|
||||
" # 转换成排序后的 JSON 字符串,再生成 md5 hash\n",
|
||||
" key_str = json.dumps(key_data, sort_keys=True)\n",
|
||||
" return hashlib.md5(key_str.encode('utf-8')).hexdigest()\n",
|
||||
"\n",
|
||||
"def train_light_model(df, params, feature_columns, callbacks, evals,\n",
|
||||
" print_feature_importance=True, num_boost_round=100,\n",
|
||||
" use_optuna=False):\n",
|
||||
" cache_file = 'light_model.pkl'\n",
|
||||
" cache_key = generate_key(params, feature_columns, num_boost_round)\n",
|
||||
"\n",
|
||||
" # 检查缓存文件是否存在\n",
|
||||
" if os.path.exists(cache_file):\n",
|
||||
" try:\n",
|
||||
" with open(cache_file, 'rb') as f:\n",
|
||||
" cache_data = pickle.load(f)\n",
|
||||
" if cache_data.get('key') == cache_key:\n",
|
||||
" print(\"加载缓存模型...\")\n",
|
||||
" return cache_data.get('model')\n",
|
||||
" else:\n",
|
||||
" print(\"缓存模型的参数与当前参数不匹配,重新训练模型。\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"加载缓存失败: {e},重新训练模型。\")\n",
|
||||
" else:\n",
|
||||
" print(\"未发现缓存模型,开始训练新模型。\")\n",
|
||||
" # 确保数据按照 date 和 label 排序\n",
|
||||
" df_sorted = df.sort_values(by=['trade_date', 'label'], ascending=[True, False]) # 按日期升序、标签降序排序\n",
|
||||
" df_sorted = df_sorted.sort_values(by='trade_date')\n",
|
||||
" unique_dates = df_sorted['trade_date'].unique()\n",
|
||||
" val_date_count = int(len(unique_dates) * 0.1)\n",
|
||||
" val_dates = unique_dates[-val_date_count:]\n",
|
||||
" val_indices = df_sorted[df_sorted['trade_date'].isin(val_dates)].index\n",
|
||||
" train_indices = df_sorted[~df_sorted['trade_date'].isin(val_dates)].index\n",
|
||||
"\n",
|
||||
" # 获取训练集和验证集的样本\n",
|
||||
" train_df = df_sorted.iloc[train_indices]\n",
|
||||
" val_df = df_sorted.iloc[val_indices]\n",
|
||||
"\n",
|
||||
" X_train = train_df[feature_columns]\n",
|
||||
" y_train = train_df['label']\n",
|
||||
"\n",
|
||||
" X_val = val_df[feature_columns]\n",
|
||||
" y_val = val_df['label']\n",
|
||||
"\n",
|
||||
" train_data = lgb.Dataset(X_train, label=y_train)\n",
|
||||
" val_data = lgb.Dataset(X_val, label=y_val)\n",
|
||||
" if use_optuna:\n",
|
||||
" # study = optuna.create_study(direction='minimize' if classify else 'maximize')\n",
|
||||
" study = optuna.create_study(direction='minimize')\n",
|
||||
" study.optimize(lambda trial: objective(trial, X_train, y_train, num_boost_round, params), n_trials=20)\n",
|
||||
"\n",
|
||||
" print(f\"Best parameters: {study.best_trial.params}\")\n",
|
||||
" print(f\"Best score: {study.best_trial.value}\")\n",
|
||||
"\n",
|
||||
" params.update(study.best_trial.params)\n",
|
||||
" model = lgb.train(\n",
|
||||
" params, train_data, num_boost_round=num_boost_round,\n",
|
||||
" valid_sets=[train_data, val_data], valid_names=['train', 'valid'],\n",
|
||||
" callbacks=callbacks\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 打印特征重要性(如果需要)\n",
|
||||
" if print_feature_importance:\n",
|
||||
" lgb.plot_metric(evals)\n",
|
||||
" lgb.plot_tree(model, figsize=(20, 8))\n",
|
||||
" lgb.plot_importance(model, importance_type='split', max_num_features=20)\n",
|
||||
" plt.show()\n",
|
||||
" # with open(cache_file, 'wb') as f:\n",
|
||||
" # pickle.dump({'key': cache_key,\n",
|
||||
" # 'model': model,\n",
|
||||
" # 'feature_columns': feature_columns}, f)\n",
|
||||
" # print(\"模型训练完成并已保存缓存。\")\n",
|
||||
" return model\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from catboost import CatBoostRegressor\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_catboost(df, num_boost_round, params=None):\n",
|
||||
" \"\"\"\n",
|
||||
" 训练 CatBoost 排序模型\n",
|
||||
" - df: 包含因子、date、instrument 和 label 的 DataFrame\n",
|
||||
" - num_boost_round: 训练的轮数\n",
|
||||
" - print_feature_importance: 是否打印特征重要性\n",
|
||||
" - plot: 是否绘制特征重要性图\n",
|
||||
" - split_date: 用于划分训练集和验证集的日期(比如 '2020-01-01')\n",
|
||||
"\n",
|
||||
" 返回训练好的模型\n",
|
||||
" \"\"\"\n",
|
||||
" df_sorted = df.sort_values(by=['date', 'label'], ascending=[True, False])\n",
|
||||
"\n",
|
||||
" # 提取特征和标签\n",
|
||||
" feature_columns = [col for col in df.columns if col not in ['date',\n",
|
||||
" 'instrument',\n",
|
||||
" 'label']]\n",
|
||||
" feature_columns = [col for col in feature_columns if 'future' not in col]\n",
|
||||
" feature_columns = [col for col in feature_columns if 'score' not in col]\n",
|
||||
"\n",
|
||||
" df_sorted = df_sorted.sort_values(by='date')\n",
|
||||
" unique_dates = df_sorted['date'].unique()\n",
|
||||
" val_date_count = int(len(unique_dates) * 0.1)\n",
|
||||
" val_dates = unique_dates[-val_date_count:]\n",
|
||||
" val_indices = df_sorted[df_sorted['date'].isin(val_dates)].index\n",
|
||||
" train_indices = df_sorted[~df_sorted['date'].isin(val_dates)].index\n",
|
||||
"\n",
|
||||
" # 获取训练集和验证集的样本\n",
|
||||
" train_df = df_sorted.iloc[train_indices].sort_values(by=['date', 'label'], ascending=[True, False])\n",
|
||||
" val_df = df_sorted.iloc[val_indices].sort_values(by=['date', 'label'], ascending=[True, False])\n",
|
||||
"\n",
|
||||
" X_train = train_df[feature_columns]\n",
|
||||
" y_train = train_df['label']\n",
|
||||
"\n",
|
||||
" X_val = val_df[feature_columns]\n",
|
||||
" y_val = val_df['label']\n",
|
||||
"\n",
|
||||
" model = CatBoostRegressor(**params, iterations=num_boost_round)\n",
|
||||
" model.fit(X_train,\n",
|
||||
" y_train,\n",
|
||||
" eval_set=(X_val, y_val))\n",
|
||||
"\n",
|
||||
" return model"
|
||||
],
|
||||
"id": "8f134d435f71e9e2",
|
||||
"outputs": [],
|
||||
"execution_count": 14
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:56:05.576927Z",
|
||||
"start_time": "2025-02-09T14:56:05.480695Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"light_params = {\n",
|
||||
" 'objective': 'regression',\n",
|
||||
" 'metric': 'l2',\n",
|
||||
" 'learning_rate': 0.05,\n",
|
||||
" 'is_unbalance': True,\n",
|
||||
" 'num_leaves': 2048,\n",
|
||||
" 'min_data_in_leaf': 16,\n",
|
||||
" 'max_depth': 32,\n",
|
||||
" 'max_bin': 1024,\n",
|
||||
" 'nthread': 2,\n",
|
||||
" 'feature_fraction': 0.7,\n",
|
||||
" 'bagging_fraction': 0.7,\n",
|
||||
" 'bagging_freq': 5,\n",
|
||||
" 'lambda_l1': 80,\n",
|
||||
" 'lambda_l2': 65,\n",
|
||||
" 'verbosity': -1\n",
|
||||
"}"
|
||||
],
|
||||
"id": "4a4542e1ed6afe7d",
|
||||
"outputs": [],
|
||||
"execution_count": 15
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:57:25.341222Z",
|
||||
"start_time": "2025-02-09T14:56:05.640256Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"print('train data size: ', len(train_data))\n",
|
||||
"df = train_data\n",
|
||||
"\n",
|
||||
"evals = {}\n",
|
||||
"light_model = train_light_model(train_data, light_params, feature_columns,\n",
|
||||
" [lgb.log_evaluation(period=500),\n",
|
||||
" lgb.callback.record_evaluation(evals),\n",
|
||||
" lgb.early_stopping(50, first_metric_only=True)\n",
|
||||
" ], evals,\n",
|
||||
" num_boost_round=1000, use_optuna=False,\n",
|
||||
" print_feature_importance=False)"
|
||||
],
|
||||
"id": "beeb098799ecfa6a",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"train data size: 875004\n",
|
||||
"未发现缓存模型,开始训练新模型。\n",
|
||||
"Training until validation scores don't improve for 50 rounds\n",
|
||||
"Early stopping, best iteration is:\n",
|
||||
"[378]\ttrain's l2: 0.435049\tvalid's l2: 0.589178\n",
|
||||
"Evaluated only: l2\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 16
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:57:27.394697Z",
|
||||
"start_time": "2025-02-09T14:57:25.373274Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"test_data['score'] = light_model.predict(test_data[feature_columns])\n",
|
||||
"predictions = test_data.loc[test_data.groupby('trade_date')['score'].idxmax()]"
|
||||
],
|
||||
"id": "5bb96ca8492e74d",
|
||||
"outputs": [],
|
||||
"execution_count": 17
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-09T14:57:27.489570Z",
|
||||
"start_time": "2025-02-09T14:57:27.397368Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "predictions[['trade_date', 'score', 'ts_code']].to_csv('predictions.csv', index=False)",
|
||||
"id": "5d1522a7538db91b",
|
||||
"outputs": [],
|
||||
"execution_count": 18
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
BIN
code/train/best_model.pth
Normal file
BIN
code/train/best_model.pth
Normal file
Binary file not shown.
5004
code/train/catboost_info/catboost_training.json
Normal file
5004
code/train/catboost_info/catboost_training.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
code/train/catboost_info/learn/events.out.tfevents
Normal file
BIN
code/train/catboost_info/learn/events.out.tfevents
Normal file
Binary file not shown.
5001
code/train/catboost_info/learn_error.tsv
Normal file
5001
code/train/catboost_info/learn_error.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
code/train/catboost_info/test/events.out.tfevents
Normal file
BIN
code/train/catboost_info/test/events.out.tfevents
Normal file
Binary file not shown.
BIN
code/train/catboost_info/test1/events.out.tfevents
Normal file
BIN
code/train/catboost_info/test1/events.out.tfevents
Normal file
Binary file not shown.
5001
code/train/catboost_info/test_error.tsv
Normal file
5001
code/train/catboost_info/test_error.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5001
code/train/catboost_info/time_left.tsv
Normal file
5001
code/train/catboost_info/time_left.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
384
code/train/code.ipynb
Normal file
384
code/train/code.ipynb
Normal file
@@ -0,0 +1,384 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_technical_factor(df):\n",
|
||||
" # 按股票和日期排序\n",
|
||||
" df = df.sort_values(by=['ts_code', 'trade_date'])\n",
|
||||
" grouped = df.groupby('ts_code', group_keys=False)\n",
|
||||
"\n",
|
||||
" df['return_skew'] = grouped['pct_chg'].rolling(window=5).skew().reset_index(0, drop=True)\n",
|
||||
" df['return_kurtosis'] = grouped['pct_chg'].rolling(window=5).kurt().reset_index(0, drop=True)\n",
|
||||
"\n",
|
||||
" # 因子 1:短期成交量变化率\n",
|
||||
" df['volume_change_rate'] = (\n",
|
||||
" grouped['vol'].rolling(window=2).mean() /\n",
|
||||
" grouped['vol'].rolling(window=5).mean() - 1\n",
|
||||
" ).reset_index(level=0, drop=True) # 确保索引对齐\n",
|
||||
"\n",
|
||||
" # 因子 2:成交量突破信号\n",
|
||||
" max_volume = grouped['vol'].rolling(window=5).max().reset_index(level=0, drop=True) # 确保索引对齐\n",
|
||||
" df['cat_volume_breakout'] = (df['vol'] > max_volume)\n",
|
||||
"\n",
|
||||
" # 因子 3:换手率均线偏离度\n",
|
||||
" mean_turnover = grouped['turnover_rate'].rolling(window=3).mean().reset_index(level=0, drop=True)\n",
|
||||
" std_turnover = grouped['turnover_rate'].rolling(window=3).std().reset_index(level=0, drop=True)\n",
|
||||
" df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover\n",
|
||||
"\n",
|
||||
" # 因子 4:换手率激增信号\n",
|
||||
" df['cat_turnover_spike'] = (df['turnover_rate'] > mean_turnover + 2 * std_turnover)\n",
|
||||
"\n",
|
||||
" # 因子 5:量比均值\n",
|
||||
" df['avg_volume_ratio'] = grouped['volume_ratio'].rolling(window=3).mean().reset_index(level=0, drop=True)\n",
|
||||
"\n",
|
||||
" # 因子 6:量比突破信号\n",
|
||||
" max_volume_ratio = grouped['volume_ratio'].rolling(window=5).max().reset_index(level=0, drop=True)\n",
|
||||
" df['cat_volume_ratio_breakout'] = (df['volume_ratio'] > max_volume_ratio)\n",
|
||||
"\n",
|
||||
" # 因子 7:成交量与换手率的综合动量因子\n",
|
||||
" alpha = 0.5\n",
|
||||
" df['momentum_factor'] = df['volume_change_rate'] + alpha * df['turnover_deviation']\n",
|
||||
"\n",
|
||||
" # 因子 8:量价共振因子\n",
|
||||
" df['price_change_rate'] = grouped['close'].pct_change()\n",
|
||||
" df['resonance_factor'] = df['volume_ratio'] * df['price_change_rate']\n",
|
||||
"\n",
|
||||
" # 计算 up 和 down\n",
|
||||
" df['log_close'] = np.log(df['close'])\n",
|
||||
"\n",
|
||||
" df['vol_spike'] = grouped.apply(\n",
|
||||
" lambda x: pd.Series(x['vol'].rolling(20).mean(), index=x.index)\n",
|
||||
" )\n",
|
||||
" df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']\n",
|
||||
" df['vol_std_5'] = df['vol'].pct_change().rolling(5).std()\n",
|
||||
"\n",
|
||||
" df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']\n",
|
||||
" df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']\n",
|
||||
"\n",
|
||||
" # 计算 ATR\n",
|
||||
" df['atr_14'] = grouped.apply(\n",
|
||||
" lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=14),\n",
|
||||
" index=x.index)\n",
|
||||
" )\n",
|
||||
" df['atr_6'] = grouped.apply(\n",
|
||||
" lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=6),\n",
|
||||
" index=x.index)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 计算 OBV 及其均线\n",
|
||||
" df['obv'] = grouped.apply(\n",
|
||||
" lambda x: pd.Series(talib.OBV(x['close'].values, x['vol'].values), index=x.index)\n",
|
||||
" )\n",
|
||||
" df['maobv_6'] = grouped.apply(\n",
|
||||
" lambda x: pd.Series(talib.SMA(x['obv'].values, timeperiod=6), index=x.index)\n",
|
||||
" )\n",
|
||||
" df['obv-maobv_6'] = df['obv'] - df['maobv_6']\n",
|
||||
"\n",
|
||||
" # 计算 RSI\n",
|
||||
" df['rsi_3'] = grouped.apply(\n",
|
||||
" lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=3), index=x.index)\n",
|
||||
" )\n",
|
||||
" df['rsi_6'] = grouped.apply(\n",
|
||||
" lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=6), index=x.index)\n",
|
||||
" )\n",
|
||||
" df['rsi_9'] = grouped.apply(\n",
|
||||
" lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=9), index=x.index)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 计算 return_10 和 return_20\n",
|
||||
" df['return_5'] = grouped['close'].apply(lambda x: x / x.shift(5) - 1)\n",
|
||||
" df['return_10'] = grouped['close'].apply(lambda x: x / x.shift(10) - 1)\n",
|
||||
" df['return_20'] = grouped['close'].apply(lambda x: x / x.shift(20) - 1)\n",
|
||||
"\n",
|
||||
" # df['avg_close_5'] = grouped['close'].apply(lambda x: x.rolling(window=5).mean() / x)\n",
|
||||
"\n",
|
||||
" # 计算标准差指标\n",
|
||||
" df['std_return_5'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=5).std())\n",
|
||||
" df['std_return_15'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=15).std())\n",
|
||||
" df['std_return_25'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=25).std())\n",
|
||||
" df['std_return_90'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=90).std())\n",
|
||||
" df['std_return_90_2'] = grouped['close'].apply(lambda x: x.shift(10).pct_change().rolling(window=90).std())\n",
|
||||
"\n",
|
||||
" # 计算比值指标\n",
|
||||
" df['std_return_5 / std_return_90'] = df['std_return_5'] / df['std_return_90']\n",
|
||||
" df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']\n",
|
||||
"\n",
|
||||
" # 计算标准差差值\n",
|
||||
" df['std_return_90 - std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']\n",
|
||||
"\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_act_factor(df, cat=True):\n",
|
||||
" # 按股票和日期排序\n",
|
||||
" df = df.sort_values(by=['ts_code', 'trade_date'])\n",
|
||||
" grouped = df.groupby('ts_code', group_keys=False)\n",
|
||||
" # 计算 EMA 指标\n",
|
||||
" df['_ema_5'] = grouped['close'].apply(\n",
|
||||
" lambda x: pd.Series(talib.EMA(x.values, timeperiod=5), index=x.index)\n",
|
||||
" )\n",
|
||||
" df['_ema_13'] = grouped['close'].apply(\n",
|
||||
" lambda x: pd.Series(talib.EMA(x.values, timeperiod=13), index=x.index)\n",
|
||||
" )\n",
|
||||
" df['_ema_20'] = grouped['close'].apply(\n",
|
||||
" lambda x: pd.Series(talib.EMA(x.values, timeperiod=20), index=x.index)\n",
|
||||
" )\n",
|
||||
" df['_ema_60'] = grouped['close'].apply(\n",
|
||||
" lambda x: pd.Series(talib.EMA(x.values, timeperiod=60), index=x.index)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 计算 act_factor1, act_factor2, act_factor3, act_factor4\n",
|
||||
" df['act_factor1'] = grouped['_ema_5'].apply(\n",
|
||||
" lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 50\n",
|
||||
" )\n",
|
||||
" df['act_factor2'] = grouped['_ema_13'].apply(\n",
|
||||
" lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 40\n",
|
||||
" )\n",
|
||||
" df['act_factor3'] = grouped['_ema_20'].apply(\n",
|
||||
" lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 21\n",
|
||||
" )\n",
|
||||
" df['act_factor4'] = grouped['_ema_60'].apply(\n",
|
||||
" lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 10\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if cat:\n",
|
||||
" df['cat_af1'] = df['act_factor1'] > 0\n",
|
||||
" df['cat_af2'] = df['act_factor2'] > df['act_factor1']\n",
|
||||
" df['cat_af3'] = df['act_factor3'] > df['act_factor2']\n",
|
||||
" df['cat_af4'] = df['act_factor4'] > df['act_factor3']\n",
|
||||
"\n",
|
||||
" # 计算 act_factor5 和 act_factor6\n",
|
||||
" df['act_factor5'] = df['act_factor1'] + df['act_factor2'] + df['act_factor3'] + df['act_factor4']\n",
|
||||
" df['act_factor6'] = (df['act_factor1'] - df['act_factor2']) / np.sqrt(\n",
|
||||
" df['act_factor1'] ** 2 + df['act_factor2'] ** 2)\n",
|
||||
"\n",
|
||||
" # 根据 trade_date 截面计算排名\n",
|
||||
" df['rank_act_factor1'] = df.groupby('trade_date', group_keys=False)['act_factor1'].rank(ascending=False, pct=True)\n",
|
||||
" df['rank_act_factor2'] = df.groupby('trade_date', group_keys=False)['act_factor2'].rank(ascending=False, pct=True)\n",
|
||||
" df['rank_act_factor3'] = df.groupby('trade_date', group_keys=False)['act_factor3'].rank(ascending=False, pct=True)\n",
|
||||
"\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_money_flow_factor(df):\n",
|
||||
" # 计算资金流相关因子(字段名称见 tushare 数据说明)\n",
|
||||
" df['active_buy_volume_large'] = df['buy_lg_vol'] / df['net_mf_vol']\n",
|
||||
" df['active_buy_volume_big'] = df['buy_elg_vol'] / df['net_mf_vol']\n",
|
||||
" df['active_buy_volume_small'] = df['buy_sm_vol'] / df['net_mf_vol']\n",
|
||||
"\n",
|
||||
" df['buy_lg_vol_minus_sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / df['net_mf_vol']\n",
|
||||
" df['buy_elg_vol_minus_sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / df['net_mf_vol']\n",
|
||||
"\n",
|
||||
" df['log(circ_mv)'] = np.log(df['circ_mv'])\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_alpha_factor(df):\n",
|
||||
" df = df.sort_values(by=['ts_code', 'trade_date'])\n",
|
||||
" grouped = df.groupby('ts_code')\n",
|
||||
"\n",
|
||||
" # alpha_022: 当前 close 与 5 日前 close 差值\n",
|
||||
" df['alpha_022'] = grouped['close'].transform(lambda x: x - x.shift(5))\n",
|
||||
"\n",
|
||||
" # alpha_003: (close - open) / (high - low)\n",
|
||||
" df['alpha_003'] = np.where(df['high'] != df['low'],\n",
|
||||
" (df['close'] - df['open']) / (df['high'] - df['low']),\n",
|
||||
" 0)\n",
|
||||
"\n",
|
||||
" # alpha_007: 计算过去5日 close 与 vol 的相关性,并按 trade_date 排名\n",
|
||||
" df['alpha_007'] = grouped.apply(lambda x: x['close'].rolling(5).corr(x['vol'])).reset_index(level=0, drop=True)\n",
|
||||
" df['alpha_007'] = df.groupby('trade_date', group_keys=False)['alpha_007'].rank(ascending=True, pct=True)\n",
|
||||
"\n",
|
||||
" # alpha_013: 计算过去5日 close 之和 - 20日 close 之和,并按 trade_date 排名\n",
|
||||
" df['alpha_013'] = grouped['close'].transform(lambda x: x.rolling(5).sum() - x.rolling(20).sum())\n",
|
||||
" df['alpha_013'] = df.groupby('trade_date', group_keys=False)['alpha_013'].rank(ascending=True, pct=True)\n",
|
||||
"\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_limit_factor(df):\n",
|
||||
" # 按股票和日期排序\n",
|
||||
" df = df.sort_values(by=['ts_code', 'trade_date'])\n",
|
||||
"\n",
|
||||
" # 分组处理\n",
|
||||
" grouped = df.groupby('ts_code', group_keys=False)\n",
|
||||
"\n",
|
||||
" # 1. 今日是否涨停/跌停\n",
|
||||
" df['cat_up_limit'] = (df['close'] == df['up_limit']).astype(int) # 是否涨停(1表示涨停,0表示未涨停)\n",
|
||||
" df['cat_down_limit'] = (df['close'] == df['down_limit']).astype(int) # 是否跌停(1表示跌停,0表示未跌停)\n",
|
||||
"\n",
|
||||
" # 2. 最近涨跌停次数(过去20个交易日)\n",
|
||||
" df['up_limit_count_10d'] = grouped['cat_up_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0,\n",
|
||||
" drop=True)\n",
|
||||
" df['down_limit_count_10d'] = grouped['cat_down_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0,\n",
|
||||
" drop=True)\n",
|
||||
"\n",
|
||||
" # 3. 最近连续涨跌停天数\n",
|
||||
" def calculate_consecutive_limits(series):\n",
|
||||
" \"\"\"\n",
|
||||
" 计算连续涨停/跌停天数。\n",
|
||||
" \"\"\"\n",
|
||||
" consecutive_up = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)\n",
|
||||
" consecutive_down = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)\n",
|
||||
" return consecutive_up, consecutive_down\n",
|
||||
"\n",
|
||||
" # 连续涨停天数\n",
|
||||
" df['consecutive_up_limit'] = grouped['cat_up_limit'].apply(\n",
|
||||
" lambda x: calculate_consecutive_limits(x)[0]\n",
|
||||
" ).reset_index(level=0, drop=True)\n",
|
||||
"\n",
|
||||
" # 连续跌停天数\n",
|
||||
" # df['consecutive_down_limit'] = grouped['cat_down_limit'].apply(\n",
|
||||
" # lambda x: calculate_consecutive_limits(x)[1]\n",
|
||||
" # ).reset_index(level=0, drop=True)\n",
|
||||
"\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_cyp_perf_factor(df):\n",
|
||||
" # 预处理:按股票代码和时间排序\n",
|
||||
" df = df.sort_values(by=['ts_code', 'trade_date'])\n",
|
||||
"\n",
|
||||
" # 按股票代码分组处理\n",
|
||||
" grouped = df.groupby('ts_code', group_keys=False)\n",
|
||||
"\n",
|
||||
" df['ctrl_strength'] = (df['cost_85pct'] - df['cost_15pct']) / (df['his_high'] - df['his_low'])\n",
|
||||
"\n",
|
||||
" df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / (df['cost_50pct'] - df['cost_5pct'])\n",
|
||||
"\n",
|
||||
" df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / (df['cost_50pct'] - df['cost_5pct'])\n",
|
||||
"\n",
|
||||
" df['lock_factor'] = df['turnover_rate'] * (\n",
|
||||
" 1 - (df['cost_95pct'] - df['cost_5pct']) / (df['his_high'] - df['his_low']))\n",
|
||||
"\n",
|
||||
" df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)\n",
|
||||
"\n",
|
||||
" df['weight_roc5'] = grouped['weight_avg'].apply(lambda x: x.pct_change(5))\n",
|
||||
"\n",
|
||||
" def rolling_corr(group):\n",
|
||||
" roc_close = group['close'].pct_change()\n",
|
||||
" roc_weight = group['weight_avg'].pct_change()\n",
|
||||
" return roc_close.rolling(10).corr(roc_weight)\n",
|
||||
"\n",
|
||||
" df['price_cost_divergence'] = grouped.apply(rolling_corr)\n",
|
||||
"\n",
|
||||
" def calc_atr(group):\n",
|
||||
" high, low, close = group['high'], group['low'], group['close']\n",
|
||||
" tr = np.maximum(high - low,\n",
|
||||
" np.maximum(abs(high - close.shift()),\n",
|
||||
" abs(low - close.shift())))\n",
|
||||
" return tr.rolling(14).mean()\n",
|
||||
"\n",
|
||||
" df['atr_14'] = grouped.apply(calc_atr)\n",
|
||||
" df['cost_atr_adj'] = (df['cost_95pct'] - df['cost_5pct']) / df['atr_14']\n",
|
||||
"\n",
|
||||
" # 12. 小盘股筹码集中度\n",
|
||||
" df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])\n",
|
||||
"\n",
|
||||
" # 16. 筹码稳定性指数 (20日波动率)\n",
|
||||
" df['weight_std20'] = grouped['weight_avg'].apply(lambda x: x.rolling(20).std())\n",
|
||||
" df['cost_stability'] = df['weight_std20'] / grouped['weight_avg'].transform(lambda x: x.rolling(20).mean())\n",
|
||||
"\n",
|
||||
" # 17. 成本区间突破标记\n",
|
||||
" df['high_cost_break_days'] = grouped.apply(lambda g: g['close'].gt(g['cost_95pct']).rolling(5).sum())\n",
|
||||
"\n",
|
||||
" # 18. 黄金筹码共振 (复合事件)\n",
|
||||
" df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &\n",
|
||||
" (df['volume_ratio'] > 1.5) &\n",
|
||||
" (df['winner_rate'] > 0.7))\n",
|
||||
"\n",
|
||||
" # 20. 筹码-流动性风险\n",
|
||||
" df['liquidity_risk'] = (df['cost_95pct'] - df['cost_5pct']) * (\n",
|
||||
" 1 / grouped['vol'].transform(lambda x: x.rolling(10).mean()))\n",
|
||||
"\n",
|
||||
" df.drop(columns=['weight_std20'], inplace=True, errors='ignore')\n",
|
||||
"\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_mv_factors(df):\n",
|
||||
" \"\"\"\n",
|
||||
" 计算多个因子并生成最终的综合因子。\n",
|
||||
"\n",
|
||||
" 参数:\n",
|
||||
" df (pd.DataFrame): 包含 ts_code, trade_date, turnover_rate, pe_ttm, pb, ps, circ_mv, volume_ratio, vol 等列的数据框。\n",
|
||||
"\n",
|
||||
" 返回:\n",
|
||||
" pd.DataFrame: 包含新增因子和最终综合因子的数据框。\n",
|
||||
" \"\"\"\n",
|
||||
" # 按 ts_code 和 trade_date 排序\n",
|
||||
" df = df.sort_values(by=['ts_code', 'trade_date'])\n",
|
||||
"\n",
|
||||
" # 按 ts_code 分组\n",
|
||||
" grouped = df.groupby('ts_code', group_keys=False)\n",
|
||||
"\n",
|
||||
" # 1. 市值流动比因子\n",
|
||||
" df['mv_turnover_ratio'] = df['turnover_rate'] / df['circ_mv']\n",
|
||||
"\n",
|
||||
" # 2. 市值调整成交量因子\n",
|
||||
" df['mv_adjusted_volume'] = df['vol'] / df['circ_mv']\n",
|
||||
"\n",
|
||||
" # 3. 市值加权换手率因子\n",
|
||||
" df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / df['circ_mv'])\n",
|
||||
"\n",
|
||||
" # 4. 非线性市值成交量因子\n",
|
||||
" df['nonlinear_mv_volume'] = df['vol'] / df['circ_mv']\n",
|
||||
"\n",
|
||||
" # 5. 市值量比因子\n",
|
||||
" df['mv_volume_ratio'] = df['volume_ratio'] / df['circ_mv']\n",
|
||||
"\n",
|
||||
" # 6. 市值动量因子\n",
|
||||
" df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / df['circ_mv']\n",
|
||||
"\n",
|
||||
" # 7. 市值波动率因子\n",
|
||||
" df['turnover_std'] = grouped['turnover_rate'].rolling(window=20).std().reset_index(level=0, drop=True)\n",
|
||||
" df['mv_volatility'] = grouped.apply(lambda x: x['turnover_std'] / x['circ_mv']).reset_index(level=0, drop=True)\n",
|
||||
"\n",
|
||||
" # 8. 市值成长性因子\n",
|
||||
" df['volume_growth'] = grouped['vol'].pct_change(periods=20).reset_index(level=0, drop=True)\n",
|
||||
" df['mv_growth'] = grouped.apply(lambda x: x['volume_growth'] / x['circ_mv']).reset_index(level=0, drop=True)\n",
|
||||
"\n",
|
||||
" # # 标准化因子\n",
|
||||
" # factor_columns = [\n",
|
||||
" # 'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover',\n",
|
||||
" # 'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum',\n",
|
||||
" # 'mv_volatility', 'mv_growth'\n",
|
||||
" # ]\n",
|
||||
" # scaler = StandardScaler()\n",
|
||||
" # df[factor_columns] = scaler.fit_transform(df[factor_columns])\n",
|
||||
" #\n",
|
||||
" # # 加权合成因子\n",
|
||||
" # weights = [0.2, 0.15, 0.15, 0.1, 0.1, 0.1, 0.1, 0.1] # 各因子权重\n",
|
||||
" # df['final_combined_factor'] = df[factor_columns].dot(weights)\n",
|
||||
"\n",
|
||||
" return df"
|
||||
],
|
||||
"id": "505e825945e4b8cf"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
1359
code/train/predictions.tsv
Normal file
1359
code/train/predictions.tsv
Normal file
File diff suppressed because it is too large
Load Diff
262
code/train/predictions_test.tsv
Normal file
262
code/train/predictions_test.tsv
Normal file
@@ -0,0 +1,262 @@
|
||||
trade_date,score,ts_code
|
||||
2024-03-01,0.48944956369028353,600515.SH
|
||||
2024-03-04,0.4910973129007265,601377.SH
|
||||
2024-03-05,0.49755441935533035,601377.SH
|
||||
2024-03-06,0.48530227310531543,000950.SZ
|
||||
2024-03-07,0.4852382924633875,002252.SZ
|
||||
2024-03-08,0.4295690224121115,000650.SZ
|
||||
2024-03-11,0.4772666567296332,002539.SZ
|
||||
2024-03-12,0.44912255948878027,002030.SZ
|
||||
2024-03-13,0.48036470172022555,603366.SH
|
||||
2024-03-14,0.47455902892485136,002616.SZ
|
||||
2024-03-15,0.4898609892396381,002616.SZ
|
||||
2024-03-18,0.4898609892396381,002616.SZ
|
||||
2024-03-19,0.4585803401938102,600749.SH
|
||||
2024-03-20,0.4717940942781549,601616.SH
|
||||
2024-03-21,0.4898609892396381,002616.SZ
|
||||
2024-03-22,0.4921881826907514,600268.SH
|
||||
2024-03-25,0.4529948903273269,000910.SZ
|
||||
2024-03-26,0.468535149074664,600163.SH
|
||||
2024-03-27,0.45345883846621166,600268.SH
|
||||
2024-03-28,0.4629678911934166,603689.SH
|
||||
2024-03-29,0.4576699667200496,600479.SH
|
||||
2024-04-01,0.4880682627555449,000407.SZ
|
||||
2024-04-02,0.4658882567202438,002228.SZ
|
||||
2024-04-03,0.46661866733947643,600279.SH
|
||||
2024-04-08,0.4348058590086616,601199.SH
|
||||
2024-04-09,0.440404827771838,600279.SH
|
||||
2024-04-10,0.45436095343545857,600279.SH
|
||||
2024-04-11,0.4860610115557819,002807.SZ
|
||||
2024-04-12,0.49251811801038575,002807.SZ
|
||||
2024-04-15,0.45282134310413785,002267.SZ
|
||||
2024-04-16,0.44573083040693495,601200.SH
|
||||
2024-04-17,0.4755082230960848,600681.SH
|
||||
2024-04-18,0.46969708080904654,600681.SH
|
||||
2024-04-19,0.4763960575203315,601963.SH
|
||||
2024-04-22,0.4797611675632624,601006.SH
|
||||
2024-04-23,0.4417060612494253,600905.SH
|
||||
2024-04-24,0.467078149060861,600681.SH
|
||||
2024-04-25,0.48593234463506524,601016.SH
|
||||
2024-04-26,0.485704669384595,600248.SH
|
||||
2024-04-29,0.4856232616952012,000088.SZ
|
||||
2024-04-30,0.47658127432737996,600717.SH
|
||||
2024-05-06,0.485599247202529,600057.SH
|
||||
2024-05-07,0.48502517567793635,600928.SH
|
||||
2024-05-08,0.4875800938455082,601997.SH
|
||||
2024-05-09,0.4764814479757618,601990.SH
|
||||
2024-05-10,0.4767299756060024,601187.SH
|
||||
2024-05-13,0.48339009051626075,002252.SZ
|
||||
2024-05-14,0.49804720312809037,601236.SH
|
||||
2024-05-15,0.4754440788180303,601669.SH
|
||||
2024-05-16,0.47188100886784723,002252.SZ
|
||||
2024-05-17,0.4738812084576224,000505.SZ
|
||||
2024-05-20,0.47680745811007763,600300.SH
|
||||
2024-05-21,0.4914822821325402,000919.SZ
|
||||
2024-05-22,0.43568399742751623,603777.SH
|
||||
2024-05-23,0.48230237417866073,000919.SZ
|
||||
2024-05-24,0.45217189085141946,600846.SH
|
||||
2024-05-27,0.4822386303700534,000012.SZ
|
||||
2024-05-28,0.4911043379651643,000012.SZ
|
||||
2024-05-29,0.4593967921412618,000012.SZ
|
||||
2024-05-30,0.46327623228814613,600033.SH
|
||||
2024-05-31,0.48434217554317305,601006.SH
|
||||
2024-06-03,0.49496311907072943,601158.SH
|
||||
2024-06-04,0.49311491712360267,601158.SH
|
||||
2024-06-05,0.4662101696226407,600681.SH
|
||||
2024-06-06,0.44598437609141306,600507.SH
|
||||
2024-06-07,0.46683069546117434,002936.SZ
|
||||
2024-06-11,0.48392216170099567,002936.SZ
|
||||
2024-06-12,0.48392216170099567,002936.SZ
|
||||
2024-06-13,0.4856066420239958,601555.SH
|
||||
2024-06-14,0.4626378515738043,600273.SH
|
||||
2024-06-17,0.4930611701154713,601868.SH
|
||||
2024-06-18,0.4850760483461876,600909.SH
|
||||
2024-06-19,0.48789755402053436,600909.SH
|
||||
2024-06-20,0.47168404865503394,600918.SH
|
||||
2024-06-21,0.4780215106901423,002939.SZ
|
||||
2024-06-24,0.4670168740500108,600507.SH
|
||||
2024-06-25,0.4462181161419812,600507.SH
|
||||
2024-06-26,0.47871585166988867,601108.SH
|
||||
2024-06-27,0.4550530100208326,601108.SH
|
||||
2024-06-28,0.4682668022638408,600056.SH
|
||||
2024-07-01,0.4956846242134755,601555.SH
|
||||
2024-07-02,0.4855061855012652,601375.SH
|
||||
2024-07-03,0.48208040944932096,000166.SZ
|
||||
2024-07-04,0.4741422882624783,002763.SZ
|
||||
2024-07-05,0.4207747223564723,600279.SH
|
||||
2024-07-08,0.4505916830616898,000778.SZ
|
||||
2024-07-09,0.47552387696457243,600108.SH
|
||||
2024-07-10,0.48433484852590997,601990.SH
|
||||
2024-07-11,0.4670143394143526,600597.SH
|
||||
2024-07-12,0.48598259773635294,601228.SH
|
||||
2024-07-15,0.46747598383004435,600704.SH
|
||||
2024-07-16,0.46517324677556354,601228.SH
|
||||
2024-07-17,0.4669329317249588,600681.SH
|
||||
2024-07-18,0.4342287048693295,603886.SH
|
||||
2024-07-19,0.4688897638752658,002936.SZ
|
||||
2024-07-22,0.4655336697859469,601008.SH
|
||||
2024-07-23,0.4385299287889017,000959.SZ
|
||||
2024-07-24,0.40980850428167526,600059.SH
|
||||
2024-07-25,0.4354808457559223,600928.SH
|
||||
2024-07-26,0.43863748866816843,002936.SZ
|
||||
2024-07-29,0.4757869671100767,002936.SZ
|
||||
2024-07-30,0.45055950278777573,600056.SH
|
||||
2024-07-31,0.43441391812299834,000672.SZ
|
||||
2024-08-01,0.46163952351711074,600597.SH
|
||||
2024-08-02,0.45912773679288216,601528.SH
|
||||
2024-08-05,0.42637336325271274,000012.SZ
|
||||
2024-08-06,0.44417354831155165,000498.SZ
|
||||
2024-08-07,0.46761174293945373,002029.SZ
|
||||
2024-08-08,0.44084265961702834,600061.SH
|
||||
2024-08-09,0.44756951600219536,600061.SH
|
||||
2024-08-12,0.43865468410238356,600517.SH
|
||||
2024-08-13,0.40119132713124916,600798.SH
|
||||
2024-08-14,0.4147687048594177,600279.SH
|
||||
2024-08-15,0.4285641100025751,600283.SH
|
||||
2024-08-16,0.43481178678490995,600279.SH
|
||||
2024-08-19,0.4334514206803078,000055.SZ
|
||||
2024-08-20,0.4271544196243579,600925.SH
|
||||
2024-08-21,0.44140030057394675,601718.SH
|
||||
2024-08-22,0.4260043286627239,002266.SZ
|
||||
2024-08-23,0.4272469992425721,002266.SZ
|
||||
2024-08-26,0.43298605818458036,601718.SH
|
||||
2024-08-27,0.42993335252901094,600308.SH
|
||||
2024-08-28,0.44014909636073596,601216.SH
|
||||
2024-08-29,0.4549086267823673,002108.SZ
|
||||
2024-08-30,0.4432950051114155,603817.SH
|
||||
2024-09-02,0.44924217940667366,603759.SH
|
||||
2024-09-03,0.45929491873592476,603759.SH
|
||||
2024-09-04,0.4555291720504659,603817.SH
|
||||
2024-09-05,0.45486187663776934,000581.SZ
|
||||
2024-09-06,0.458767486527876,600016.SH
|
||||
2024-09-09,0.42622859802922114,000725.SZ
|
||||
2024-09-10,0.47344109719180894,002239.SZ
|
||||
2024-09-11,0.4602775423090333,600050.SH
|
||||
2024-09-12,0.4581305095531178,603856.SH
|
||||
2024-09-13,0.48196532955068866,002697.SZ
|
||||
2024-09-18,0.48196532955068866,002697.SZ
|
||||
2024-09-19,0.4448663482243619,600210.SH
|
||||
2024-09-20,0.4894480847057984,600293.SH
|
||||
2024-09-23,0.48392216170099567,002239.SZ
|
||||
2024-09-24,0.43493260807615913,603182.SH
|
||||
2024-09-25,0.4279590881784511,601369.SH
|
||||
2024-09-26,0.457735727285402,600526.SH
|
||||
2024-09-27,0.464687096497251,002818.SZ
|
||||
2024-09-30,0.21397500413643297,600081.SH
|
||||
2024-10-08,0.22763716829204592,605333.SH
|
||||
2024-10-09,0.44498910797127905,600495.SH
|
||||
2024-10-10,0.4635725472634731,600251.SH
|
||||
2024-10-11,0.46083590103602623,000967.SZ
|
||||
2024-10-14,0.47310129519500743,600251.SH
|
||||
2024-10-15,0.4432757845922387,601727.SH
|
||||
2024-10-16,0.49142274407028197,002267.SZ
|
||||
2024-10-17,0.4969633584025033,600032.SH
|
||||
2024-10-18,0.49050625194789943,002267.SZ
|
||||
2024-10-21,0.4839204785725789,000709.SZ
|
||||
2024-10-22,0.47843667953797847,002237.SZ
|
||||
2024-10-23,0.4785259848937853,601577.SH
|
||||
2024-10-24,0.48592022305116356,601577.SH
|
||||
2024-10-25,0.4799806860019888,600820.SH
|
||||
2024-10-28,0.4616976069244002,002135.SZ
|
||||
2024-10-29,0.48907368099203313,600330.SH
|
||||
2024-10-30,0.47786058189302977,600516.SH
|
||||
2024-10-31,0.46572447890972274,600969.SH
|
||||
2024-11-01,0.46269570933167936,600261.SH
|
||||
2024-11-04,0.4819410916205754,600249.SH
|
||||
2024-11-05,0.4866489103957071,603167.SH
|
||||
2024-11-06,0.485704669384595,600423.SH
|
||||
2024-11-07,0.4906974157098039,600249.SH
|
||||
2024-11-08,0.47099968069944914,002328.SZ
|
||||
2024-11-11,0.47924756292999116,600103.SH
|
||||
2024-11-12,0.4864515748363791,600103.SH
|
||||
2024-11-13,0.4682636704183581,600419.SH
|
||||
2024-11-14,0.47314641913653194,000533.SZ
|
||||
2024-11-15,0.48200633444426155,600103.SH
|
||||
2024-11-18,0.4734312305146418,600300.SH
|
||||
2024-11-19,0.481591587838489,601113.SH
|
||||
2024-11-20,0.485704669384595,600493.SH
|
||||
2024-11-21,0.48103812387602335,600515.SH
|
||||
2024-11-22,0.48907368099203313,600284.SH
|
||||
2024-11-25,0.49351892138415066,603111.SH
|
||||
2024-11-26,0.49311491712360267,601187.SH
|
||||
2024-11-27,0.49823564043257407,601187.SH
|
||||
2024-11-28,0.49478522113574214,002091.SZ
|
||||
2024-11-29,0.4744243783704977,002390.SZ
|
||||
2024-12-02,0.47703304354620096,002390.SZ
|
||||
2024-12-03,0.48674050526244056,002566.SZ
|
||||
2024-12-04,0.4959275225246577,002753.SZ
|
||||
2024-12-05,0.48839819807517926,002753.SZ
|
||||
2024-12-06,0.4870618149295468,603128.SH
|
||||
2024-12-09,0.49249410351771356,600425.SH
|
||||
2024-12-10,0.4959275225246577,002772.SZ
|
||||
2024-12-11,0.48502517567793635,600035.SH
|
||||
2024-12-12,0.48907368099203313,600035.SH
|
||||
2024-12-13,0.4901095168698787,600035.SH
|
||||
2024-12-16,0.485704669384595,603577.SH
|
||||
2024-12-17,0.42271811250681296,000533.SZ
|
||||
2024-12-18,0.45253512664134277,000026.SZ
|
||||
2024-12-19,0.4668403018194925,601000.SH
|
||||
2024-12-20,0.47643624443298405,000026.SZ
|
||||
2024-12-23,0.4405301681206755,600305.SH
|
||||
2024-12-24,0.44052806294048863,000883.SZ
|
||||
2024-12-25,0.448810439825342,000589.SZ
|
||||
2024-12-26,0.47338854249265216,600582.SH
|
||||
2024-12-27,0.4495112908067394,000830.SZ
|
||||
2024-12-30,0.463164320301011,601006.SH
|
||||
2024-12-31,0.4773178744459276,600004.SH
|
||||
2025-01-02,0.47702414327290926,600572.SH
|
||||
2025-01-03,0.4474782636368997,601163.SH
|
||||
2025-01-06,0.4333771722554744,600821.SH
|
||||
2025-01-07,0.44770531932040636,600004.SH
|
||||
2025-01-08,0.47082512104142743,600116.SH
|
||||
2025-01-09,0.4541624257750102,600004.SH
|
||||
2025-01-10,0.4505883376349118,600905.SH
|
||||
2025-01-13,0.4505883376349118,600905.SH
|
||||
2025-01-14,0.4629277092655053,000088.SZ
|
||||
2025-01-15,0.4849467618585074,601222.SH
|
||||
2025-01-16,0.4904213884330184,600273.SH
|
||||
2025-01-17,0.49427977331421474,002267.SZ
|
||||
2025-01-20,0.4921881826907514,000088.SZ
|
||||
2025-01-21,0.49980885843191936,002233.SZ
|
||||
2025-01-22,0.49226959038014517,603817.SH
|
||||
2025-01-23,0.4845623679956794,600731.SH
|
||||
2025-01-24,0.4894480847057984,002443.SZ
|
||||
2025-01-27,0.480828304199342,600475.SH
|
||||
2025-02-05,0.49381191740852215,600475.SH
|
||||
2025-02-06,0.48051107405830695,600219.SH
|
||||
2025-02-07,0.4912337545022996,002365.SZ
|
||||
2025-02-10,0.4891404807504195,002606.SZ
|
||||
2025-02-11,0.48918316877240914,002454.SZ
|
||||
2025-02-12,0.48841224882795287,605138.SH
|
||||
2025-02-13,0.49427977331421474,603022.SH
|
||||
2025-02-14,0.4961990011809636,002454.SZ
|
||||
2025-02-17,0.49980885843191936,603172.SH
|
||||
2025-02-18,0.4859761480409009,600526.SH
|
||||
2025-02-19,0.47951904158629705,600526.SH
|
||||
2025-02-20,0.4865640468808261,002972.SZ
|
||||
2025-02-21,0.4878226668596109,002972.SZ
|
||||
2025-02-24,0.49427977331421474,002972.SZ
|
||||
2025-02-25,0.4959275225246577,002972.SZ
|
||||
2025-02-26,0.49559758720502334,000850.SZ
|
||||
2025-02-27,0.4962265008275085,600969.SH
|
||||
2025-02-28,0.45857905703475876,000931.SZ
|
||||
2025-03-03,0.475929514181944,600704.SH
|
||||
2025-03-04,0.4855766924008325,603176.SH
|
||||
2025-03-05,0.48947041607005387,600749.SH
|
||||
2025-03-06,0.48947041607005387,600749.SH
|
||||
2025-03-07,0.48852295338866114,002948.SZ
|
||||
2025-03-10,0.48013938012027435,600749.SH
|
||||
2025-03-11,0.4697442502219659,603916.SH
|
||||
2025-03-12,0.46825376442882116,600969.SH
|
||||
2025-03-13,0.485704669384595,601311.SH
|
||||
2025-03-14,0.47290759442027386,600784.SH
|
||||
2025-03-17,0.48236846890282975,002204.SZ
|
||||
2025-03-18,0.4753161955607809,600784.SH
|
||||
2025-03-19,0.4898609892396381,002627.SZ
|
||||
2025-03-20,0.4898609892396381,002627.SZ
|
||||
2025-03-21,0.47660773064492085,000589.SZ
|
||||
2025-03-24,0.4753538392607698,000589.SZ
|
||||
2025-03-25,0.4628733846203298,000589.SZ
|
||||
2025-03-26,0.45948501496487415,603367.SH
|
||||
2025-03-27,0.47591884404751766,600017.SH
|
||||
2025-03-28,0.4765671044505851,600925.SH
|
||||
|
1116
code/train/predictions_train.tsv
Normal file
1116
code/train/predictions_train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
738
code/train/utils/factor.py
Normal file
738
code/train/utils/factor.py
Normal file
@@ -0,0 +1,738 @@
|
||||
import numpy as np
|
||||
import talib
|
||||
import pandas as pd
|
||||
|
||||
def get_technical_factor(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-ticker rolling technical factors to a tushare daily-bar frame.

    Expects columns: ts_code, trade_date, open, high, low, close, vol,
    pct_chg, turnover_rate, volume_ratio.  Sorts by (ts_code, trade_date),
    mutates the frame in place and returns it.  Columns prefixed ``cat_``
    are boolean event flags.
    """
    # Sort so every rolling window runs forward in time within each ticker.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)

    # Distribution shape of the last 5 daily returns.
    df['return_skew'] = grouped['pct_chg'].rolling(window=5).skew().reset_index(0, drop=True)
    df['return_kurtosis'] = grouped['pct_chg'].rolling(window=5).kurt().reset_index(0, drop=True)

    # Factor 1: short-term volume change rate (2-day vs 5-day mean volume).
    df['volume_change_rate'] = (
            grouped['vol'].rolling(window=2).mean() /
            grouped['vol'].rolling(window=5).mean() - 1
    ).reset_index(level=0, drop=True)  # drop the ts_code index level to realign with df

    # Factor 2: volume breakout flag.
    # NOTE(review): the 5-day window includes today, so vol can never exceed
    # its own window max — this flag is always False.  A trailing ``.shift(1)``
    # on the rolling max was probably intended; confirm before relying on it.
    max_volume = grouped['vol'].rolling(window=5).max().reset_index(level=0, drop=True)  # realign index
    df['cat_volume_breakout'] = (df['vol'] > max_volume)

    # Factor 3: turnover deviation from its 3-day mean, in 3-day stds.
    mean_turnover = grouped['turnover_rate'].rolling(window=3).mean().reset_index(level=0, drop=True)
    std_turnover = grouped['turnover_rate'].rolling(window=3).std().reset_index(level=0, drop=True)
    df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover

    # Factor 4: turnover spike flag (above mean + 2 std).
    df['cat_turnover_spike'] = (df['turnover_rate'] > mean_turnover + 2 * std_turnover)

    # Factor 5: 3-day mean of the volume ratio.
    df['avg_volume_ratio'] = grouped['volume_ratio'].rolling(window=3).mean().reset_index(level=0, drop=True)

    # Factor 6: volume-ratio breakout flag.
    # NOTE(review): same window-includes-today issue as factor 2 — always False.
    max_volume_ratio = grouped['volume_ratio'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_ratio_breakout'] = (df['volume_ratio'] > max_volume_ratio)

    # Factor 7: blended volume/turnover momentum.
    alpha = 0.5  # fixed blend weight between the two components
    df['momentum_factor'] = df['volume_change_rate'] + alpha * df['turnover_deviation']

    # Factor 8: volume-price resonance (volume ratio times daily return).
    df['price_change_rate'] = grouped['close'].pct_change()
    df['resonance_factor'] = df['volume_ratio'] * df['price_change_rate']

    # Log price level (scale-free feature).
    df['log_close'] = np.log(df['close'])

    # 20-day average volume per ticker, plus a >2x spike flag.
    df['vol_spike'] = grouped.apply(
        lambda x: pd.Series(x['vol'].rolling(20).mean(), index=x.index)
    )
    df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']
    # NOTE(review): computed over the whole frame, not per ticker — the window
    # crosses ticker boundaries at group edges; confirm this is intended.
    df['vol_std_5'] = df['vol'].pct_change().rolling(5).std()

    # Upper/lower candle shadows, normalized by the close.
    df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']
    df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']

    # ATR over two horizons (talib, computed per ticker).
    df['atr_14'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=14),
                            index=x.index)
    )
    df['atr_6'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=6),
                            index=x.index)
    )

    # OBV, its 6-day SMA, and the gap between them.
    # NOTE(review): ``grouped`` was built before 'obv' existed; pandas GroupBy
    # slices its frame lazily, so the second apply does see the new column —
    # verify against the pinned pandas version.
    df['obv'] = grouped.apply(
        lambda x: pd.Series(talib.OBV(x['close'].values, x['vol'].values), index=x.index)
    )
    df['maobv_6'] = grouped.apply(
        lambda x: pd.Series(talib.SMA(x['obv'].values, timeperiod=6), index=x.index)
    )
    df['obv-maobv_6'] = df['obv'] - df['maobv_6']

    # RSI over three horizons (talib, per ticker).
    df['rsi_3'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=3), index=x.index)
    )
    df['rsi_6'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=6), index=x.index)
    )
    df['rsi_9'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=9), index=x.index)
    )

    # Trailing simple returns over 5/10/20 days.
    df['return_5'] = grouped['close'].apply(lambda x: x / x.shift(5) - 1)
    df['return_10'] = grouped['close'].apply(lambda x: x / x.shift(10) - 1)
    df['return_20'] = grouped['close'].apply(lambda x: x / x.shift(20) - 1)

    # df['avg_close_5'] = grouped['close'].apply(lambda x: x.rolling(window=5).mean() / x)

    # Realized volatility of daily returns over several windows;
    # std_return_90_2 is the same 90-day window lagged by 10 days.
    df['std_return_5'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=5).std())
    df['std_return_15'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=15).std())
    df['std_return_25'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=25).std())
    df['std_return_90'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=90).std())
    df['std_return_90_2'] = grouped['close'].apply(lambda x: x.shift(10).pct_change().rolling(window=90).std())

    # Volatility term-structure ratios (short vs long horizon).
    df['std_return_5 / std_return_90'] = df['std_return_5'] / df['std_return_90']
    df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']

    # 10-day change in the 90-day volatility level.
    df['std_return_90 - std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']

    return df
|
||||
|
||||
|
||||
def get_act_factor(df, cat=True):
    """Compute EMA-slope ("activity") factors per ticker.

    Each ``act_factorN`` is the arctangent (in degrees, via the 57.3 scale)
    of the one-day percentage slope of an EMA, normalized by a fixed divisor.
    When *cat* is true, boolean ``cat_afN`` flags compare adjacent factors.
    The frame is sorted by (ts_code, trade_date), mutated in place and
    returned.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)

    # (EMA period, slope divisor) for act_factor1..4, in that order.
    specs = [(5, 50), (13, 40), (20, 21), (60, 10)]

    # Per-ticker EMAs of the close (talib works on raw ndarrays, so the
    # result is re-wrapped with the group's index).
    for period, _ in specs:
        df[f'_ema_{period}'] = grouped['close'].apply(
            lambda s, p=period: pd.Series(talib.EMA(s.values, timeperiod=p), index=s.index)
        )

    # Slope-angle factors: arctan of the EMA's daily % change, scaled.
    for idx, (period, divisor) in enumerate(specs, start=1):
        df[f'act_factor{idx}'] = grouped[f'_ema_{period}'].apply(
            lambda s, d=divisor: np.arctan((s / s.shift(1) - 1) * 100) * 57.3 / d
        )

    if cat:
        # Flags: factor 1 positive, and each longer-horizon factor above the
        # previous one.
        df['cat_af1'] = df['act_factor1'] > 0
        for k in (2, 3, 4):
            df[f'cat_af{k}'] = df[f'act_factor{k}'] > df[f'act_factor{k - 1}']

    # Composite factors: the plain sum, and the normalized short-vs-mid gap.
    af1, af2 = df['act_factor1'], df['act_factor2']
    df['act_factor5'] = af1 + af2 + df['act_factor3'] + df['act_factor4']
    df['act_factor6'] = (af1 - af2) / np.sqrt(af1 ** 2 + af2 ** 2)

    # Cross-sectional (per trade_date) percentile ranks, highest factor first.
    by_date = df.groupby('trade_date', group_keys=False)
    for k in (1, 2, 3):
        df[f'rank_act_factor{k}'] = by_date[f'act_factor{k}'].rank(ascending=False, pct=True)

    return df
|
||||
|
||||
|
||||
def get_money_flow_factor(df):
    """Money-flow factors from tushare ``moneyflow`` fields.

    Scales the small/large/extra-large buy volumes (and the large /
    extra-large net buy volumes) by the day's net money-flow volume, and
    adds the log float market cap.  Mutates and returns *df*.
    """
    net_flow = df['net_mf_vol']

    # Gross active buying by order size, as a share of net flow.
    for out_col, src_col in (('active_buy_volume_large', 'buy_lg_vol'),
                             ('active_buy_volume_big', 'buy_elg_vol'),
                             ('active_buy_volume_small', 'buy_sm_vol')):
        df[out_col] = df[src_col] / net_flow

    # Net buying (buy minus sell) for the two largest order buckets.
    df['buy_lg_vol_minus_sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / net_flow
    df['buy_elg_vol_minus_sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / net_flow

    # Log float market cap as a size control.
    df['log(circ_mv)'] = np.log(df['circ_mv'])
    return df
|
||||
|
||||
|
||||
def get_alpha_factor(df):
    """Compute a small set of price/volume "alpha" factors.

    Expects tushare daily bars (ts_code, trade_date, open/high/low/close,
    vol).  Sorts by (ts_code, trade_date), mutates the frame in place and
    returns it.  Adds: alpha_22_improved, alpha_003, alpha_007, alpha_013
    (plus the intermediate cov / delta_cov / _rank_stddev columns).
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code')

    # df['alpha_022'] = grouped['close'].transform(lambda x: x - x.shift(5))
    def rolling_covariance(x, y, window):
        # Rolling covariance between two aligned series.
        return x.rolling(window).cov(y)

    def delta(series, period):
        # Difference against the value *period* rows back.
        return series.diff(period)

    def rank(series):
        # Percentile rank over the whole column.
        return series.rank(pct=True)

    def stddev(series, window):
        return series.rolling(window).std()

    # Improved "Alpha 22": -delta(cov(high, vol, 5), 5) * rank(std(close, 20)).
    # NOTE(review): these rolling windows run over the full sorted frame and
    # so cross ticker boundaries — kept as in the original implementation.
    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5

    # BUGFIX: the volume column is named 'vol' throughout this module
    # (tushare field name); 'volume' does not exist and raised a KeyError.
    df['cov'] = rolling_covariance(df['high'], df['vol'], window_high_volume)
    df['delta_cov'] = delta(df['cov'], period_delta)
    df['_rank_stddev'] = rank(stddev(df['close'], window_close_stddev))
    df['alpha_22_improved'] = -1 * df['delta_cov'] * df['_rank_stddev']

    # alpha_003: intraday body relative to the day's range (0 when high == low).
    df['alpha_003'] = np.where(df['high'] != df['low'],
                               (df['close'] - df['open']) / (df['high'] - df['low']),
                               0)

    # alpha_007: 5-day close/volume correlation per ticker, then ranked
    # cross-sectionally within each trade_date.
    df['alpha_007'] = grouped.apply(lambda x: x['close'].rolling(5).corr(x['vol'])).reset_index(level=0, drop=True)
    df['alpha_007'] = df.groupby('trade_date', group_keys=False)['alpha_007'].rank(ascending=True, pct=True)

    # alpha_013: 5-day minus 20-day close sum per ticker, ranked per trade_date.
    df['alpha_013'] = grouped['close'].transform(lambda x: x.rolling(5).sum() - x.rolling(20).sum())
    df['alpha_013'] = df.groupby('trade_date', group_keys=False)['alpha_013'].rank(ascending=True, pct=True)

    return df
|
||||
|
||||
|
||||
def get_limit_factor(df):
    """Limit-up / limit-down (price-limit) factors, computed per ticker.

    Adds today's limit flags (close sitting exactly on the limit price),
    rolling 10-day limit counts, and the current consecutive limit-up
    streak length.  Mutates and returns *df* sorted by
    (ts_code, trade_date).
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)

    # 1. Today's flags: 1 when the close equals the limit price, else 0.
    df['cat_up_limit'] = (df['close'] == df['up_limit']).astype(int)
    df['cat_down_limit'] = (df['close'] == df['down_limit']).astype(int)

    # 2. Rolling 10-day limit counts (min_periods=1 so early rows still count).
    for direction in ('up', 'down'):
        counts = grouped[f'cat_{direction}_limit'].rolling(window=10, min_periods=1).sum()
        df[f'{direction}_limit_count_10d'] = counts.reset_index(level=0, drop=True)

    # 3. Current consecutive limit-up streak length.
    def _streak_length(flags):
        """Length of the current run of 1s in a 0/1 series (0 where off)."""
        run_id = (flags != flags.shift()).cumsum()
        return flags * (flags.groupby(run_id).cumcount() + 1)

    df['consecutive_up_limit'] = grouped['cat_up_limit'].apply(
        _streak_length
    ).reset_index(level=0, drop=True)

    # The symmetric consecutive limit-down streak was deliberately left
    # disabled in the original implementation.

    return df
|
||||
|
||||
|
||||
def get_cyp_perf_factor(df):
    """Chip-distribution (cyq) performance factors, one row per ticker/day.

    Expects tushare ``cyq_perf`` cost-distribution columns (cost_*pct,
    his_high / his_low, weight_avg, winner_rate) alongside daily bars.
    Mutates and returns *df* sorted by (ts_code, trade_date).
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    by_code = df.groupby('ts_code', group_keys=False)

    # Shared cost-distribution spans, reused by several factors below.
    his_range = df['his_high'] - df['his_low']
    core_span = df['cost_50pct'] - df['cost_5pct']
    wide_span = df['cost_95pct'] - df['cost_5pct']
    concentration = df['cost_85pct'] - df['cost_15pct']

    # How tightly the middle 70% of chips sit inside the historical range.
    df['ctrl_strength'] = concentration / his_range

    # Where the close sits between the cheapest chips and the median cost.
    df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / core_span

    # Skew of the cost distribution around its median.
    df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / core_span

    # Turnover damped by how much of the range the chips occupy.
    df['lock_factor'] = df['turnover_rate'] * (1 - wide_span / his_range)

    # Breakout flag: close above the 85% cost line on heavy relative volume.
    df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)

    # 5-day rate of change of the weighted average cost, per ticker.
    df['weight_roc5'] = by_code['weight_avg'].apply(lambda s: s.pct_change(5))

    def _price_cost_corr(g):
        # 10-day correlation between price returns and avg-cost returns.
        return g['close'].pct_change().rolling(10).corr(g['weight_avg'].pct_change())

    df['price_cost_divergence'] = by_code.apply(_price_cost_corr)

    def _true_range_ma(g):
        # Classic 14-day ATR from the three true-range candidates.
        prev_close = g['close'].shift()
        tr = np.maximum(g['high'] - g['low'],
                        np.maximum(abs(g['high'] - prev_close),
                                   abs(g['low'] - prev_close)))
        return tr.rolling(14).mean()

    df['atr_14'] = by_code.apply(_true_range_ma)
    df['cost_atr_adj'] = wide_span / df['atr_14']

    # 12. Chip concentration scaled by the inverse of float market cap.
    df['smallcap_concentration'] = (1 / df['circ_mv']) * concentration

    # 16. 20-day stability of the weighted average cost (std / mean).
    weight_std20 = by_code['weight_avg'].apply(lambda s: s.rolling(20).std())
    df['cost_stability'] = weight_std20 / by_code['weight_avg'].transform(lambda s: s.rolling(20).mean())

    # 17. Days in the last 5 the close broke above the 95% cost line.
    df['high_cost_break_days'] = by_code.apply(lambda g: g['close'].gt(g['cost_95pct']).rolling(5).sum())

    # 18. Composite "golden resonance": price above average cost, heavy
    # relative volume, most holders in profit.
    df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &
                                  (df['volume_ratio'] > 1.5) &
                                  (df['winner_rate'] > 0.7))

    # 20. Wide cost span relative to 10-day average volume (illiquidity risk).
    df['liquidity_risk'] = wide_span * (1 / by_code['vol'].transform(lambda s: s.rolling(10).mean()))

    return df
|
||||
|
||||
|
||||
def get_mv_factors(df):
    """
    Market-cap-scaled liquidity factors.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ts_code, trade_date, turnover_rate, circ_mv,
        volume_ratio and vol (tushare daily / daily_basic fields).

    Returns
    -------
    pd.DataFrame
        The input frame, sorted by (ts_code, trade_date), with the new
        factor columns added in place.
    """
    # Sort by ticker and date so per-ticker windows run forward in time.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)

    # 1. Turnover per unit of float market cap.
    df['mv_turnover_ratio'] = df['turnover_rate'] / df['circ_mv']

    # 2. Volume per unit of float market cap.
    df['mv_adjusted_volume'] = df['vol'] / df['circ_mv']

    # 3. Market-cap weighted turnover (same value as factor 1 by construction).
    df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / df['circ_mv'])

    # 4. "Non-linear" cap/volume factor.
    # NOTE(review): identical to mv_adjusted_volume — presumably a non-linear
    # cap transform was intended; kept as-is for output compatibility.
    df['nonlinear_mv_volume'] = df['vol'] / df['circ_mv']

    # 5. Volume ratio per unit of float market cap.
    df['mv_volume_ratio'] = df['volume_ratio'] / df['circ_mv']

    # 6. Momentum proxy: turnover x volume ratio, cap-scaled.
    df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / df['circ_mv']

    # 7. Cap-scaled 20-day turnover volatility.
    # Rolling over a groupby yields a (ts_code, index) MultiIndex; dropping
    # level 0 restores label alignment with df.
    df['turnover_std'] = grouped['turnover_rate'].rolling(window=20).std().reset_index(level=0, drop=True)
    # BUGFIX: the old grouped.apply(...).reset_index(level=0, drop=True)
    # replaced the label index with a positional RangeIndex, misaligning rows
    # whenever the sorted frame's labels were not already 0..n-1.  The
    # division is element-wise, so compute it directly (and faster).
    df['mv_volatility'] = df['turnover_std'] / df['circ_mv']

    # 8. Cap-scaled 20-day volume growth.
    # BUGFIX: SeriesGroupBy.pct_change already returns a label-aligned series;
    # the old reset_index(level=0, drop=True) could misalign rows (same issue
    # as factor 7).
    df['volume_growth'] = grouped['vol'].pct_change(periods=20)
    df['mv_growth'] = df['volume_growth'] / df['circ_mv']

    return df
|
||||
|
||||
|
||||
import numpy as np
|
||||
import talib
|
||||
|
||||
|
||||
def get_rolling_factor(df: pd.DataFrame):
    """Compute the full rolling feature set for model training.

    Consolidates the factors from the other ``get_*_factor`` helpers into a
    single pass.  Expects tushare daily bars merged with daily_basic,
    moneyflow, stk_limit and cyq_perf fields.  Sorts by (ts_code,
    trade_date), mutates the frame in place and returns ``(df,
    new_columns)`` where ``new_columns`` lists only the columns added here.
    """
    # Snapshot the incoming columns so new feature names can be reported.
    old_columns = df.columns.tolist()[:]
    # Sort so every rolling window runs forward in time within each ticker.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)

    # Next-day open gap relative to today's close.
    # NOTE(review): this is forward-looking (uses tomorrow's open — a label,
    # not a feature, unless shifted) and the ungrouped shift(-1) leaks across
    # ticker boundaries at group edges.  Confirm intended usage.
    df["gap_next_open"] = (df["open"].shift(-1) - df["close"]) / df["close"]

    # Distribution shape of the last 5 daily returns.
    df['return_skew'] = grouped['pct_chg'].rolling(window=5).skew().reset_index(0, drop=True)
    df['return_kurtosis'] = grouped['pct_chg'].rolling(window=5).kurt().reset_index(0, drop=True)

    # Factor 1: short-term volume change rate (2-day vs 10-day mean volume;
    # note the longer denominator window than get_technical_factor's 5).
    df['volume_change_rate'] = (
            grouped['vol'].rolling(window=2).mean() /
            grouped['vol'].rolling(window=10).mean() - 1
    ).reset_index(level=0, drop=True)  # drop the ts_code index level to realign

    # Factor 2: volume breakout flag.
    # NOTE(review): the 5-day window includes today, so vol can never exceed
    # its own window max — this flag is always False; a trailing ``.shift(1)``
    # was probably intended.
    max_volume = grouped['vol'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_breakout'] = (df['vol'] > max_volume)

    # Factor 3: turnover deviation from its 3-day mean, in 3-day stds.
    mean_turnover = grouped['turnover_rate'].rolling(window=3).mean().reset_index(level=0, drop=True)
    std_turnover = grouped['turnover_rate'].rolling(window=3).std().reset_index(level=0, drop=True)
    df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover

    # Factor 4: turnover spike flag (above mean + 2 std).
    df['cat_turnover_spike'] = (df['turnover_rate'] > mean_turnover + 2 * std_turnover)

    # Factor 5: 3-day mean of the volume ratio.
    df['avg_volume_ratio'] = grouped['volume_ratio'].rolling(window=3).mean().reset_index(level=0, drop=True)

    # Factor 6: volume-ratio breakout flag.
    # NOTE(review): same window-includes-today issue as factor 2 — always False.
    max_volume_ratio = grouped['volume_ratio'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_ratio_breakout'] = (df['volume_ratio'] > max_volume_ratio)

    # 20-day average volume per ticker.
    df['vol_spike'] = grouped.apply(
        lambda x: pd.Series(x['vol'].rolling(20).mean(), index=x.index)
    )
    # NOTE(review): computed on the whole frame, not per ticker — the window
    # crosses ticker boundaries at group edges; confirm this is intended.
    df['vol_std_5'] = df['vol'].pct_change().rolling(5).std()

    # ATR over two horizons (talib, per ticker).
    df['atr_14'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=14),
                            index=x.index)
    )
    df['atr_6'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=6),
                            index=x.index)
    )

    # OBV and its 6-day SMA.
    # NOTE(review): ``grouped`` was built before 'obv' existed; pandas GroupBy
    # slices its frame lazily, so the second apply does see the new column —
    # verify against the pinned pandas version.
    df['obv'] = grouped.apply(
        lambda x: pd.Series(talib.OBV(x['close'].values, x['vol'].values), index=x.index)
    )
    df['maobv_6'] = grouped.apply(
        lambda x: pd.Series(talib.SMA(x['obv'].values, timeperiod=6), index=x.index)
    )

    # RSI over three horizons (talib, per ticker).
    df['rsi_3'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=3), index=x.index)
    )
    df['rsi_6'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=6), index=x.index)
    )
    df['rsi_9'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=9), index=x.index)
    )

    # Trailing simple returns over 5/10/20 days.
    df['return_5'] = grouped['close'].apply(lambda x: x / x.shift(5) - 1)
    df['return_10'] = grouped['close'].apply(lambda x: x / x.shift(10) - 1)
    df['return_20'] = grouped['close'].apply(lambda x: x / x.shift(20) - 1)

    # df['avg_close_5'] = grouped['close'].apply(lambda x: x.rolling(window=5).mean() / x)

    # Realized volatility of daily returns over several windows;
    # std_return_90_2 is the same 90-day window lagged by 10 days.
    df['std_return_5'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=5).std())
    df['std_return_15'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=15).std())
    df['std_return_25'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=25).std())
    df['std_return_90'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=90).std())
    df['std_return_90_2'] = grouped['close'].apply(lambda x: x.shift(10).pct_change().rolling(window=90).std())

    # Per-ticker EMAs of the close (talib).
    df['_ema_5'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=5), index=x.index)
    )
    df['_ema_13'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=13), index=x.index)
    )
    df['_ema_20'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=20), index=x.index)
    )
    df['_ema_60'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=60), index=x.index)
    )

    # EMA slope-angle factors: arctan of each EMA's daily % change, scaled
    # per horizon (divisors 50/40/21/10 mirror get_act_factor).
    df['act_factor1'] = grouped['_ema_5'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 50
    )
    df['act_factor2'] = grouped['_ema_13'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 40
    )
    df['act_factor3'] = grouped['_ema_20'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 21
    )
    df['act_factor4'] = grouped['_ema_60'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 10
    )

    # Cross-sectional (per trade_date) percentile ranks, highest factor first.
    df['rank_act_factor1'] = df.groupby('trade_date', group_keys=False)['act_factor1'].rank(ascending=False, pct=True)
    df['rank_act_factor2'] = df.groupby('trade_date', group_keys=False)['act_factor2'].rank(ascending=False, pct=True)
    df['rank_act_factor3'] = df.groupby('trade_date', group_keys=False)['act_factor3'].rank(ascending=False, pct=True)

    # Log float market cap as a size control.
    df['log(circ_mv)'] = np.log(df['circ_mv'])

    def rolling_covariance(x, y, window):
        # Rolling covariance between two aligned series.
        return x.rolling(window).cov(y)

    def delta(series, period):
        # Difference against the value *period* rows back.
        return series.diff(period)

    def rank(series):
        # Percentile rank over the whole column.
        return series.rank(pct=True)

    def stddev(series, window):
        return series.rolling(window).std()

    # Improved "Alpha 22": -delta(cov(high, vol, 5), 5) * rank(std(close, 20)).
    # NOTE(review): these rolling windows run over the full sorted frame and
    # so cross ticker boundaries.
    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5
    df['cov'] = rolling_covariance(df['high'], df['vol'], window_high_volume)
    df['delta_cov'] = delta(df['cov'], period_delta)
    df['_rank_stddev'] = rank(stddev(df['close'], window_close_stddev))
    df['alpha_22_improved'] = -1 * df['delta_cov'] * df['_rank_stddev']

    # alpha_003: intraday body relative to the day's range (0 when high == low).
    df['alpha_003'] = np.where(df['high'] != df['low'],
                               (df['close'] - df['open']) / (df['high'] - df['low']),
                               0)

    # alpha_007: 5-day close/volume correlation per ticker, ranked per day.
    df['alpha_007'] = grouped.apply(lambda x: x['close'].rolling(5).corr(x['vol'])).reset_index(level=0, drop=True)
    df['alpha_007'] = df.groupby('trade_date', group_keys=False)['alpha_007'].rank(ascending=True, pct=True)

    # alpha_013: 5-day minus 20-day close sum per ticker, ranked per day.
    df['alpha_013'] = grouped['close'].transform(lambda x: x.rolling(5).sum() - x.rolling(20).sum())
    df['alpha_013'] = df.groupby('trade_date', group_keys=False)['alpha_013'].rank(ascending=True, pct=True)

    # Price-limit flags (True when the close sits exactly on the limit price;
    # kept as booleans here, unlike get_limit_factor's ints).
    df['cat_up_limit'] = (df['close'] == df['up_limit'])  # limit-up flag
    df['cat_down_limit'] = (df['close'] == df['down_limit'])  # limit-down flag
    # Rolling 10-day limit counts (min_periods=1 so early rows still count).
    df['up_limit_count_10d'] = grouped['cat_up_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0,
                                                                                                           drop=True)
    df['down_limit_count_10d'] = grouped['cat_down_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0,
                                                                                                               drop=True)

    # 3. Current consecutive limit streak length.
    def calculate_consecutive_limits(series):
        """
        Return the running streak length of truthy values in *series*
        (both tuple members are computed identically; only [0] is used).
        """
        consecutive_up = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)
        consecutive_down = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)
        return consecutive_up, consecutive_down

    # Consecutive limit-up days per ticker.
    df['consecutive_up_limit'] = grouped['cat_up_limit'].apply(
        lambda x: calculate_consecutive_limits(x)[0]
    ).reset_index(level=0, drop=True)

    # Breakout flag: close above the 85% cost line on heavy relative volume.
    df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)

    # 5-day rate of change of the weighted average chip cost.
    df['weight_roc5'] = grouped['weight_avg'].apply(lambda x: x.pct_change(5))

    def rolling_corr(group):
        # 10-day correlation between price returns and avg-cost returns.
        roc_close = group['close'].pct_change()
        roc_weight = group['weight_avg'].pct_change()
        return roc_close.rolling(10).corr(roc_weight)

    df['price_cost_divergence'] = grouped.apply(rolling_corr)

    # Chip concentration scaled by the inverse of float market cap.
    df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])

    # 16. 20-day stability of the weighted average cost (std / mean).
    df['weight_std20'] = grouped['weight_avg'].apply(lambda x: x.rolling(20).std())
    df['cost_stability'] = df['weight_std20'] / grouped['weight_avg'].transform(lambda x: x.rolling(20).mean())

    # 17. Days in the last 5 the close broke above the 95% cost line.
    df['high_cost_break_days'] = grouped.apply(lambda g: g['close'].gt(g['cost_95pct']).rolling(5).sum())

    # 20. Wide cost span relative to 10-day average volume (illiquidity risk).
    df['liquidity_risk'] = (df['cost_95pct'] - df['cost_5pct']) * (
            1 / grouped['vol'].transform(lambda x: x.rolling(10).mean()))

    # 7. Cap-scaled 20-day turnover volatility.
    # NOTE(review): the trailing reset_index(level=0, drop=True) on the
    # element-wise apply / pct_change results below replaces the label index
    # with a positional RangeIndex — rows can misalign if the sorted frame's
    # labels are not already 0..n-1.  Confirm the callers always pass a
    # default-indexed frame.
    df['turnover_std'] = grouped['turnover_rate'].rolling(window=20).std().reset_index(level=0, drop=True)
    df['mv_volatility'] = grouped.apply(lambda x: x['turnover_std'] / x['circ_mv']).reset_index(level=0, drop=True)

    # 8. Cap-scaled 20-day volume growth (same indexing caveat as above).
    df['volume_growth'] = grouped['vol'].pct_change(periods=20).reset_index(level=0, drop=True)
    df['mv_growth'] = grouped.apply(lambda x: x['volume_growth'] / x['circ_mv']).reset_index(level=0, drop=True)

    # weight_std20 is an intermediate only; report the genuinely new columns.
    df.drop(columns=['weight_std20'], inplace=True, errors='ignore')
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]

    return df, new_columns
|
||||
|
||||
|
||||
def get_simple_factor(df):
    """
    Derive simple cross-sectional factor columns from precomputed per-stock fields.

    Assumes `df` already carries the upstream columns referenced below
    (price/volume fields, `std_return_*`, `act_factor1..4`, money-flow
    `buy_*/sell_*/net_mf_vol`, chip-cost `cost_*pct` / `his_high` / `his_low`,
    `weight_avg`, `winner_rate`, `atr_14`, `circ_mv`, …) — TODO confirm against
    the producing pipeline; missing columns raise KeyError.

    Parameters:
        df (pd.DataFrame): per-stock daily rows keyed by ('ts_code', 'trade_date').

    Returns:
        tuple[pd.DataFrame, list[str]]: the frame (sorted by ts_code/trade_date)
        with factor columns appended, and the list of newly added column names
        in creation order.
    """
    # Snapshot the existing columns so new factor names can be reported at the end.
    old_columns = df.columns.tolist()[:]
    df = df.sort_values(by=['ts_code', 'trade_date'])

    # Blend weight between volume change and turnover deviation.
    alpha = 0.5
    df['momentum_factor'] = df['volume_change_rate'] + alpha * df['turnover_deviation']
    df['resonance_factor'] = df['volume_ratio'] * df['pct_chg']
    df['log_close'] = np.log(df['close'])

    # Boolean flag: today's volume more than twice the `vol_spike` reference
    # (presumably a rolling baseline computed upstream — verify).
    df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']

    # Upper/lower candle shadows, normalized by close.
    df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']
    df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']

    # OBV deviation from its 6-period moving average.
    df['obv-maobv_6'] = df['obv'] - df['maobv_6']

    # Ratio indicators between return-volatility horizons.
    df['std_return_5 / std_return_90'] = df['std_return_5'] / df['std_return_90']
    df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']

    # Difference between the two 90-period volatility variants.
    df['std_return_90 - std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']

    # Categorical flags on the activity-factor ladder (each vs. the previous level).
    df['cat_af1'] = df['act_factor1'] > 0
    df['cat_af2'] = df['act_factor2'] > df['act_factor1']
    df['cat_af3'] = df['act_factor3'] > df['act_factor2']
    df['cat_af4'] = df['act_factor4'] > df['act_factor3']

    # Composite activity factors: sum, and normalized 1-vs-2 contrast.
    df['act_factor5'] = df['act_factor1'] + df['act_factor2'] + df['act_factor3'] + df['act_factor4']
    df['act_factor6'] = (df['act_factor1'] - df['act_factor2']) / np.sqrt(
        df['act_factor1'] ** 2 + df['act_factor2'] ** 2)

    # Active-buy volume shares by order size, scaled by net money flow.
    # NOTE(review): net_mf_vol can be zero or negative -> inf/sign flips; confirm
    # downstream consumers tolerate that.
    df['active_buy_volume_large'] = df['buy_lg_vol'] / df['net_mf_vol']
    df['active_buy_volume_big'] = df['buy_elg_vol'] / df['net_mf_vol']
    df['active_buy_volume_small'] = df['buy_sm_vol'] / df['net_mf_vol']

    # Net large / extra-large order imbalance, scaled by net money flow.
    df['buy_lg_vol_minus_sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / df['net_mf_vol']
    df['buy_elg_vol_minus_sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / df['net_mf_vol']

    # Log of circulating market value.
    df['log(circ_mv)'] = np.log(df['circ_mv'])

    # Chip-concentration width relative to the stock's historical price range.
    df['ctrl_strength'] = (df['cost_85pct'] - df['cost_15pct']) / (df['his_high'] - df['his_low'])

    # Position of close within the lower half of the cost distribution.
    df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / (df['cost_50pct'] - df['cost_5pct'])

    # Skew of the cost distribution: upper-half width over lower-half width.
    df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / (df['cost_50pct'] - df['cost_5pct'])

    # Turnover damped by how wide the chip-cost band is vs. the historical range.
    df['lock_factor'] = df['turnover_rate'] * (
        1 - (df['cost_95pct'] - df['cost_5pct']) / (df['his_high'] - df['his_low']))

    # Breakout above the 85th-percentile cost level on elevated volume.
    df['cat_vol_break'] = (df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2)

    # Cost-band width normalized by 14-period ATR.
    df['cost_atr_adj'] = (df['cost_95pct'] - df['cost_5pct']) / df['atr_14']

    # 12. Small-cap chip concentration (narrow cost band, small float).
    df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])

    # "Golden resonance": price above weighted-average cost, high relative
    # volume, and most holders in profit.
    df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &
                                  (df['volume_ratio'] > 1.5) &
                                  (df['winner_rate'] > 0.7))

    # Market-value-scaled liquidity/momentum variants.
    df['mv_turnover_ratio'] = df['turnover_rate'] / df['circ_mv']

    df['mv_adjusted_volume'] = df['vol'] / df['circ_mv']

    # NOTE(review): identical formula to mv_turnover_ratio above — kept because
    # downstream feature lists may reference both names.
    df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / df['circ_mv'])

    # NOTE(review): identical formula to mv_adjusted_volume above — kept for the
    # same reason.
    df['nonlinear_mv_volume'] = df['vol'] / df['circ_mv']

    df['mv_volume_ratio'] = df['volume_ratio'] / df['circ_mv']

    df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / df['circ_mv']

    # Drop any scratch columns produced with a leading-underscore convention.
    drop_columns = [col for col in df.columns if col.startswith('_')]
    df.drop(columns=drop_columns, inplace=True, errors='ignore')

    # Report only the columns added by this function, preserving creation order.
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
    return df, new_columns
|
||||
|
||||
|
||||
def calculate_indicators(df):
    """
    Compute daily technical and sentiment indicators for one instrument.

    Adds: daily_return, RSI (14-day, simple rolling averages), MACD /
    Signal_line / MACD_hist (12/26/9 EMA), up_ratio / up_ratio_20d,
    volume_mean / volume_change_rate, volatility (20-day return std),
    amount_mean / amount_change_rate.

    Parameters:
        df (pd.DataFrame): rows for a single instrument; must contain
            'trade_date', 'close', 'pre_close', 'vol' and 'amount'.

    Returns:
        pd.DataFrame: the frame sorted by 'trade_date' with the indicator
        columns appended.
    """
    df = df.sort_values('trade_date')

    # Daily percentage return relative to the previous close.
    df['daily_return'] = (df['close'] - df['pre_close']) / df['pre_close'] * 100

    # RSI(14) using simple rolling means of gains/losses (Cutler's variant).
    delta = df['close'].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss  # avg_loss == 0 -> rs = inf -> RSI = 100
    df['RSI'] = 100 - (100 / (1 + rs))

    # MACD(12, 26) with a 9-period EMA signal line.
    ema12 = df['close'].ewm(span=12, adjust=False).mean()
    ema26 = df['close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = ema12 - ema26
    df['Signal_line'] = df['MACD'].ewm(span=9, adjust=False).mean()
    df['MACD_hist'] = df['MACD'] - df['Signal_line']

    # 4. Sentiment factor 1: up-day flag and its 20-day mean (up-day ratio).
    # Vectorized comparison replaces the original row-wise apply(lambda);
    # NaN returns compare False and map to 0 in both forms.
    df['up_ratio'] = (df['daily_return'] > 0).astype(int)
    df['up_ratio_20d'] = df['up_ratio'].rolling(window=20).mean()

    # 5. Sentiment factor 2: volume deviation from its 20-day mean (%).
    df['volume_mean'] = df['vol'].rolling(window=20).mean()
    df['volume_change_rate'] = (df['vol'] - df['volume_mean']) / df['volume_mean'] * 100

    # 6. Sentiment factor 3: 20-day volatility of daily returns.
    df['volatility'] = df['daily_return'].rolling(window=20).std()

    # 7. Sentiment factor 4: turnover-amount deviation from its 20-day mean (%).
    df['amount_mean'] = df['amount'].rolling(window=20).mean()
    df['amount_change_rate'] = (df['amount'] - df['amount_mean']) / df['amount_mean'] * 100

    return df
|
||||
|
||||
|
||||
def generate_index_indicators(h5_filename):
    """
    Build a wide per-date indicator table from an HDF5 index-data store.

    Loads key 'index_data', runs calculate_indicators() for each ts_code
    separately, then pivots so each trade_date becomes one row with
    '<ts_code>_<indicator>' columns.

    Parameters:
        h5_filename (str): path to the HDF5 file holding key 'index_data'.

    Returns:
        pd.DataFrame: one row per trade_date; indicator values taken via
        aggfunc='last' where duplicates exist.
    """
    raw = pd.read_hdf(h5_filename, key='index_data')
    raw['trade_date'] = pd.to_datetime(raw['trade_date'], format='%Y%m%d')
    raw = raw.sort_values('trade_date')

    # Compute indicators independently per index code, preserving the order
    # in which codes first appear in the data.
    per_code_frames = [
        calculate_indicators(raw[raw['ts_code'] == code].copy())
        for code in raw['ts_code'].unique()
    ]

    # Stack all per-code results back into one long frame.
    combined = pd.concat(per_code_frames, ignore_index=True)

    # Keep trade_date as the row key and spread each (indicator, ts_code)
    # pair into its own column.
    wide = combined.pivot_table(
        index='trade_date',
        columns='ts_code',
        values=['daily_return', 'RSI', 'MACD', 'Signal_line',
                'MACD_hist', 'up_ratio_20d', 'volume_change_rate', 'volatility',
                'amount_change_rate', 'amount_mean'],
        aggfunc='last'
    )

    # Flatten the (indicator, ts_code) MultiIndex into '<ts_code>_<indicator>'.
    wide.columns = [f"{code}_{indicator}" for indicator, code in wide.columns]
    return wide.reset_index()
|
||||
Reference in New Issue
Block a user