新环境
This commit is contained in:
12
.gitignore
vendored
12
.gitignore
vendored
@@ -1,6 +1,6 @@
|
||||
/data/
|
||||
/data-copy/
|
||||
**/.ipynb_checkpoints/
|
||||
/.virtual_documents/
|
||||
/stocks_list.csv
|
||||
/.idea/NewStock.iml
|
||||
/data/
|
||||
/data-copy/
|
||||
**/.ipynb_checkpoints/
|
||||
/.virtual_documents/
|
||||
/stocks_list.csv
|
||||
/.idea/NewStock.iml
|
||||
|
||||
22
.idea/.gitignore
generated
vendored
22
.idea/.gitignore
generated
vendored
@@ -1,12 +1,12 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
|
||||
.ipynb_checkpoints
|
||||
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
|
||||
.ipynb_checkpoints
|
||||
|
||||
../data/
|
||||
12
.idea/misc.xml
generated
12
.idea/misc.xml
generated
@@ -1,7 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="E:\Python\anaconda\envs\try_trader" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="E:\Python\anaconda\envs\new_trader" project-jdk-type="Python SDK" />
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="E:\Python\anaconda\envs\try_trader" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="E:\Python\anaconda\envs\new_trader" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
14
.idea/modules.xml
generated
14
.idea/modules.xml
generated
@@ -1,8 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/NewStock.iml" filepath="$PROJECT_DIR$/.idea/NewStock.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/NewStock.iml" filepath="$PROJECT_DIR$/.idea/NewStock.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
10
.idea/vcs.xml
generated
10
.idea/vcs.xml
generated
@@ -1,6 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
30
.vscode/launch.json
vendored
30
.vscode/launch.json
vendored
@@ -1,16 +1,16 @@
|
||||
{
|
||||
// 使用 IntelliSense 了解相关属性。
|
||||
// 悬停以查看现有属性的描述。
|
||||
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Python 调试程序: 当前文件",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${file}",
|
||||
"console": "integratedTerminal",
|
||||
"cwd": "${fileDirname}"
|
||||
}
|
||||
]
|
||||
{
|
||||
// 使用 IntelliSense 了解相关属性。
|
||||
// 悬停以查看现有属性的描述。
|
||||
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Python 调试程序: 当前文件",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${file}",
|
||||
"console": "integratedTerminal",
|
||||
"cwd": "${fileDirname}"
|
||||
}
|
||||
]
|
||||
}
|
||||
12
.vscode/settings.json
vendored
12
.vscode/settings.json
vendored
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"terminal.integrated.env.windows": {
|
||||
"PYTHONPATH": "${workspaceFolder};${env:PYTHONPATH}"
|
||||
},
|
||||
"jupyter.notebookFileRoot": "${fileDirname}",
|
||||
"python.dataScience.notebookFileRoot": "${workspaceFolder}"
|
||||
{
|
||||
"terminal.integrated.env.windows": {
|
||||
"PYTHONPATH": "${workspaceFolder};${env:PYTHONPATH}"
|
||||
},
|
||||
"jupyter.notebookFileRoot": "${fileDirname}",
|
||||
"python.dataScience.notebookFileRoot": "${workspaceFolder}"
|
||||
}
|
||||
BIN
main/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
main/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
main/factor/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
main/factor/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
main/factor/__pycache__/factor.cpython-313.pyc
Normal file
BIN
main/factor/__pycache__/factor.cpython-313.pyc
Normal file
Binary file not shown.
@@ -1,63 +1,63 @@
|
||||
序号 因子名称 (Factor Name / Column Name) 因子类别 (Factor Category) 简要说明
|
||||
1 pe_ttm 价值类因子 (Value) 市盈率 TTM
|
||||
2 return_5, return_20 动量类因子 (Momentum) 过去5日/20日收益率
|
||||
3 act_factor1 to act_factor4 动量类 / 技术类因子 (Momentum / Technical) 基于不同周期EMA斜率计算的动量/趋势因子
|
||||
4 std_return_5, std_return_90, std_return_90_2 波动率类因子 (Volatility) 不同窗口期或延迟窗口期的滚动收益率标准差
|
||||
5 upside_vol, downside_vol 波动率类因子 (Volatility) N日滚动上/下行波动率
|
||||
6 vol_ratio 波动率类因子 (Volatility) 上行波动率 / 下行波动率
|
||||
7 std_return_5 / std_return_90 波动率类因子 (Volatility) 短期波动率 / 长期波动率 比率
|
||||
8 std_return_90 - std_return_90_2 波动率类因子 (Volatility) 长期波动率与其10日前值的差值(波动变化)
|
||||
9 volatility (来自指数计算) 波动率类 / 市场因子 (Volatility / Market) 指数(或个股)的20日滚动收益率标准差
|
||||
10 log(circ_mv) (或 log_circ_mv) 市值类因子 (Size) 流通市值的对数值
|
||||
11 cs_rank_size 市值类因子 (Size) 对数流通市值的截面排序
|
||||
12 vol 流动性类因子 (Liquidity) 成交量 (通常需要与其他指标结合或处理)
|
||||
13 turnover_rate 流动性类因子 (Liquidity) 换手率
|
||||
14 volume_ratio 流动性类因子 (Liquidity) 量比
|
||||
15 turnover_deviation 流动性类因子 (Liquidity) 换手率与其3日滚动均值的标准差倍数偏离
|
||||
16 cat_turnover_spike 流动性类 / 分类因子 (Liquidity / Categorical) 换手率是否显著高于近期均值
|
||||
17 volume_change_rate 流动性类因子 (Liquidity) 短期滚动成交量均值 / 长期滚动成交量均值 - 1
|
||||
18 cat_volume_breakout 流动性类 / 分类因子 (Liquidity / Categorical) 当日成交量是否大于过去5日最大成交量
|
||||
19 avg_volume_ratio 流动性类因子 (Liquidity) 3日滚动量比均值
|
||||
20 cat_volume_ratio_breakout 流动性类 / 分类因子 (Liquidity / Categorical) 当日量比是否大于过去5日最大量比
|
||||
21 vol_spike (Rolling Mean Vol) 流动性类因子 (Liquidity) 20日滚动成交量均值
|
||||
22 vol_std_5 流动性类 / 波动率因子 (Liquidity / Volatility) 成交量日变化率的5日滚动标准差
|
||||
23 volume_growth 流动性类因子 (Liquidity) 20日成交量变化率
|
||||
24 turnover_std 流动性类 / 波动率因子 (Liquidity / Volatility) 换手率的20日滚动标准差
|
||||
25 flow_lg_elg_intensity 资金流 / 流动性类因子 (Money Flow / Liquidity) (大单+超大单)净买入量 / 总成交量
|
||||
26 flow_divergence_diff, flow_divergence_ratio 资金流 / 情绪类因子 (Money Flow / Sentiment) 散户与主力资金流的差异或比率
|
||||
27 lg_elg_buy_prop 资金流 / 流动性类因子 (Money Flow / Liquidity) (大单+超大单)买入量 / 总买入量
|
||||
28 flow_struct_buy_change 资金流 / 流动性类因子 (Money Flow / Liquidity) 主力买入占比的日变化
|
||||
29 flow_lg_elg_accel 资金流 / 动量类因子 (Money Flow / Momentum) 主力资金流加速度
|
||||
30 active_buy_volume_large/big/small 资金流 / 流动性类因子 (Money Flow / Liquidity) 不同规模主动买入量 / 净流入量
|
||||
31 buy_lg/elg_vol_minus_sell_lg/elg_vol 资金流 / 流动性类因子 (Money Flow / Liquidity) 不同规模净买入量 / 总净流入量
|
||||
32 cs_rank_net_lg_flow_val, cs_rank_elg_buy_ratio, cs_rank_lg_sm_flow_diverge, cs_rank_elg_buy_sell_sm_ratio 资金流 / 复合因子 (截面排序) 各种资金流指标的截面排序
|
||||
33 cs_rank_ind_adj_lg_flow 资金流 / 复合因子 (行业调整+截面排序) 行业调整后的大单净流入截面排序
|
||||
34 chip_concentration_range, chip_skewness, cost_support_15pct_change, weight_roc5, cost_stability, ctrl_strength, low_cost_dev, asymmetry, cost_conc_std_N, profit_pressure, underwater_resistance, cs_rank_rel_profit_margin, cs_rank_cost_breadth, cs_rank_dist_to_upper_cost 定位类因子 (Positioning) / 技术类 基于持仓成本分布 (cost_*, weight_avg) 计算的各种指标及其截面排序
|
||||
35 winner_rate, cs_rank_winner_rate 定位类因子 (Positioning) / 技术类 获利盘比例及其截面排序
|
||||
36 floating_chip_proxy, price_cost_divergence, high_cost_break_days, liquidity_risk, lock_factor, cost_atr_adj, smallcap_concentration, cat_golden_resonance 定位类因子 (Positioning) / 复合因子 结合持仓成本与其他信息(价格、成交、波动率、市值)的复合指标
|
||||
37 cat_winner_price_zone 定位类 / 分类因子 (Positioning / Categorical) 基于成本和获利盘划分的区域类别
|
||||
38 flow_chip_consistency, profit_taking_vs_absorb, vol_amp_loss, vol_drop_profit_cnt, cost_break_confirm_cnt, vol_wgt_hist_pos, cs_rank_vol_x_profit_margin, cs_rank_cost_dist_vol_ratio 定位类因子 (Positioning) / 复合因子 进一步结合定位、资金流、量价的复杂交互因子
|
||||
39 return_skew, return_kurtosis 技术类 / 统计特征 (Technical / Stats) 滚动收益率的偏度与峰度
|
||||
40 rsi_3 技术类 / 动量类因子 (Technical / Momentum) 3日相对强弱指数
|
||||
41 obv, maobv_6, obv-maobv_6 技术类 / 量价因子 (Technical / Volume) 能量潮及其均线、差离
|
||||
42 atr_14, atr_6 技术类 / 波动率类因子 (Technical / Volatility) 平均真实波幅
|
||||
43 log_close 技术类 / 量价因子 (Technical / Price) 收盘价对数
|
||||
44 up, down 技术类 / 量价因子 (Technical / Price Action) 标准化上影线、下影线长度
|
||||
45 alpha_22_improved, alpha_003, alpha_007, alpha_013 技术类 / Alpha因子 (Technical / Alpha) WorldQuant Alpha 因子实现
|
||||
46 atr_norm_channel_pos 技术类 / 量价因子 (Technical / Price Action) ATR 标准化的价格通道位置
|
||||
47 turnover_diff_skew 技术类 / 流动性类 (Technical / Liquidity) 换手率变化率的偏度
|
||||
48 pullback_strong_N_M 技术类 / 动量类因子 (Technical / Momentum) 近期强势股的回调幅度
|
||||
49 vol_adj_roc 技术类 / 复合因子 (动量+波动率) 波动率调整后的 N 日变化率
|
||||
50 ar, br, arbr 情绪类 / 技术类因子 (Sentiment / Technical) ARBR 人气意愿指标
|
||||
51 up_ratio_20d (来自指数计算) 情绪类 / 市场因子 (Sentiment / Market) 指数(或个股)过去20天上涨天数比例
|
||||
52 cat_up_limit, cat_down_limit, up_limit_count_10d, down_limit_count_10d, consecutive_up_limit 事件驱动 / 市场状态因子 (Event / Market State) 涨跌停相关状态和计数
|
||||
53 momentum_factor, resonance_factor 复合因子 (量价) (Composite - P/V) 基于量、价、换手率等的简单复合
|
||||
54 cat_af2, cat_af3, cat_af4 复合因子 / 分类因子 (Composite / Cat.) act_factor 之间的比较
|
||||
55 act_factor5, act_factor6 复合因子 (技术类) (Composite - Technical) act_factor 1-4 的组合
|
||||
56 mv_volatility, mv_growth, mv_turnover_ratio, mv_adjusted_volume, mv_weighted_turnover, nonlinear_mv_volume, mv_volume_ratio, mv_momentum 复合因子 (市值+流动性/量价) 考虑了市值影响的量价、流动性或动量指标
|
||||
57 cap_neutral_cost_metric (占位符) 复合因子 / Alpha因子 (占位符) 市值行业中性化的成本指标(需实现)
|
||||
58 hurst_exponent_flow (占位符) 资金流 / 统计因子 (占位符) 资金流的 Hurst 指数(需实现)
|
||||
59 intraday_lg_flow_corr_N (占位符) 复合因子 (价格行为+资金流) (占位符) 日内趋势与大单流相关性(需实现)
|
||||
60 industry_* (来自 industry_df) 行业因子 (Industry) 对应行业的各种指标(如行业收益率、行业动量等)
|
||||
61 *_deviation (来自 create_deviation_within_dates) 复合因子 (相对行业) 个股因子相对于行业均值的偏离
|
||||
序号 因子名称 (Factor Name / Column Name) 因子类别 (Factor Category) 简要说明
|
||||
1 pe_ttm 价值类因子 (Value) 市盈率 TTM
|
||||
2 return_5, return_20 动量类因子 (Momentum) 过去5日/20日收益率
|
||||
3 act_factor1 to act_factor4 动量类 / 技术类因子 (Momentum / Technical) 基于不同周期EMA斜率计算的动量/趋势因子
|
||||
4 std_return_5, std_return_90, std_return_90_2 波动率类因子 (Volatility) 不同窗口期或延迟窗口期的滚动收益率标准差
|
||||
5 upside_vol, downside_vol 波动率类因子 (Volatility) N日滚动上/下行波动率
|
||||
6 vol_ratio 波动率类因子 (Volatility) 上行波动率 / 下行波动率
|
||||
7 std_return_5 / std_return_90 波动率类因子 (Volatility) 短期波动率 / 长期波动率 比率
|
||||
8 std_return_90 - std_return_90_2 波动率类因子 (Volatility) 长期波动率与其10日前值的差值(波动变化)
|
||||
9 volatility (来自指数计算) 波动率类 / 市场因子 (Volatility / Market) 指数(或个股)的20日滚动收益率标准差
|
||||
10 log(circ_mv) (或 log_circ_mv) 市值类因子 (Size) 流通市值的对数值
|
||||
11 cs_rank_size 市值类因子 (Size) 对数流通市值的截面排序
|
||||
12 vol 流动性类因子 (Liquidity) 成交量 (通常需要与其他指标结合或处理)
|
||||
13 turnover_rate 流动性类因子 (Liquidity) 换手率
|
||||
14 volume_ratio 流动性类因子 (Liquidity) 量比
|
||||
15 turnover_deviation 流动性类因子 (Liquidity) 换手率与其3日滚动均值的标准差倍数偏离
|
||||
16 cat_turnover_spike 流动性类 / 分类因子 (Liquidity / Categorical) 换手率是否显著高于近期均值
|
||||
17 volume_change_rate 流动性类因子 (Liquidity) 短期滚动成交量均值 / 长期滚动成交量均值 - 1
|
||||
18 cat_volume_breakout 流动性类 / 分类因子 (Liquidity / Categorical) 当日成交量是否大于过去5日最大成交量
|
||||
19 avg_volume_ratio 流动性类因子 (Liquidity) 3日滚动量比均值
|
||||
20 cat_volume_ratio_breakout 流动性类 / 分类因子 (Liquidity / Categorical) 当日量比是否大于过去5日最大量比
|
||||
21 vol_spike (Rolling Mean Vol) 流动性类因子 (Liquidity) 20日滚动成交量均值
|
||||
22 vol_std_5 流动性类 / 波动率因子 (Liquidity / Volatility) 成交量日变化率的5日滚动标准差
|
||||
23 volume_growth 流动性类因子 (Liquidity) 20日成交量变化率
|
||||
24 turnover_std 流动性类 / 波动率因子 (Liquidity / Volatility) 换手率的20日滚动标准差
|
||||
25 flow_lg_elg_intensity 资金流 / 流动性类因子 (Money Flow / Liquidity) (大单+超大单)净买入量 / 总成交量
|
||||
26 flow_divergence_diff, flow_divergence_ratio 资金流 / 情绪类因子 (Money Flow / Sentiment) 散户与主力资金流的差异或比率
|
||||
27 lg_elg_buy_prop 资金流 / 流动性类因子 (Money Flow / Liquidity) (大单+超大单)买入量 / 总买入量
|
||||
28 flow_struct_buy_change 资金流 / 流动性类因子 (Money Flow / Liquidity) 主力买入占比的日变化
|
||||
29 flow_lg_elg_accel 资金流 / 动量类因子 (Money Flow / Momentum) 主力资金流加速度
|
||||
30 active_buy_volume_large/big/small 资金流 / 流动性类因子 (Money Flow / Liquidity) 不同规模主动买入量 / 净流入量
|
||||
31 buy_lg/elg_vol_minus_sell_lg/elg_vol 资金流 / 流动性类因子 (Money Flow / Liquidity) 不同规模净买入量 / 总净流入量
|
||||
32 cs_rank_net_lg_flow_val, cs_rank_elg_buy_ratio, cs_rank_lg_sm_flow_diverge, cs_rank_elg_buy_sell_sm_ratio 资金流 / 复合因子 (截面排序) 各种资金流指标的截面排序
|
||||
33 cs_rank_ind_adj_lg_flow 资金流 / 复合因子 (行业调整+截面排序) 行业调整后的大单净流入截面排序
|
||||
34 chip_concentration_range, chip_skewness, cost_support_15pct_change, weight_roc5, cost_stability, ctrl_strength, low_cost_dev, asymmetry, cost_conc_std_N, profit_pressure, underwater_resistance, cs_rank_rel_profit_margin, cs_rank_cost_breadth, cs_rank_dist_to_upper_cost 定位类因子 (Positioning) / 技术类 基于持仓成本分布 (cost_*, weight_avg) 计算的各种指标及其截面排序
|
||||
35 winner_rate, cs_rank_winner_rate 定位类因子 (Positioning) / 技术类 获利盘比例及其截面排序
|
||||
36 floating_chip_proxy, price_cost_divergence, high_cost_break_days, liquidity_risk, lock_factor, cost_atr_adj, smallcap_concentration, cat_golden_resonance 定位类因子 (Positioning) / 复合因子 结合持仓成本与其他信息(价格、成交、波动率、市值)的复合指标
|
||||
37 cat_winner_price_zone 定位类 / 分类因子 (Positioning / Categorical) 基于成本和获利盘划分的区域类别
|
||||
38 flow_chip_consistency, profit_taking_vs_absorb, vol_amp_loss, vol_drop_profit_cnt, cost_break_confirm_cnt, vol_wgt_hist_pos, cs_rank_vol_x_profit_margin, cs_rank_cost_dist_vol_ratio 定位类因子 (Positioning) / 复合因子 进一步结合定位、资金流、量价的复杂交互因子
|
||||
39 return_skew, return_kurtosis 技术类 / 统计特征 (Technical / Stats) 滚动收益率的偏度与峰度
|
||||
40 rsi_3 技术类 / 动量类因子 (Technical / Momentum) 3日相对强弱指数
|
||||
41 obv, maobv_6, obv-maobv_6 技术类 / 量价因子 (Technical / Volume) 能量潮及其均线、差离
|
||||
42 atr_14, atr_6 技术类 / 波动率类因子 (Technical / Volatility) 平均真实波幅
|
||||
43 log_close 技术类 / 量价因子 (Technical / Price) 收盘价对数
|
||||
44 up, down 技术类 / 量价因子 (Technical / Price Action) 标准化上影线、下影线长度
|
||||
45 alpha_22_improved, alpha_003, alpha_007, alpha_013 技术类 / Alpha因子 (Technical / Alpha) WorldQuant Alpha 因子实现
|
||||
46 atr_norm_channel_pos 技术类 / 量价因子 (Technical / Price Action) ATR 标准化的价格通道位置
|
||||
47 turnover_diff_skew 技术类 / 流动性类 (Technical / Liquidity) 换手率变化率的偏度
|
||||
48 pullback_strong_N_M 技术类 / 动量类因子 (Technical / Momentum) 近期强势股的回调幅度
|
||||
49 vol_adj_roc 技术类 / 复合因子 (动量+波动率) 波动率调整后的 N 日变化率
|
||||
50 ar, br, arbr 情绪类 / 技术类因子 (Sentiment / Technical) ARBR 人气意愿指标
|
||||
51 up_ratio_20d (来自指数计算) 情绪类 / 市场因子 (Sentiment / Market) 指数(或个股)过去20天上涨天数比例
|
||||
52 cat_up_limit, cat_down_limit, up_limit_count_10d, down_limit_count_10d, consecutive_up_limit 事件驱动 / 市场状态因子 (Event / Market State) 涨跌停相关状态和计数
|
||||
53 momentum_factor, resonance_factor 复合因子 (量价) (Composite - P/V) 基于量、价、换手率等的简单复合
|
||||
54 cat_af2, cat_af3, cat_af4 复合因子 / 分类因子 (Composite / Cat.) act_factor 之间的比较
|
||||
55 act_factor5, act_factor6 复合因子 (技术类) (Composite - Technical) act_factor 1-4 的组合
|
||||
56 mv_volatility, mv_growth, mv_turnover_ratio, mv_adjusted_volume, mv_weighted_turnover, nonlinear_mv_volume, mv_volume_ratio, mv_momentum 复合因子 (市值+流动性/量价) 考虑了市值影响的量价、流动性或动量指标
|
||||
57 cap_neutral_cost_metric (占位符) 复合因子 / Alpha因子 (占位符) 市值行业中性化的成本指标(需实现)
|
||||
58 hurst_exponent_flow (占位符) 资金流 / 统计因子 (占位符) 资金流的 Hurst 指数(需实现)
|
||||
59 intraday_lg_flow_corr_N (占位符) 复合因子 (价格行为+资金流) (占位符) 日内趋势与大单流相关性(需实现)
|
||||
60 industry_* (来自 industry_df) 行业因子 (Industry) 对应行业的各种指标(如行业收益率、行业动量等)
|
||||
61 *_deviation (来自 create_deviation_within_dates) 复合因子 (相对行业) 个股因子相对于行业均值的偏离
|
||||
62 complex_factor_gplearn_1 复合因子 (GP生成) DEAP/GP 找到的因子表达式 1
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,63 +1,63 @@
|
||||
序号 因子名称 (Factor Name / Column Name) 因子类别 (Factor Category) 简要说明
|
||||
1 pe_ttm 价值类因子 (Value) 市盈率 TTM
|
||||
2 return_5, return_20 动量类因子 (Momentum) 过去5日/20日收益率
|
||||
3 act_factor1 to act_factor4 动量类 / 技术类因子 (Momentum / Technical) 基于不同周期EMA斜率计算的动量/趋势因子
|
||||
4 std_return_5, std_return_90, std_return_90_2 波动率类因子 (Volatility) 不同窗口期或延迟窗口期的滚动收益率标准差
|
||||
5 upside_vol, downside_vol 波动率类因子 (Volatility) N日滚动上/下行波动率
|
||||
6 vol_ratio 波动率类因子 (Volatility) 上行波动率 / 下行波动率
|
||||
7 std_return_5 / std_return_90 波动率类因子 (Volatility) 短期波动率 / 长期波动率 比率
|
||||
8 std_return_90 - std_return_90_2 波动率类因子 (Volatility) 长期波动率与其10日前值的差值(波动变化)
|
||||
9 volatility (来自指数计算) 波动率类 / 市场因子 (Volatility / Market) 指数(或个股)的20日滚动收益率标准差
|
||||
10 log(circ_mv) (或 log_circ_mv) 市值类因子 (Size) 流通市值的对数值
|
||||
11 cs_rank_size 市值类因子 (Size) 对数流通市值的截面排序
|
||||
12 vol 流动性类因子 (Liquidity) 成交量 (通常需要与其他指标结合或处理)
|
||||
13 turnover_rate 流动性类因子 (Liquidity) 换手率
|
||||
14 volume_ratio 流动性类因子 (Liquidity) 量比
|
||||
15 turnover_deviation 流动性类因子 (Liquidity) 换手率与其3日滚动均值的标准差倍数偏离
|
||||
16 cat_turnover_spike 流动性类 / 分类因子 (Liquidity / Categorical) 换手率是否显著高于近期均值
|
||||
17 volume_change_rate 流动性类因子 (Liquidity) 短期滚动成交量均值 / 长期滚动成交量均值 - 1
|
||||
18 cat_volume_breakout 流动性类 / 分类因子 (Liquidity / Categorical) 当日成交量是否大于过去5日最大成交量
|
||||
19 avg_volume_ratio 流动性类因子 (Liquidity) 3日滚动量比均值
|
||||
20 cat_volume_ratio_breakout 流动性类 / 分类因子 (Liquidity / Categorical) 当日量比是否大于过去5日最大量比
|
||||
21 vol_spike (Rolling Mean Vol) 流动性类因子 (Liquidity) 20日滚动成交量均值
|
||||
22 vol_std_5 流动性类 / 波动率因子 (Liquidity / Volatility) 成交量日变化率的5日滚动标准差
|
||||
23 volume_growth 流动性类因子 (Liquidity) 20日成交量变化率
|
||||
24 turnover_std 流动性类 / 波动率因子 (Liquidity / Volatility) 换手率的20日滚动标准差
|
||||
25 flow_lg_elg_intensity 资金流 / 流动性类因子 (Money Flow / Liquidity) (大单+超大单)净买入量 / 总成交量
|
||||
26 flow_divergence_diff, flow_divergence_ratio 资金流 / 情绪类因子 (Money Flow / Sentiment) 散户与主力资金流的差异或比率
|
||||
27 lg_elg_buy_prop 资金流 / 流动性类因子 (Money Flow / Liquidity) (大单+超大单)买入量 / 总买入量
|
||||
28 flow_struct_buy_change 资金流 / 流动性类因子 (Money Flow / Liquidity) 主力买入占比的日变化
|
||||
29 flow_lg_elg_accel 资金流 / 动量类因子 (Money Flow / Momentum) 主力资金流加速度
|
||||
30 active_buy_volume_large/big/small 资金流 / 流动性类因子 (Money Flow / Liquidity) 不同规模主动买入量 / 净流入量
|
||||
31 buy_lg/elg_vol_minus_sell_lg/elg_vol 资金流 / 流动性类因子 (Money Flow / Liquidity) 不同规模净买入量 / 总净流入量
|
||||
32 cs_rank_net_lg_flow_val, cs_rank_elg_buy_ratio, cs_rank_lg_sm_flow_diverge, cs_rank_elg_buy_sell_sm_ratio 资金流 / 复合因子 (截面排序) 各种资金流指标的截面排序
|
||||
33 cs_rank_ind_adj_lg_flow 资金流 / 复合因子 (行业调整+截面排序) 行业调整后的大单净流入截面排序
|
||||
34 chip_concentration_range, chip_skewness, cost_support_15pct_change, weight_roc5, cost_stability, ctrl_strength, low_cost_dev, asymmetry, cost_conc_std_N, profit_pressure, underwater_resistance, cs_rank_rel_profit_margin, cs_rank_cost_breadth, cs_rank_dist_to_upper_cost 定位类因子 (Positioning) / 技术类 基于持仓成本分布 (cost_*, weight_avg) 计算的各种指标及其截面排序
|
||||
35 winner_rate, cs_rank_winner_rate 定位类因子 (Positioning) / 技术类 获利盘比例及其截面排序
|
||||
36 floating_chip_proxy, price_cost_divergence, high_cost_break_days, liquidity_risk, lock_factor, cost_atr_adj, smallcap_concentration, cat_golden_resonance 定位类因子 (Positioning) / 复合因子 结合持仓成本与其他信息(价格、成交、波动率、市值)的复合指标
|
||||
37 cat_winner_price_zone 定位类 / 分类因子 (Positioning / Categorical) 基于成本和获利盘划分的区域类别
|
||||
38 flow_chip_consistency, profit_taking_vs_absorb, vol_amp_loss, vol_drop_profit_cnt, cost_break_confirm_cnt, vol_wgt_hist_pos, cs_rank_vol_x_profit_margin, cs_rank_cost_dist_vol_ratio 定位类因子 (Positioning) / 复合因子 进一步结合定位、资金流、量价的复杂交互因子
|
||||
39 return_skew, return_kurtosis 技术类 / 统计特征 (Technical / Stats) 滚动收益率的偏度与峰度
|
||||
40 rsi_3 技术类 / 动量类因子 (Technical / Momentum) 3日相对强弱指数
|
||||
41 obv, maobv_6, obv-maobv_6 技术类 / 量价因子 (Technical / Volume) 能量潮及其均线、差离
|
||||
42 atr_14, atr_6 技术类 / 波动率类因子 (Technical / Volatility) 平均真实波幅
|
||||
43 log_close 技术类 / 量价因子 (Technical / Price) 收盘价对数
|
||||
44 up, down 技术类 / 量价因子 (Technical / Price Action) 标准化上影线、下影线长度
|
||||
45 alpha_22_improved, alpha_003, alpha_007, alpha_013 技术类 / Alpha因子 (Technical / Alpha) WorldQuant Alpha 因子实现
|
||||
46 atr_norm_channel_pos 技术类 / 量价因子 (Technical / Price Action) ATR 标准化的价格通道位置
|
||||
47 turnover_diff_skew 技术类 / 流动性类 (Technical / Liquidity) 换手率变化率的偏度
|
||||
48 pullback_strong_N_M 技术类 / 动量类因子 (Technical / Momentum) 近期强势股的回调幅度
|
||||
49 vol_adj_roc 技术类 / 复合因子 (动量+波动率) 波动率调整后的 N 日变化率
|
||||
50 ar, br, arbr 情绪类 / 技术类因子 (Sentiment / Technical) ARBR 人气意愿指标
|
||||
51 up_ratio_20d (来自指数计算) 情绪类 / 市场因子 (Sentiment / Market) 指数(或个股)过去20天上涨天数比例
|
||||
52 cat_up_limit, cat_down_limit, up_limit_count_10d, down_limit_count_10d, consecutive_up_limit 事件驱动 / 市场状态因子 (Event / Market State) 涨跌停相关状态和计数
|
||||
53 momentum_factor, resonance_factor 复合因子 (量价) (Composite - P/V) 基于量、价、换手率等的简单复合
|
||||
54 cat_af2, cat_af3, cat_af4 复合因子 / 分类因子 (Composite / Cat.) act_factor 之间的比较
|
||||
55 act_factor5, act_factor6 复合因子 (技术类) (Composite - Technical) act_factor 1-4 的组合
|
||||
56 mv_volatility, mv_growth, mv_turnover_ratio, mv_adjusted_volume, mv_weighted_turnover, nonlinear_mv_volume, mv_volume_ratio, mv_momentum 复合因子 (市值+流动性/量价) 考虑了市值影响的量价、流动性或动量指标
|
||||
57 cap_neutral_cost_metric (占位符) 复合因子 / Alpha因子 (占位符) 市值行业中性化的成本指标(需实现)
|
||||
58 hurst_exponent_flow (占位符) 资金流 / 统计因子 (占位符) 资金流的 Hurst 指数(需实现)
|
||||
59 intraday_lg_flow_corr_N (占位符) 复合因子 (价格行为+资金流) (占位符) 日内趋势与大单流相关性(需实现)
|
||||
60 industry_* (来自 industry_df) 行业因子 (Industry) 对应行业的各种指标(如行业收益率、行业动量等)
|
||||
61 *_deviation (来自 create_deviation_within_dates) 复合因子 (相对行业) 个股因子相对于行业均值的偏离
|
||||
序号 因子名称 (Factor Name / Column Name) 因子类别 (Factor Category) 简要说明
|
||||
1 pe_ttm 价值类因子 (Value) 市盈率 TTM
|
||||
2 return_5, return_20 动量类因子 (Momentum) 过去5日/20日收益率
|
||||
3 act_factor1 to act_factor4 动量类 / 技术类因子 (Momentum / Technical) 基于不同周期EMA斜率计算的动量/趋势因子
|
||||
4 std_return_5, std_return_90, std_return_90_2 波动率类因子 (Volatility) 不同窗口期或延迟窗口期的滚动收益率标准差
|
||||
5 upside_vol, downside_vol 波动率类因子 (Volatility) N日滚动上/下行波动率
|
||||
6 vol_ratio 波动率类因子 (Volatility) 上行波动率 / 下行波动率
|
||||
7 std_return_5 / std_return_90 波动率类因子 (Volatility) 短期波动率 / 长期波动率 比率
|
||||
8 std_return_90 - std_return_90_2 波动率类因子 (Volatility) 长期波动率与其10日前值的差值(波动变化)
|
||||
9 volatility (来自指数计算) 波动率类 / 市场因子 (Volatility / Market) 指数(或个股)的20日滚动收益率标准差
|
||||
10 log(circ_mv) (或 log_circ_mv) 市值类因子 (Size) 流通市值的对数值
|
||||
11 cs_rank_size 市值类因子 (Size) 对数流通市值的截面排序
|
||||
12 vol 流动性类因子 (Liquidity) 成交量 (通常需要与其他指标结合或处理)
|
||||
13 turnover_rate 流动性类因子 (Liquidity) 换手率
|
||||
14 volume_ratio 流动性类因子 (Liquidity) 量比
|
||||
15 turnover_deviation 流动性类因子 (Liquidity) 换手率与其3日滚动均值的标准差倍数偏离
|
||||
16 cat_turnover_spike 流动性类 / 分类因子 (Liquidity / Categorical) 换手率是否显著高于近期均值
|
||||
17 volume_change_rate 流动性类因子 (Liquidity) 短期滚动成交量均值 / 长期滚动成交量均值 - 1
|
||||
18 cat_volume_breakout 流动性类 / 分类因子 (Liquidity / Categorical) 当日成交量是否大于过去5日最大成交量
|
||||
19 avg_volume_ratio 流动性类因子 (Liquidity) 3日滚动量比均值
|
||||
20 cat_volume_ratio_breakout 流动性类 / 分类因子 (Liquidity / Categorical) 当日量比是否大于过去5日最大量比
|
||||
21 vol_spike (Rolling Mean Vol) 流动性类因子 (Liquidity) 20日滚动成交量均值
|
||||
22 vol_std_5 流动性类 / 波动率因子 (Liquidity / Volatility) 成交量日变化率的5日滚动标准差
|
||||
23 volume_growth 流动性类因子 (Liquidity) 20日成交量变化率
|
||||
24 turnover_std 流动性类 / 波动率因子 (Liquidity / Volatility) 换手率的20日滚动标准差
|
||||
25 flow_lg_elg_intensity 资金流 / 流动性类因子 (Money Flow / Liquidity) (大单+超大单)净买入量 / 总成交量
|
||||
26 flow_divergence_diff, flow_divergence_ratio 资金流 / 情绪类因子 (Money Flow / Sentiment) 散户与主力资金流的差异或比率
|
||||
27 lg_elg_buy_prop 资金流 / 流动性类因子 (Money Flow / Liquidity) (大单+超大单)买入量 / 总买入量
|
||||
28 flow_struct_buy_change 资金流 / 流动性类因子 (Money Flow / Liquidity) 主力买入占比的日变化
|
||||
29 flow_lg_elg_accel 资金流 / 动量类因子 (Money Flow / Momentum) 主力资金流加速度
|
||||
30 active_buy_volume_large/big/small 资金流 / 流动性类因子 (Money Flow / Liquidity) 不同规模主动买入量 / 净流入量
|
||||
31 buy_lg/elg_vol_minus_sell_lg/elg_vol 资金流 / 流动性类因子 (Money Flow / Liquidity) 不同规模净买入量 / 总净流入量
|
||||
32 cs_rank_net_lg_flow_val, cs_rank_elg_buy_ratio, cs_rank_lg_sm_flow_diverge, cs_rank_elg_buy_sell_sm_ratio 资金流 / 复合因子 (截面排序) 各种资金流指标的截面排序
|
||||
33 cs_rank_ind_adj_lg_flow 资金流 / 复合因子 (行业调整+截面排序) 行业调整后的大单净流入截面排序
|
||||
34 chip_concentration_range, chip_skewness, cost_support_15pct_change, weight_roc5, cost_stability, ctrl_strength, low_cost_dev, asymmetry, cost_conc_std_N, profit_pressure, underwater_resistance, cs_rank_rel_profit_margin, cs_rank_cost_breadth, cs_rank_dist_to_upper_cost 定位类因子 (Positioning) / 技术类 基于持仓成本分布 (cost_*, weight_avg) 计算的各种指标及其截面排序
|
||||
35 winner_rate, cs_rank_winner_rate 定位类因子 (Positioning) / 技术类 获利盘比例及其截面排序
|
||||
36 floating_chip_proxy, price_cost_divergence, high_cost_break_days, liquidity_risk, lock_factor, cost_atr_adj, smallcap_concentration, cat_golden_resonance 定位类因子 (Positioning) / 复合因子 结合持仓成本与其他信息(价格、成交、波动率、市值)的复合指标
|
||||
37 cat_winner_price_zone 定位类 / 分类因子 (Positioning / Categorical) 基于成本和获利盘划分的区域类别
|
||||
38 flow_chip_consistency, profit_taking_vs_absorb, vol_amp_loss, vol_drop_profit_cnt, cost_break_confirm_cnt, vol_wgt_hist_pos, cs_rank_vol_x_profit_margin, cs_rank_cost_dist_vol_ratio 定位类因子 (Positioning) / 复合因子 进一步结合定位、资金流、量价的复杂交互因子
|
||||
39 return_skew, return_kurtosis 技术类 / 统计特征 (Technical / Stats) 滚动收益率的偏度与峰度
|
||||
40 rsi_3 技术类 / 动量类因子 (Technical / Momentum) 3日相对强弱指数
|
||||
41 obv, maobv_6, obv-maobv_6 技术类 / 量价因子 (Technical / Volume) 能量潮及其均线、差离
|
||||
42 atr_14, atr_6 技术类 / 波动率类因子 (Technical / Volatility) 平均真实波幅
|
||||
43 log_close 技术类 / 量价因子 (Technical / Price) 收盘价对数
|
||||
44 up, down 技术类 / 量价因子 (Technical / Price Action) 标准化上影线、下影线长度
|
||||
45 alpha_22_improved, alpha_003, alpha_007, alpha_013 技术类 / Alpha因子 (Technical / Alpha) WorldQuant Alpha 因子实现
|
||||
46 atr_norm_channel_pos 技术类 / 量价因子 (Technical / Price Action) ATR 标准化的价格通道位置
|
||||
47 turnover_diff_skew 技术类 / 流动性类 (Technical / Liquidity) 换手率变化率的偏度
|
||||
48 pullback_strong_N_M 技术类 / 动量类因子 (Technical / Momentum) 近期强势股的回调幅度
|
||||
49 vol_adj_roc 技术类 / 复合因子 (动量+波动率) 波动率调整后的 N 日变化率
|
||||
50 ar, br, arbr 情绪类 / 技术类因子 (Sentiment / Technical) ARBR 人气意愿指标
|
||||
51 up_ratio_20d (来自指数计算) 情绪类 / 市场因子 (Sentiment / Market) 指数(或个股)过去20天上涨天数比例
|
||||
52 cat_up_limit, cat_down_limit, up_limit_count_10d, down_limit_count_10d, consecutive_up_limit 事件驱动 / 市场状态因子 (Event / Market State) 涨跌停相关状态和计数
|
||||
53 momentum_factor, resonance_factor 复合因子 (量价) (Composite - P/V) 基于量、价、换手率等的简单复合
|
||||
54 cat_af2, cat_af3, cat_af4 复合因子 / 分类因子 (Composite / Cat.) act_factor 之间的比较
|
||||
55 act_factor5, act_factor6 复合因子 (技术类) (Composite - Technical) act_factor 1-4 的组合
|
||||
56 mv_volatility, mv_growth, mv_turnover_ratio, mv_adjusted_volume, mv_weighted_turnover, nonlinear_mv_volume, mv_volume_ratio, mv_momentum 复合因子 (市值+流动性/量价) 考虑了市值影响的量价、流动性或动量指标
|
||||
57 cap_neutral_cost_metric (占位符) 复合因子 / Alpha因子 (占位符) 市值行业中性化的成本指标(需实现)
|
||||
58 hurst_exponent_flow (占位符) 资金流 / 统计因子 (占位符) 资金流的 Hurst 指数(需实现)
|
||||
59 intraday_lg_flow_corr_N (占位符) 复合因子 (价格行为+资金流) (占位符) 日内趋势与大单流相关性(需实现)
|
||||
60 industry_* (来自 industry_df) 行业因子 (Industry) 对应行业的各种指标(如行业收益率、行业动量等)
|
||||
61 *_deviation (来自 create_deviation_within_dates) 复合因子 (相对行业) 个股因子相对于行业均值的偏离
|
||||
62 complex_factor_gplearn_1 复合因子 (GP生成) DEAP/GP 找到的因子表达式 1
|
||||
@@ -1,193 +1,193 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy.stats import spearmanr # 用于因子3的原始思路,但实际简化了
|
||||
|
||||
epsilon = 1e-10
|
||||
|
||||
def _safe_divide(numerator, denominator, default_val=0.0):
|
||||
"""安全除法"""
|
||||
with np.errstate(divide='ignore', invalid='ignore'):
|
||||
result = numerator / denominator
|
||||
result[~np.isfinite(result)] = default_val
|
||||
return result
|
||||
|
||||
# --- 修改后的因子计算函数 ---
|
||||
|
||||
def calculate_size_style_strength_factor(df: pd.DataFrame, N: int = 5, factor_name_suffix: str = '') -> pd.DataFrame:
|
||||
"""
|
||||
计算大小盘风格相对强度因子。
|
||||
返回: 以 trade_date 为索引,因子值为列的 DataFrame。
|
||||
"""
|
||||
factor_name = f'size_style_strength_{N}{factor_name_suffix}'
|
||||
print(f"Calculating {factor_name}...")
|
||||
|
||||
required_indices = ['399300.SZ', '000905.SH', '000852.SH']
|
||||
if not all(idx in df['ts_code'].unique() for idx in required_indices):
|
||||
print(f"Error: DataFrame 中缺少部分必需的指数代码 ({required_indices})。返回空因子 Series。")
|
||||
return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')
|
||||
|
||||
# 1. 计算各指数N日收益率
|
||||
df_copy = df.copy() # 操作副本,避免修改原始传入df
|
||||
df_copy['_ret_N'] = df_copy.groupby('ts_code')['close'].pct_change(periods=N)
|
||||
|
||||
# 2. Pivot 以方便截面计算
|
||||
pivot_ret_N = df_copy.pivot_table(index='trade_date', columns='ts_code', values='_ret_N')
|
||||
|
||||
# 确保列存在并获取
|
||||
large_ret = pivot_ret_N.get('399300.SZ', pd.Series(np.nan, index=pivot_ret_N.index))
|
||||
mid_ret = pivot_ret_N.get('000905.SH', pd.Series(np.nan, index=pivot_ret_N.index))
|
||||
small_ret = pivot_ret_N.get('000852.SH', pd.Series(np.nan, index=pivot_ret_N.index))
|
||||
|
||||
# 3. 计算因子 (结果是每日一个标量值)
|
||||
large_small_diff = large_ret - small_ret
|
||||
avg_large_small_ret = (large_ret + small_ret) / 2
|
||||
# 计算中盘偏离因子,处理NaN,如果中盘收益为NaN,则偏离因子不起调整作用(乘以1)
|
||||
mid_deviation_raw = mid_ret - avg_large_small_ret
|
||||
mid_deviation_factor = 1 + np.sign(mid_ret.fillna(0)) * np.abs(mid_deviation_raw.fillna(0))
|
||||
|
||||
daily_factor_values = large_small_diff * mid_deviation_factor
|
||||
daily_factor_values.name = factor_name # 给 Series 命名
|
||||
|
||||
print(f"Finished {factor_name}.")
|
||||
return daily_factor_values.to_frame() # 转换为 DataFrame 返回
|
||||
|
||||
def calculate_volatility_structure_factor(df: pd.DataFrame, N: int = 10, factor_name_suffix: str = '') -> pd.DataFrame:
|
||||
"""
|
||||
计算市场波动结构因子。
|
||||
返回: 以 trade_date 为索引,因子值为列的 DataFrame。
|
||||
"""
|
||||
factor_name = f'vol_structure_idx_{N}{factor_name_suffix}'
|
||||
print(f"Calculating {factor_name}...")
|
||||
|
||||
required_indices = ['399300.SZ', '000905.SH', '000852.SH']
|
||||
if not all(idx in df['ts_code'].unique() for idx in required_indices):
|
||||
print(f"Error: DataFrame 中缺少部分必需的指数代码 ({required_indices})。返回空因子 Series。")
|
||||
return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')
|
||||
|
||||
if 'pct_chg' not in df.columns:
|
||||
print(f"Error: DataFrame 缺少 'pct_chg' 列。将为 {factor_name} 填充 NaN。")
|
||||
return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')
|
||||
|
||||
df_copy = df.copy()
|
||||
# 1. 计算各指数N日波动率
|
||||
df_copy['_vol_N'] = df_copy.groupby('ts_code')['pct_chg'].rolling(N, min_periods=max(1, N//2)).std().reset_index(level=0, drop=True)
|
||||
|
||||
# 2. Pivot
|
||||
pivot_vol_N = df_copy.pivot_table(index='trade_date', columns='ts_code', values='_vol_N')
|
||||
|
||||
large_vol = pivot_vol_N.get('399300.SZ', pd.Series(np.nan, index=pivot_vol_N.index))
|
||||
mid_vol = pivot_vol_N.get('000905.SH', pd.Series(np.nan, index=pivot_vol_N.index))
|
||||
small_vol = pivot_vol_N.get('000852.SH', pd.Series(np.nan, index=pivot_vol_N.index))
|
||||
|
||||
# 3. 计算因子
|
||||
daily_factor_values = _safe_divide((small_vol - mid_vol), large_vol)
|
||||
daily_factor_values.name = factor_name
|
||||
|
||||
print(f"Finished {factor_name}.")
|
||||
return daily_factor_values.to_frame()
|
||||
|
||||
def calculate_market_divergence_factor(df: pd.DataFrame, factor_name_suffix: str = '') -> pd.DataFrame:
    """Score daily market divergence from the sign agreement of the three
    size-index returns.

    Score per day:
        0.0  -> all available signs identical (fully aligned market)
        0.25 -> one direction plus a flat (zero) return
        0.75 -> two opposite directions
        1.0  -> up, down and flat all present
        NaN  -> fewer than two indices have data that day

    Returns:
        DataFrame indexed by trade_date with a single factor column.
    """
    factor_name = f'market_divergence_score{factor_name_suffix}'
    print(f"Calculating {factor_name}...")

    required_indices = ['399300.SZ', '000905.SH', '000852.SH']
    if not set(required_indices).issubset(df['ts_code'].unique()):
        print(f"Error: DataFrame 中缺少部分必需的指数代码 ({required_indices})。返回空因子 Series。")
        return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')

    if 'pct_chg' not in df.columns:
        print(f"Error: DataFrame 缺少 'pct_chg' 列。将为 {factor_name} 填充 NaN。")
        return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')

    # Wide table of daily returns, reindexed so all three columns exist
    # (missing ones become NaN).
    returns_wide = (
        df.pivot_table(index='trade_date', columns='ts_code', values='pct_chg')
          .reindex(columns=required_indices)
    )

    def _score(day_row):
        # Score one day's row of (up to) three index returns.
        observed = day_row.dropna()
        if len(observed) < 2:
            return np.nan  # not enough data to judge divergence
        sign_values = set(np.sign(observed).unique())
        if len(sign_values) == 1:
            return 0.0
        if len(sign_values) == 2:
            return 0.25 if 0 in sign_values else 0.75
        if len(sign_values) == 3:
            return 1.0
        return np.nan  # unreachable with three inputs

    factor_series = returns_wide.apply(_score, axis=1)
    factor_series.name = factor_name

    print(f"Finished {factor_name}.")
    return factor_series.to_frame()
|
||||
|
||||
# --- 整合所有因子计算到一个主函数 ---
|
||||
def generate_daily_index_relation_factors(df_input: pd.DataFrame) -> pd.DataFrame:
    """Compute every daily cross-sectional factor built from the
    large/mid/small index relationship.

    Args:
        df_input: long-format index quotes with 'ts_code', 'trade_date',
            'close' and 'pct_chg' columns.

    Returns:
        DataFrame indexed by 'trade_date' with one column per factor.
    """
    # Work on a sorted copy so the rolling / pct_change factor functions
    # see chronologically ordered data and the caller's frame is untouched.
    df = df_input.sort_values(['ts_code', 'trade_date']).reset_index(drop=True)

    candidate_frames = [
        calculate_size_style_strength_factor(df, N=5),
        calculate_volatility_structure_factor(df, N=10),
        calculate_market_divergence_factor(df),
        # Additional daily cross-sectional factors can be appended here.
    ]

    # Drop frames that came back empty or all-NaN (e.g. missing inputs).
    usable_frames = [
        frame for frame in candidate_frames
        if not frame.empty and not frame.iloc[:, 0].isna().all()
    ]

    if not usable_frames:
        print("警告: 未能成功计算任何因子。返回空 DataFrame。")
        # Fall back to an empty frame indexed by the input dates.
        return pd.DataFrame(index=df['trade_date'].unique()).rename_axis('trade_date')

    # Outer-merge on trade_date so every date from every factor survives.
    merged = usable_frames[0]
    for frame in usable_frames[1:]:
        merged = pd.merge(merged, frame, on='trade_date', how='outer')

    return merged.sort_index()
|
||||
|
||||
# --- 使用示例 ---
|
||||
# 假设 all_indices_df 是你包含 '399300.SZ', '000905.SH', '000852.SH' 三个指数的长格式行情数据
|
||||
# 确保它有 'ts_code', 'trade_date', 'open', 'high', 'low', 'close', 'vol', 'pct_chg' 列
|
||||
# all_indices_df['trade_date'] = pd.to_datetime(all_indices_df['trade_date'])
|
||||
# all_indices_df = all_indices_df.sort_values(['ts_code', 'trade_date'])
|
||||
|
||||
# daily_market_factors = generate_daily_index_relation_factors(all_indices_df)
|
||||
# print("\n每日市场风格/情绪因子:")
|
||||
# print(daily_market_factors.tail())
|
||||
|
||||
# 后续,你可以将 daily_market_factors 与你的个股数据 pdf 按 'trade_date' 合并
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy.stats import spearmanr # 用于因子3的原始思路,但实际简化了
|
||||
|
||||
epsilon = 1e-10
|
||||
|
||||
def _safe_divide(numerator, denominator, default_val=0.0):
|
||||
"""安全除法"""
|
||||
with np.errstate(divide='ignore', invalid='ignore'):
|
||||
result = numerator / denominator
|
||||
result[~np.isfinite(result)] = default_val
|
||||
return result
|
||||
|
||||
# --- 修改后的因子计算函数 ---
|
||||
|
||||
def calculate_size_style_strength_factor(df: pd.DataFrame, N: int = 5, factor_name_suffix: str = '') -> pd.DataFrame:
|
||||
"""
|
||||
计算大小盘风格相对强度因子。
|
||||
返回: 以 trade_date 为索引,因子值为列的 DataFrame。
|
||||
"""
|
||||
factor_name = f'size_style_strength_{N}{factor_name_suffix}'
|
||||
print(f"Calculating {factor_name}...")
|
||||
|
||||
required_indices = ['399300.SZ', '000905.SH', '000852.SH']
|
||||
if not all(idx in df['ts_code'].unique() for idx in required_indices):
|
||||
print(f"Error: DataFrame 中缺少部分必需的指数代码 ({required_indices})。返回空因子 Series。")
|
||||
return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')
|
||||
|
||||
# 1. 计算各指数N日收益率
|
||||
df_copy = df.copy() # 操作副本,避免修改原始传入df
|
||||
df_copy['_ret_N'] = df_copy.groupby('ts_code')['close'].pct_change(periods=N)
|
||||
|
||||
# 2. Pivot 以方便截面计算
|
||||
pivot_ret_N = df_copy.pivot_table(index='trade_date', columns='ts_code', values='_ret_N')
|
||||
|
||||
# 确保列存在并获取
|
||||
large_ret = pivot_ret_N.get('399300.SZ', pd.Series(np.nan, index=pivot_ret_N.index))
|
||||
mid_ret = pivot_ret_N.get('000905.SH', pd.Series(np.nan, index=pivot_ret_N.index))
|
||||
small_ret = pivot_ret_N.get('000852.SH', pd.Series(np.nan, index=pivot_ret_N.index))
|
||||
|
||||
# 3. 计算因子 (结果是每日一个标量值)
|
||||
large_small_diff = large_ret - small_ret
|
||||
avg_large_small_ret = (large_ret + small_ret) / 2
|
||||
# 计算中盘偏离因子,处理NaN,如果中盘收益为NaN,则偏离因子不起调整作用(乘以1)
|
||||
mid_deviation_raw = mid_ret - avg_large_small_ret
|
||||
mid_deviation_factor = 1 + np.sign(mid_ret.fillna(0)) * np.abs(mid_deviation_raw.fillna(0))
|
||||
|
||||
daily_factor_values = large_small_diff * mid_deviation_factor
|
||||
daily_factor_values.name = factor_name # 给 Series 命名
|
||||
|
||||
print(f"Finished {factor_name}.")
|
||||
return daily_factor_values.to_frame() # 转换为 DataFrame 返回
|
||||
|
||||
def calculate_volatility_structure_factor(df: pd.DataFrame, N: int = 10, factor_name_suffix: str = '') -> pd.DataFrame:
    """Compute the market volatility-structure factor.

    For each trade date the factor is (small-cap vol - mid-cap vol) /
    large-cap vol, where each vol is the rolling N-day standard deviation
    of that index's 'pct_chg'.

    Returns:
        DataFrame indexed by trade_date with a single factor column.
    """
    factor_name = f'vol_structure_idx_{N}{factor_name_suffix}'
    print(f"Calculating {factor_name}...")

    required_indices = ['399300.SZ', '000905.SH', '000852.SH']

    # Bail out with an all-NaN frame when any required index or the
    # return column is missing.
    if not set(required_indices).issubset(df['ts_code'].unique()):
        print(f"Error: DataFrame 中缺少部分必需的指数代码 ({required_indices})。返回空因子 Series。")
        return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')

    if 'pct_chg' not in df.columns:
        print(f"Error: DataFrame 缺少 'pct_chg' 列。将为 {factor_name} 填充 NaN。")
        return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')

    work = df.copy()
    # Rolling N-day std of daily returns per index; min_periods lets a
    # partial window count once at least N//2 observations exist.
    rolled = work.groupby('ts_code')['pct_chg'].rolling(N, min_periods=max(1, N // 2)).std()
    work['_vol_N'] = rolled.reset_index(level=0, drop=True)

    vol_wide = work.pivot_table(index='trade_date', columns='ts_code', values='_vol_N')
    nan_fallback = pd.Series(np.nan, index=vol_wide.index)
    vol_large = vol_wide.get('399300.SZ', nan_fallback)
    vol_mid = vol_wide.get('000905.SH', nan_fallback)
    vol_small = vol_wide.get('000852.SH', nan_fallback)

    # Small-minus-mid volatility spread, normalized by large-cap volatility.
    factor_series = _safe_divide(vol_small - vol_mid, vol_large)
    factor_series.name = factor_name

    print(f"Finished {factor_name}.")
    return factor_series.to_frame()
|
||||
|
||||
def calculate_market_divergence_factor(df: pd.DataFrame, factor_name_suffix: str = '') -> pd.DataFrame:
    """Score daily market divergence from the sign agreement of the three
    size-index returns.

    Score per day:
        0.0  -> all available signs identical (fully aligned market)
        0.25 -> one direction plus a flat (zero) return
        0.75 -> two opposite directions
        1.0  -> up, down and flat all present
        NaN  -> fewer than two indices have data that day

    Returns:
        DataFrame indexed by trade_date with a single factor column.
    """
    factor_name = f'market_divergence_score{factor_name_suffix}'
    print(f"Calculating {factor_name}...")

    required_indices = ['399300.SZ', '000905.SH', '000852.SH']
    if not set(required_indices).issubset(df['ts_code'].unique()):
        print(f"Error: DataFrame 中缺少部分必需的指数代码 ({required_indices})。返回空因子 Series。")
        return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')

    if 'pct_chg' not in df.columns:
        print(f"Error: DataFrame 缺少 'pct_chg' 列。将为 {factor_name} 填充 NaN。")
        return pd.DataFrame(index=df['trade_date'].unique(), columns=[factor_name]).rename_axis('trade_date')

    # Wide table of daily returns, reindexed so all three columns exist
    # (missing ones become NaN).
    returns_wide = (
        df.pivot_table(index='trade_date', columns='ts_code', values='pct_chg')
          .reindex(columns=required_indices)
    )

    def _score(day_row):
        # Score one day's row of (up to) three index returns.
        observed = day_row.dropna()
        if len(observed) < 2:
            return np.nan  # not enough data to judge divergence
        sign_values = set(np.sign(observed).unique())
        if len(sign_values) == 1:
            return 0.0
        if len(sign_values) == 2:
            return 0.25 if 0 in sign_values else 0.75
        if len(sign_values) == 3:
            return 1.0
        return np.nan  # unreachable with three inputs

    factor_series = returns_wide.apply(_score, axis=1)
    factor_series.name = factor_name

    print(f"Finished {factor_name}.")
    return factor_series.to_frame()
|
||||
|
||||
# --- 整合所有因子计算到一个主函数 ---
|
||||
def generate_daily_index_relation_factors(df_input: pd.DataFrame) -> pd.DataFrame:
    """Compute every daily cross-sectional factor built from the
    large/mid/small index relationship.

    Args:
        df_input: long-format index quotes with 'ts_code', 'trade_date',
            'close' and 'pct_chg' columns.

    Returns:
        DataFrame indexed by 'trade_date' with one column per factor.
    """
    # Work on a sorted copy so the rolling / pct_change factor functions
    # see chronologically ordered data and the caller's frame is untouched.
    df = df_input.sort_values(['ts_code', 'trade_date']).reset_index(drop=True)

    candidate_frames = [
        calculate_size_style_strength_factor(df, N=5),
        calculate_volatility_structure_factor(df, N=10),
        calculate_market_divergence_factor(df),
        # Additional daily cross-sectional factors can be appended here.
    ]

    # Drop frames that came back empty or all-NaN (e.g. missing inputs).
    usable_frames = [
        frame for frame in candidate_frames
        if not frame.empty and not frame.iloc[:, 0].isna().all()
    ]

    if not usable_frames:
        print("警告: 未能成功计算任何因子。返回空 DataFrame。")
        # Fall back to an empty frame indexed by the input dates.
        return pd.DataFrame(index=df['trade_date'].unique()).rename_axis('trade_date')

    # Outer-merge on trade_date so every date from every factor survives.
    merged = usable_frames[0]
    for frame in usable_frames[1:]:
        merged = pd.merge(merged, frame, on='trade_date', how='outer')

    return merged.sort_index()
|
||||
|
||||
# --- 使用示例 ---
|
||||
# 假设 all_indices_df 是你包含 '399300.SZ', '000905.SH', '000852.SH' 三个指数的长格式行情数据
|
||||
# 确保它有 'ts_code', 'trade_date', 'open', 'high', 'low', 'close', 'vol', 'pct_chg' 列
|
||||
# all_indices_df['trade_date'] = pd.to_datetime(all_indices_df['trade_date'])
|
||||
# all_indices_df = all_indices_df.sort_values(['ts_code', 'trade_date'])
|
||||
|
||||
# daily_market_factors = generate_daily_index_relation_factors(all_indices_df)
|
||||
# print("\n每日市场风格/情绪因子:")
|
||||
# print(daily_market_factors.tail())
|
||||
|
||||
# 后续,你可以将 daily_market_factors 与你的个股数据 pdf 按 'trade_date' 合并
|
||||
# pdf_with_market_factors = pd.merge(pdf, daily_market_factors, on='trade_date', how='left')
|
||||
@@ -1,7 +1,7 @@
|
||||
|
||||
|
||||
from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data
|
||||
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data
|
||||
|
||||
|
||||
import sys
|
||||
print(sys.path)
|
||||
@@ -1,222 +1,222 @@
|
||||
from tqdm import tqdm
|
||||
|
||||
from main.factor.factor import get_rolling_factor, get_simple_factor
|
||||
from main.utils.utils import read_and_merge_h5_data
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def create_factor_table_clickhouse(clickhouse_host: str, clickhouse_port: int,
                                   clickhouse_user: str, clickhouse_password: str,
                                   clickhouse_database: str, table_name: str = 'factor_data'):
    """
    Create the long-format factor table in ClickHouse, laid out for read speed.

    The table is a MergeTree partitioned by month of `date` and ordered by
    (date, asset_id, factor_name), so point lookups by date/asset are fast.

    NOTE(review): `Client` is not imported anywhere in this file —
    presumably `from clickhouse_driver import Client` is intended; confirm
    and add the import at module level.
    """
    try:
        print('create factor table')
        client = Client(host=clickhouse_host, port=clickhouse_port, user=clickhouse_user,
                        password=clickhouse_password, database=clickhouse_database)

        create_table_query = f"""
        CREATE TABLE IF NOT EXISTS {table_name}
        (
            date Date,
            asset_id String,
            factor_name String,
            factor_value Float64
        )
        ENGINE = MergeTree()
        PARTITION BY toYYYYMM(date)
        ORDER BY (date, asset_id, factor_name)
        """

        client.execute(create_table_query)
        print(f"成功在 ClickHouse 数据库 '{clickhouse_database}' 中创建表 '{table_name}'!")

    except Exception as e:
        print(f"创建 ClickHouse 表发生错误: {e}")
    finally:
        # Disconnect only if the connect call above actually succeeded.
        if 'client' in locals() and client.connection:
            client.disconnect()
|
||||
|
||||
|
||||
def write_features_to_clickhouse(df: pd.DataFrame, feature_columns: list,
                                 clickhouse_host: str, clickhouse_port: int,
                                 clickhouse_user: str, clickhouse_password: str,
                                 clickhouse_database: str, table_name: str = 'stock_factor',
                                 batch_size: int = 5000):  # rows per INSERT batch
    """
    Write the given feature columns of `df` into a wide ClickHouse table in
    batches, adding any missing columns to the table on the fly.

    Rows are keyed by ('date', 'asset_id') taken from the DataFrame's
    'trade_date' and 'ts_code' columns. Features not present in the table
    are added via ALTER TABLE; object-dtype features are skipped.

    Raises:
        ValueError: if 'ts_code' or 'trade_date' is missing from `df`
            (caught by the surrounding except and only printed).

    NOTE(review): `Client` is not imported in this file — presumably
    `from clickhouse_driver import Client`; confirm and add the import.
    NOTE(review): table and column names are interpolated into SQL via
    f-strings — ensure `table_name` and `feature_columns` are trusted.
    """
    try:
        client = Client(host=clickhouse_host, port=clickhouse_port, user=clickhouse_user,
                        password=clickhouse_password, database=clickhouse_database)

        if 'ts_code' not in df.columns or 'trade_date' not in df.columns:
            raise ValueError("DataFrame 必须包含 'ts_code' 和 'trade_date' 列。")

        # Discover which columns the target table already has.
        existing_columns = set()
        columns_query = f"DESCRIBE TABLE {table_name}"
        columns_result = client.execute(columns_query)
        for col in columns_result:
            existing_columns.add(col[0])

        # Add any new feature columns, mapping pandas dtypes to ClickHouse
        # types (float -> Float64, int -> Int64, everything else -> Float64,
        # object -> skipped).
        for factor_name in feature_columns:
            if factor_name not in existing_columns:
                if factor_name not in df.columns:
                    print(f"警告: 特征 '{factor_name}' 不存在于 DataFrame 中,将跳过添加列。")
                    continue

                factor_series = df[factor_name]
                factor_dtype = factor_series.dtype

                clickhouse_dtype = None
                if pd.api.types.is_float_dtype(factor_dtype):
                    clickhouse_dtype = 'Float64'
                elif pd.api.types.is_integer_dtype(factor_dtype):
                    clickhouse_dtype = 'Int64'
                elif factor_dtype == 'object':
                    print(f"警告: 特征 '{factor_name}' 的数据类型为 object,将跳过添加列。")
                    continue
                else:
                    clickhouse_dtype = 'Float64'

                if clickhouse_dtype:
                    add_column_query = f"ALTER TABLE {table_name} ADD COLUMN IF NOT EXISTS {factor_name} {clickhouse_dtype}"
                    client.execute(add_column_query)
                    print(f"在表 '{table_name}' 中添加了新列: {factor_name} ({clickhouse_dtype})")
                    existing_columns.add(factor_name)

        # Final INSERT column order: keys first, then every usable feature.
        insert_columns_order = ['date', 'asset_id'] + [col for col in feature_columns if
                                                       col in existing_columns and col in df.columns]

        # Process the DataFrame in batches to bound memory per INSERT.
        num_rows = len(df)
        for i in tqdm(range(0, num_rows, batch_size), desc="写入批次"):
            batch_df = df[i:i + batch_size]
            data_to_insert_batch = []
            for row in batch_df.itertuples(index=False):
                insert_row = [getattr(row, 'trade_date'), getattr(row, 'ts_code')]
                for factor in feature_columns:
                    if factor in existing_columns and factor in df.columns:
                        try:
                            insert_row.append(getattr(row, factor))
                        except AttributeError:
                            # itertuples mangles some column names; store NULL.
                            insert_row.append(None)
                data_to_insert_batch.append(tuple(insert_row))
            write_batch_to_clickhouse(client, table_name, data_to_insert_batch, insert_columns_order)

    except Exception as e:
        print(f"写入 ClickHouse 发生错误: {e}")
    finally:
        # Disconnect only if the connect call above actually succeeded.
        if 'client' in locals() and client.connection:
            client.disconnect()
|
||||
|
||||
|
||||
def write_batch_to_clickhouse(client, table_name, data_to_insert, columns_order):
    """Insert one prepared batch of rows into the given ClickHouse table."""
    if not data_to_insert:
        return  # nothing to write for an empty batch
    insert_sql = f"INSERT INTO {table_name} ({', '.join(columns_order)}) VALUES"
    try:
        client.execute(insert_sql, data_to_insert)
        print(f"成功写入 {len(data_to_insert)} 条数据到 ClickHouse 表 '{table_name}'!")
    except Exception as e:
        print(f"写入 ClickHouse 批次数据发生错误: {e}")
|
||||
|
||||
|
||||
# -------------------- Usage example --------------------
if __name__ == "__main__":
    # Build the per-stock daily feature frame by merging several H5 sources.

    print('daily data')
    df = read_and_merge_h5_data('../../data/daily_data.h5', key='daily_data',
                                columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg'],
                                df=None)

    print('daily basic')
    # Inner join: keep only stocks that also have daily-basic records.
    df = read_and_merge_h5_data('../../data/daily_basic.h5', key='daily_basic',
                                columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',
                                         'is_st'], df=df, join='inner')
    df = df[df['trade_date'] >= '2021-01-01']

    print('stk limit')
    df = read_and_merge_h5_data('../../data/stk_limit.h5', key='stk_limit',
                                columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],
                                df=df)
    print('money flow')
    df = read_and_merge_h5_data('../../data/money_flow.h5', key='money_flow',
                                columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol',
                                         'sell_lg_vol',
                                         'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],
                                df=df)
    print('cyq perf')
    df = read_and_merge_h5_data('../../data/cyq_perf.h5', key='cyq_perf',
                                columns=['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',
                                         'cost_50pct',
                                         'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],
                                df=df)
    print(df.info())

    # Raw (pre-factor) columns; cyq_* columns are intentionally kept as
    # features by excluding them from this list.
    origin_columns = df.columns.tolist()
    origin_columns = [col for col in origin_columns if 'cyq' not in col]
    print(origin_columns)


    def filter_data(df):
        # Drop ST stocks and boards not traded here (BSE '…BJ', ChiNext '30…',
        # STAR '68…', NEEQ '8…'), and restrict to dates from 2018 onward.
        # df = df.groupby('trade_date').apply(lambda x: x.nlargest(1000, 'act_factor1'))
        df = df[~df['is_st']]
        df = df[~df['ts_code'].str.endswith('BJ')]
        df = df[~df['ts_code'].str.startswith('30')]
        df = df[~df['ts_code'].str.startswith('68')]
        df = df[~df['ts_code'].str.startswith('8')]
        df = df[df['trade_date'] >= '20180101']
        if 'in_date' in df.columns:
            df = df.drop(columns=['in_date'])
        df = df.reset_index(drop=True)
        return df


    df = filter_data(df)
    # Append computed factor columns to the frame.
    df, _ = get_rolling_factor(df)
    df, _ = get_simple_factor(df)
    # df['test'] = 1
    # df['test2'] = 2
    # df = df.merge(industry_df, on=['l2_code', 'trade_date'], how='left')
    df = df.rename(columns={'l2_code': 'cat_l2_code'})
    # df = df.merge(index_data, on='trade_date', how='left')

    print(df.info())

    # Select factor columns to upload: exclude the key columns, label /
    # look-ahead ("future") columns, raw input columns, and internal
    # underscore-prefixed temporaries.
    feature_columns = [col for col in df.columns if col in df.columns]
    feature_columns = [col for col in feature_columns if col not in ['trade_date',
                                                                     'ts_code',
                                                                     'label']]
    feature_columns = [col for col in feature_columns if 'future' not in col]
    feature_columns = [col for col in feature_columns if 'label' not in col]
    feature_columns = [col for col in feature_columns if 'score' not in col]
    feature_columns = [col for col in feature_columns if 'gen' not in col]
    feature_columns = [col for col in feature_columns if 'is_st' not in col]
    # feature_columns = [col for col in feature_columns if 'pe_ttm' not in col]
    # feature_columns = [col for col in feature_columns if 'volatility' not in col]
    # feature_columns = [col for col in feature_columns if 'circ_mv' not in col]
    feature_columns = [col for col in feature_columns if 'cat_l2_code' not in col]
    feature_columns = [col for col in feature_columns if col not in origin_columns]
    feature_columns = [col for col in feature_columns if not col.startswith('_')]

    print(feature_columns)

    # ClickHouse connection settings.
    # NOTE(review): credentials are hard-coded in source — move to config/env.
    clickhouse_host = '127.0.0.1'
    clickhouse_port = 9000
    clickhouse_user = 'default'
    clickhouse_password = 'clickhouse520102'
    clickhouse_database = 'stock_data'

    # create_factor_table_clickhouse(clickhouse_host, clickhouse_port,
    #                                clickhouse_user, clickhouse_password,
    #                                clickhouse_database)

    write_features_to_clickhouse(
        df[[col for col in df.columns if col in ['ts_code', 'trade_date'] or col in feature_columns]], feature_columns,
        clickhouse_host, clickhouse_port,
        clickhouse_user, clickhouse_password,
        clickhouse_database)
|
||||
from tqdm import tqdm
|
||||
|
||||
from main.factor.factor import get_rolling_factor, get_simple_factor
|
||||
from main.utils.utils import read_and_merge_h5_data
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def create_factor_table_clickhouse(clickhouse_host: str, clickhouse_port: int,
                                   clickhouse_user: str, clickhouse_password: str,
                                   clickhouse_database: str, table_name: str = 'factor_data'):
    """
    Create the long-format factor table in ClickHouse, laid out for read speed.

    The table is a MergeTree partitioned by month of `date` and ordered by
    (date, asset_id, factor_name), so point lookups by date/asset are fast.

    NOTE(review): `Client` is not imported anywhere in this file —
    presumably `from clickhouse_driver import Client` is intended; confirm
    and add the import at module level.
    """
    try:
        print('create factor table')
        client = Client(host=clickhouse_host, port=clickhouse_port, user=clickhouse_user,
                        password=clickhouse_password, database=clickhouse_database)

        create_table_query = f"""
        CREATE TABLE IF NOT EXISTS {table_name}
        (
            date Date,
            asset_id String,
            factor_name String,
            factor_value Float64
        )
        ENGINE = MergeTree()
        PARTITION BY toYYYYMM(date)
        ORDER BY (date, asset_id, factor_name)
        """

        client.execute(create_table_query)
        print(f"成功在 ClickHouse 数据库 '{clickhouse_database}' 中创建表 '{table_name}'!")

    except Exception as e:
        print(f"创建 ClickHouse 表发生错误: {e}")
    finally:
        # Disconnect only if the connect call above actually succeeded.
        if 'client' in locals() and client.connection:
            client.disconnect()
|
||||
|
||||
|
||||
def write_features_to_clickhouse(df: pd.DataFrame, feature_columns: list,
                                 clickhouse_host: str, clickhouse_port: int,
                                 clickhouse_user: str, clickhouse_password: str,
                                 clickhouse_database: str, table_name: str = 'stock_factor',
                                 batch_size: int = 5000):  # rows per INSERT batch
    """
    Write the given feature columns of `df` into a wide ClickHouse table in
    batches, adding any missing columns to the table on the fly.

    Rows are keyed by ('date', 'asset_id') taken from the DataFrame's
    'trade_date' and 'ts_code' columns. Features not present in the table
    are added via ALTER TABLE; object-dtype features are skipped.

    Raises:
        ValueError: if 'ts_code' or 'trade_date' is missing from `df`
            (caught by the surrounding except and only printed).

    NOTE(review): `Client` is not imported in this file — presumably
    `from clickhouse_driver import Client`; confirm and add the import.
    NOTE(review): table and column names are interpolated into SQL via
    f-strings — ensure `table_name` and `feature_columns` are trusted.
    """
    try:
        client = Client(host=clickhouse_host, port=clickhouse_port, user=clickhouse_user,
                        password=clickhouse_password, database=clickhouse_database)

        if 'ts_code' not in df.columns or 'trade_date' not in df.columns:
            raise ValueError("DataFrame 必须包含 'ts_code' 和 'trade_date' 列。")

        # Discover which columns the target table already has.
        existing_columns = set()
        columns_query = f"DESCRIBE TABLE {table_name}"
        columns_result = client.execute(columns_query)
        for col in columns_result:
            existing_columns.add(col[0])

        # Add any new feature columns, mapping pandas dtypes to ClickHouse
        # types (float -> Float64, int -> Int64, everything else -> Float64,
        # object -> skipped).
        for factor_name in feature_columns:
            if factor_name not in existing_columns:
                if factor_name not in df.columns:
                    print(f"警告: 特征 '{factor_name}' 不存在于 DataFrame 中,将跳过添加列。")
                    continue

                factor_series = df[factor_name]
                factor_dtype = factor_series.dtype

                clickhouse_dtype = None
                if pd.api.types.is_float_dtype(factor_dtype):
                    clickhouse_dtype = 'Float64'
                elif pd.api.types.is_integer_dtype(factor_dtype):
                    clickhouse_dtype = 'Int64'
                elif factor_dtype == 'object':
                    print(f"警告: 特征 '{factor_name}' 的数据类型为 object,将跳过添加列。")
                    continue
                else:
                    clickhouse_dtype = 'Float64'

                if clickhouse_dtype:
                    add_column_query = f"ALTER TABLE {table_name} ADD COLUMN IF NOT EXISTS {factor_name} {clickhouse_dtype}"
                    client.execute(add_column_query)
                    print(f"在表 '{table_name}' 中添加了新列: {factor_name} ({clickhouse_dtype})")
                    existing_columns.add(factor_name)

        # Final INSERT column order: keys first, then every usable feature.
        insert_columns_order = ['date', 'asset_id'] + [col for col in feature_columns if
                                                       col in existing_columns and col in df.columns]

        # Process the DataFrame in batches to bound memory per INSERT.
        num_rows = len(df)
        for i in tqdm(range(0, num_rows, batch_size), desc="写入批次"):
            batch_df = df[i:i + batch_size]
            data_to_insert_batch = []
            for row in batch_df.itertuples(index=False):
                insert_row = [getattr(row, 'trade_date'), getattr(row, 'ts_code')]
                for factor in feature_columns:
                    if factor in existing_columns and factor in df.columns:
                        try:
                            insert_row.append(getattr(row, factor))
                        except AttributeError:
                            # itertuples mangles some column names; store NULL.
                            insert_row.append(None)
                data_to_insert_batch.append(tuple(insert_row))
            write_batch_to_clickhouse(client, table_name, data_to_insert_batch, insert_columns_order)

    except Exception as e:
        print(f"写入 ClickHouse 发生错误: {e}")
    finally:
        # Disconnect only if the connect call above actually succeeded.
        if 'client' in locals() and client.connection:
            client.disconnect()
|
||||
|
||||
|
||||
def write_batch_to_clickhouse(client, table_name, data_to_insert, columns_order):
    """Insert one prepared batch of rows into the given ClickHouse table."""
    if not data_to_insert:
        return  # nothing to write for an empty batch
    insert_sql = f"INSERT INTO {table_name} ({', '.join(columns_order)}) VALUES"
    try:
        client.execute(insert_sql, data_to_insert)
        print(f"成功写入 {len(data_to_insert)} 条数据到 ClickHouse 表 '{table_name}'!")
    except Exception as e:
        print(f"写入 ClickHouse 批次数据发生错误: {e}")
|
||||
|
||||
|
||||
# -------------------- Usage example --------------------
if __name__ == "__main__":
    # Build the per-stock daily feature frame by merging several H5 sources.

    print('daily data')
    df = read_and_merge_h5_data('../../data/daily_data.h5', key='daily_data',
                                columns=['ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'vol', 'pct_chg'],
                                df=None)

    print('daily basic')
    # Inner join: keep only stocks that also have daily-basic records.
    df = read_and_merge_h5_data('../../data/daily_basic.h5', key='daily_basic',
                                columns=['ts_code', 'trade_date', 'turnover_rate', 'pe_ttm', 'circ_mv', 'volume_ratio',
                                         'is_st'], df=df, join='inner')
    df = df[df['trade_date'] >= '2021-01-01']

    print('stk limit')
    df = read_and_merge_h5_data('../../data/stk_limit.h5', key='stk_limit',
                                columns=['ts_code', 'trade_date', 'pre_close', 'up_limit', 'down_limit'],
                                df=df)
    print('money flow')
    df = read_and_merge_h5_data('../../data/money_flow.h5', key='money_flow',
                                columns=['ts_code', 'trade_date', 'buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol',
                                         'sell_lg_vol',
                                         'buy_elg_vol', 'sell_elg_vol', 'net_mf_vol'],
                                df=df)
    print('cyq perf')
    df = read_and_merge_h5_data('../../data/cyq_perf.h5', key='cyq_perf',
                                columns=['ts_code', 'trade_date', 'his_low', 'his_high', 'cost_5pct', 'cost_15pct',
                                         'cost_50pct',
                                         'cost_85pct', 'cost_95pct', 'weight_avg', 'winner_rate'],
                                df=df)
    print(df.info())

    # Raw (pre-factor) columns; cyq_* columns are intentionally kept as
    # features by excluding them from this list.
    origin_columns = df.columns.tolist()
    origin_columns = [col for col in origin_columns if 'cyq' not in col]
    print(origin_columns)


    def filter_data(df):
        # Drop ST stocks and boards not traded here (BSE '…BJ', ChiNext '30…',
        # STAR '68…', NEEQ '8…'), and restrict to dates from 2018 onward.
        # df = df.groupby('trade_date').apply(lambda x: x.nlargest(1000, 'act_factor1'))
        df = df[~df['is_st']]
        df = df[~df['ts_code'].str.endswith('BJ')]
        df = df[~df['ts_code'].str.startswith('30')]
        df = df[~df['ts_code'].str.startswith('68')]
        df = df[~df['ts_code'].str.startswith('8')]
        df = df[df['trade_date'] >= '20180101']
        if 'in_date' in df.columns:
            df = df.drop(columns=['in_date'])
        df = df.reset_index(drop=True)
        return df


    df = filter_data(df)
    # Append computed factor columns to the frame.
    df, _ = get_rolling_factor(df)
    df, _ = get_simple_factor(df)
    # df['test'] = 1
    # df['test2'] = 2
    # df = df.merge(industry_df, on=['l2_code', 'trade_date'], how='left')
    df = df.rename(columns={'l2_code': 'cat_l2_code'})
    # df = df.merge(index_data, on='trade_date', how='left')

    print(df.info())

    # Select factor columns to upload: exclude the key columns, label /
    # look-ahead ("future") columns, raw input columns, and internal
    # underscore-prefixed temporaries.
    feature_columns = [col for col in df.columns if col in df.columns]
    feature_columns = [col for col in feature_columns if col not in ['trade_date',
                                                                     'ts_code',
                                                                     'label']]
    feature_columns = [col for col in feature_columns if 'future' not in col]
    feature_columns = [col for col in feature_columns if 'label' not in col]
    feature_columns = [col for col in feature_columns if 'score' not in col]
    feature_columns = [col for col in feature_columns if 'gen' not in col]
    feature_columns = [col for col in feature_columns if 'is_st' not in col]
    # feature_columns = [col for col in feature_columns if 'pe_ttm' not in col]
    # feature_columns = [col for col in feature_columns if 'volatility' not in col]
    # feature_columns = [col for col in feature_columns if 'circ_mv' not in col]
    feature_columns = [col for col in feature_columns if 'cat_l2_code' not in col]
    feature_columns = [col for col in feature_columns if col not in origin_columns]
    feature_columns = [col for col in feature_columns if not col.startswith('_')]

    print(feature_columns)

    # ClickHouse connection settings.
    # NOTE(review): credentials are hard-coded in source — move to config/env.
    clickhouse_host = '127.0.0.1'
    clickhouse_port = 9000
    clickhouse_user = 'default'
    clickhouse_password = 'clickhouse520102'
    clickhouse_database = 'stock_data'

    # create_factor_table_clickhouse(clickhouse_host, clickhouse_port,
    #                                clickhouse_user, clickhouse_password,
    #                                clickhouse_database)

    write_features_to_clickhouse(
        df[[col for col in df.columns if col in ['ts_code', 'trade_date'] or col in feature_columns]], feature_columns,
        clickhouse_host, clickhouse_port,
        clickhouse_user, clickhouse_password,
        clickhouse_database)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import sys
|
||||
print(sys.path)
|
||||
|
||||
from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data
|
||||
import sys
|
||||
print(sys.path)
|
||||
|
||||
from main.utils.utils import read_and_merge_h5_data, merge_with_industry_data
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,15 +1,15 @@
|
||||
from operator import index
|
||||
|
||||
import tushare as ts
|
||||
import pandas as pd
|
||||
import time
|
||||
import akshare as ak
|
||||
|
||||
from main.factor.factor import calculate_arbr
|
||||
|
||||
ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')
|
||||
pro = ts.pro_api()
|
||||
|
||||
df = pro.dc_member(trade_date='20190105')
|
||||
|
||||
from operator import index
|
||||
|
||||
import tushare as ts
|
||||
import pandas as pd
|
||||
import time
|
||||
import akshare as ak
|
||||
|
||||
from main.factor.factor import calculate_arbr
|
||||
|
||||
ts.set_token('3a0741c702ee7e5e5f2bf1f0846bafaafe4e320833240b2a7e4a685f')
|
||||
pro = ts.pro_api()
|
||||
|
||||
df = pro.dc_member(trade_date='20190105')
|
||||
|
||||
print(df)
|
||||
1130
main/train/test1.tsv
1130
main/train/test1.tsv
File diff suppressed because it is too large
Load Diff
@@ -1,72 +1,72 @@
|
||||
trade_date,score,ts_code
|
||||
2024-12-09,1.1968650846005326,600593.SH
|
||||
2024-12-10,0.21490252296809745,002611.SZ
|
||||
2024-12-11,0.5721914798956016,002321.SZ
|
||||
2024-12-12,0.6509338263544048,600628.SH
|
||||
2024-12-13,2.1288113028385376,600628.SH
|
||||
2024-12-16,1.378346480524284,002086.SZ
|
||||
2024-12-17,1.45986967550941,002741.SZ
|
||||
2024-12-18,1.3436778254529067,600579.SH
|
||||
2024-12-19,0.41218776805787716,600796.SH
|
||||
2024-12-20,1.0840917563770454,603421.SH
|
||||
2024-12-23,1.00141172278312,600889.SH
|
||||
2024-12-24,1.0354156548919864,600725.SH
|
||||
2024-12-25,0.9562524807100355,600066.SH
|
||||
2024-12-26,1.1279048294352958,002916.SZ
|
||||
2024-12-27,0.6532174116474766,002068.SZ
|
||||
2024-12-30,-0.1308794790538431,002918.SZ
|
||||
2024-12-31,0.7160474599127873,600857.SH
|
||||
2025-01-02,1.5067649520721882,002449.SZ
|
||||
2025-01-03,0.9282246137432282,603379.SH
|
||||
2025-01-06,0.6797051204009213,603893.SH
|
||||
2025-01-07,0.9376184079476354,603236.SH
|
||||
2025-01-08,0.9064516934700023,603308.SH
|
||||
2025-01-09,0.9314493554789942,000880.SZ
|
||||
2025-01-10,0.5025761501709369,600584.SH
|
||||
2025-01-13,0.7483210862212708,000063.SZ
|
||||
2025-01-14,1.2632673941368837,000063.SZ
|
||||
2025-01-15,1.8580661802761587,002917.SZ
|
||||
2025-01-16,1.1918089652002073,600693.SH
|
||||
2025-01-17,0.8288939941365315,600693.SH
|
||||
2025-01-20,0.677726091977902,002577.SZ
|
||||
2025-01-21,1.8336548268410158,603893.SH
|
||||
2025-01-22,1.0395051538956546,000573.SZ
|
||||
2025-01-23,0.4308220427423068,003021.SZ
|
||||
2025-01-24,1.8057941775723685,002862.SZ
|
||||
2025-01-27,1.216662909774701,002779.SZ
|
||||
2025-02-05,0.8484867753831473,603990.SH
|
||||
2025-02-06,0.5038824073142949,001380.SZ
|
||||
2025-02-07,0.7672133571524726,002031.SZ
|
||||
2025-02-10,0.5417223016033719,000681.SZ
|
||||
2025-02-11,0.9399374716518157,000034.SZ
|
||||
2025-02-12,1.8742056631297925,000856.SZ
|
||||
2025-02-13,1.4837670146272484,600633.SH
|
||||
2025-02-14,1.2043600916692372,605488.SH
|
||||
2025-02-17,1.1259104542173328,603918.SH
|
||||
2025-02-18,1.1806931791732853,600126.SH
|
||||
2025-02-19,1.020437698817749,603956.SH
|
||||
2025-02-20,0.5818349669113919,003021.SZ
|
||||
2025-02-21,1.0941497070930342,603950.SH
|
||||
2025-02-24,1.867258980329339,600602.SH
|
||||
2025-02-25,0.8646726218943293,002691.SZ
|
||||
2025-02-26,1.2878484406363957,002245.SZ
|
||||
2025-02-27,1.3013902577988068,600173.SH
|
||||
2025-02-28,0.7804376426721501,603040.SH
|
||||
2025-03-03,0.45593268249434266,002345.SZ
|
||||
2025-03-04,0.9265705061587579,600589.SH
|
||||
2025-03-05,0.766962270753268,002575.SZ
|
||||
2025-03-06,0.7030260458187082,601100.SH
|
||||
2025-03-07,0.924011193171594,002896.SZ
|
||||
2025-03-10,1.0811487252993004,600126.SH
|
||||
2025-03-11,1.272392599656189,002896.SZ
|
||||
2025-03-12,1.0905437448562905,002276.SZ
|
||||
2025-03-13,1.0688995313878895,003038.SZ
|
||||
2025-03-14,1.2418913857438587,001256.SZ
|
||||
2025-03-17,1.004550155323,001256.SZ
|
||||
2025-03-18,0.7517848278576412,600403.SH
|
||||
2025-03-19,1.5106246878723002,605008.SH
|
||||
2025-03-20,1.1509811695536982,600610.SH
|
||||
2025-03-21,0.6033998331536018,603196.SH
|
||||
2025-03-24,0.3456173948047773,002345.SZ
|
||||
2025-03-25,1.470314131581159,600320.SH
|
||||
2025-03-26,0.745243100558546,603325.SH
|
||||
trade_date,score,ts_code
|
||||
2024-12-09,1.1968650846005326,600593.SH
|
||||
2024-12-10,0.21490252296809745,002611.SZ
|
||||
2024-12-11,0.5721914798956016,002321.SZ
|
||||
2024-12-12,0.6509338263544048,600628.SH
|
||||
2024-12-13,2.1288113028385376,600628.SH
|
||||
2024-12-16,1.378346480524284,002086.SZ
|
||||
2024-12-17,1.45986967550941,002741.SZ
|
||||
2024-12-18,1.3436778254529067,600579.SH
|
||||
2024-12-19,0.41218776805787716,600796.SH
|
||||
2024-12-20,1.0840917563770454,603421.SH
|
||||
2024-12-23,1.00141172278312,600889.SH
|
||||
2024-12-24,1.0354156548919864,600725.SH
|
||||
2024-12-25,0.9562524807100355,600066.SH
|
||||
2024-12-26,1.1279048294352958,002916.SZ
|
||||
2024-12-27,0.6532174116474766,002068.SZ
|
||||
2024-12-30,-0.1308794790538431,002918.SZ
|
||||
2024-12-31,0.7160474599127873,600857.SH
|
||||
2025-01-02,1.5067649520721882,002449.SZ
|
||||
2025-01-03,0.9282246137432282,603379.SH
|
||||
2025-01-06,0.6797051204009213,603893.SH
|
||||
2025-01-07,0.9376184079476354,603236.SH
|
||||
2025-01-08,0.9064516934700023,603308.SH
|
||||
2025-01-09,0.9314493554789942,000880.SZ
|
||||
2025-01-10,0.5025761501709369,600584.SH
|
||||
2025-01-13,0.7483210862212708,000063.SZ
|
||||
2025-01-14,1.2632673941368837,000063.SZ
|
||||
2025-01-15,1.8580661802761587,002917.SZ
|
||||
2025-01-16,1.1918089652002073,600693.SH
|
||||
2025-01-17,0.8288939941365315,600693.SH
|
||||
2025-01-20,0.677726091977902,002577.SZ
|
||||
2025-01-21,1.8336548268410158,603893.SH
|
||||
2025-01-22,1.0395051538956546,000573.SZ
|
||||
2025-01-23,0.4308220427423068,003021.SZ
|
||||
2025-01-24,1.8057941775723685,002862.SZ
|
||||
2025-01-27,1.216662909774701,002779.SZ
|
||||
2025-02-05,0.8484867753831473,603990.SH
|
||||
2025-02-06,0.5038824073142949,001380.SZ
|
||||
2025-02-07,0.7672133571524726,002031.SZ
|
||||
2025-02-10,0.5417223016033719,000681.SZ
|
||||
2025-02-11,0.9399374716518157,000034.SZ
|
||||
2025-02-12,1.8742056631297925,000856.SZ
|
||||
2025-02-13,1.4837670146272484,600633.SH
|
||||
2025-02-14,1.2043600916692372,605488.SH
|
||||
2025-02-17,1.1259104542173328,603918.SH
|
||||
2025-02-18,1.1806931791732853,600126.SH
|
||||
2025-02-19,1.020437698817749,603956.SH
|
||||
2025-02-20,0.5818349669113919,003021.SZ
|
||||
2025-02-21,1.0941497070930342,603950.SH
|
||||
2025-02-24,1.867258980329339,600602.SH
|
||||
2025-02-25,0.8646726218943293,002691.SZ
|
||||
2025-02-26,1.2878484406363957,002245.SZ
|
||||
2025-02-27,1.3013902577988068,600173.SH
|
||||
2025-02-28,0.7804376426721501,603040.SH
|
||||
2025-03-03,0.45593268249434266,002345.SZ
|
||||
2025-03-04,0.9265705061587579,600589.SH
|
||||
2025-03-05,0.766962270753268,002575.SZ
|
||||
2025-03-06,0.7030260458187082,601100.SH
|
||||
2025-03-07,0.924011193171594,002896.SZ
|
||||
2025-03-10,1.0811487252993004,600126.SH
|
||||
2025-03-11,1.272392599656189,002896.SZ
|
||||
2025-03-12,1.0905437448562905,002276.SZ
|
||||
2025-03-13,1.0688995313878895,003038.SZ
|
||||
2025-03-14,1.2418913857438587,001256.SZ
|
||||
2025-03-17,1.004550155323,001256.SZ
|
||||
2025-03-18,0.7517848278576412,600403.SH
|
||||
2025-03-19,1.5106246878723002,605008.SH
|
||||
2025-03-20,1.1509811695536982,600610.SH
|
||||
2025-03-21,0.6033998331536018,603196.SH
|
||||
2025-03-24,0.3456173948047773,002345.SZ
|
||||
2025-03-25,1.470314131581159,600320.SH
|
||||
2025-03-26,0.745243100558546,603325.SH
|
||||
|
||||
|
BIN
main/utils/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
main/utils/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
main/utils/__pycache__/factor.cpython-313.pyc
Normal file
BIN
main/utils/__pycache__/factor.cpython-313.pyc
Normal file
Binary file not shown.
BIN
main/utils/__pycache__/factor_processor.cpython-313.pyc
Normal file
BIN
main/utils/__pycache__/factor_processor.cpython-313.pyc
Normal file
Binary file not shown.
BIN
main/utils/__pycache__/utils.cpython-313.pyc
Normal file
BIN
main/utils/__pycache__/utils.cpython-313.pyc
Normal file
Binary file not shown.
1560
main/utils/factor.py
1560
main/utils/factor.py
File diff suppressed because it is too large
Load Diff
@@ -1,233 +1,233 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from scipy.stats import ks_2samp
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
|
||||
def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05, wasserstein_threshold=0.1, size=0.8,
|
||||
log=True, val_data=None):
|
||||
dropped_features = []
|
||||
|
||||
if val_data is None:
|
||||
all_dates = sorted(train_data['trade_date'].unique().tolist()) # 获取所有唯一的 trade_date
|
||||
split_date = all_dates[int(len(all_dates) * size)] # 划分点为倒数第 validation_days 天
|
||||
train_data_split = train_data[train_data['trade_date'] < split_date] # 训练集
|
||||
val_data_split = train_data[train_data['trade_date'] >= split_date] # 验证集
|
||||
else:
|
||||
train_data_split = train_data
|
||||
val_data_split = val_data
|
||||
|
||||
# **统计数据漂移**
|
||||
numeric_columns = train_data_split.select_dtypes(include=['float64', 'int64']).columns
|
||||
numeric_columns = [col for col in numeric_columns if col in feature_columns]
|
||||
for feature in numeric_columns:
|
||||
ks_stat, p_value = ks_2samp(train_data_split[feature], val_data_split[feature])
|
||||
# wasserstein_dist = wasserstein_distance(train_data_split[feature], val_data_split[feature])
|
||||
|
||||
# if p_value < ks_threshold or wasserstein_dist > wasserstein_threshold:
|
||||
if p_value < ks_threshold:
|
||||
dropped_features.append(feature)
|
||||
if log:
|
||||
print(f"检测到 {len(dropped_features)} 个可能漂移的特征: {dropped_features}")
|
||||
|
||||
# **应用阈值进行最终筛选**
|
||||
filtered_features = [f for f in feature_columns if f not in dropped_features]
|
||||
|
||||
return filtered_features, dropped_features
|
||||
|
||||
|
||||
def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01, upper_percentile: float = 0.99,
|
||||
log=True):
|
||||
if not (0 <= lower_percentile < upper_percentile <= 1):
|
||||
raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")
|
||||
|
||||
# Calculate lower and upper bounds based on percentiles
|
||||
lower_bound = label.quantile(lower_percentile)
|
||||
upper_bound = label.quantile(upper_percentile)
|
||||
|
||||
# Filter out values outside the bounds
|
||||
filtered_label = label[(label >= lower_bound) & (label <= upper_bound)]
|
||||
|
||||
# Print the number of removed outliers
|
||||
if log:
|
||||
print(f"Removed {len(label) - len(filtered_label)} outliers.")
|
||||
return filtered_label
|
||||
|
||||
|
||||
def calculate_risk_adjusted_target(df, days=5):
|
||||
df = df.sort_values(by=['ts_code', 'trade_date'])
|
||||
|
||||
df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
|
||||
df['future_open'] = df.groupby('ts_code')['open'].shift(-1)
|
||||
df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']
|
||||
|
||||
df['future_volatility'] = df.groupby('ts_code')['future_return'].rolling(days, min_periods=1).std().reset_index(
|
||||
level=0, drop=True)
|
||||
sharpe_ratio = df['future_return'] * df['future_volatility']
|
||||
sharpe_ratio.replace([np.inf, -np.inf], np.nan, inplace=True)
|
||||
|
||||
return sharpe_ratio
|
||||
|
||||
|
||||
def calculate_score(df, days=5, lambda_param=1.0):
|
||||
def calculate_max_drawdown(prices):
|
||||
peak = prices.iloc[0] # 初始化峰值
|
||||
max_drawdown = 0 # 初始化最大回撤
|
||||
|
||||
for price in prices:
|
||||
if price > peak:
|
||||
peak = price # 更新峰值
|
||||
else:
|
||||
drawdown = (peak - price) / peak # 计算当前回撤
|
||||
max_drawdown = max(max_drawdown, drawdown) # 更新最大回撤
|
||||
|
||||
return max_drawdown
|
||||
|
||||
def compute_stock_score(stock_df):
|
||||
stock_df = stock_df.sort_values(by=['trade_date'])
|
||||
future_return = stock_df['future_return']
|
||||
# 使用已有的 pct_chg 字段计算波动率
|
||||
volatility = stock_df['pct_chg'].rolling(days).std().shift(-days)
|
||||
max_drawdown = stock_df['close'].rolling(days).apply(calculate_max_drawdown, raw=False).shift(-days)
|
||||
score = future_return - lambda_param * max_drawdown
|
||||
return score
|
||||
|
||||
# # 确保 DataFrame 按照股票代码和交易日期排序
|
||||
# df = df.sort_values(by=['ts_code', 'trade_date'])
|
||||
|
||||
# 对每个股票分别计算 score
|
||||
df['score'] = df.groupby('ts_code').apply(compute_stock_score).reset_index(level=0, drop=True)
|
||||
|
||||
return df['score']
|
||||
|
||||
|
||||
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
|
||||
numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
|
||||
if not numeric_features:
|
||||
raise ValueError("No numeric features found in the provided data.")
|
||||
|
||||
corr_matrix = df[numeric_features].corr().abs()
|
||||
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
||||
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
|
||||
remaining_features = [col for col in feature_columns if col not in to_drop
|
||||
or 'act' in col or 'af' in col]
|
||||
return remaining_features
|
||||
|
||||
|
||||
def cross_sectional_standardization(df, features):
|
||||
df_sorted = df.sort_values(by='trade_date') # 按时间排序
|
||||
df_standardized = df_sorted.copy()
|
||||
|
||||
for date in df_sorted['trade_date'].unique():
|
||||
# 获取当前时间点的数据
|
||||
current_data = df_standardized[df_standardized['trade_date'] == date]
|
||||
|
||||
# 只对指定特征进行标准化
|
||||
scaler = StandardScaler()
|
||||
standardized_values = scaler.fit_transform(current_data[features])
|
||||
|
||||
# 将标准化结果重新赋值回去
|
||||
df_standardized.loc[df_standardized['trade_date'] == date, features] = standardized_values
|
||||
|
||||
return df_standardized
|
||||
|
||||
|
||||
def neutralize_manual(df, features, industry_col, mkt_cap_col):
|
||||
""" 手动实现简单回归以提升速度 """
|
||||
|
||||
for col in features:
|
||||
residuals = []
|
||||
for _, group in df.groupby(industry_col):
|
||||
if len(group) > 1:
|
||||
x = np.log(group[mkt_cap_col]) # 市值对数
|
||||
y = group[col] # 因子值
|
||||
beta = np.cov(y, x)[0, 1] / np.var(x) # 计算斜率
|
||||
alpha = np.mean(y) - beta * np.mean(x) # 计算截距
|
||||
resid = y - (alpha + beta * x) # 计算残差
|
||||
residuals.extend(resid)
|
||||
else:
|
||||
residuals.extend(group[col]) # 样本不足时保留原值
|
||||
|
||||
df[col] = residuals
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def mad_filter(df, features, n=3):
|
||||
for col in features:
|
||||
median = df[col].median()
|
||||
mad = np.median(np.abs(df[col] - median))
|
||||
upper = median + n * mad
|
||||
lower = median - n * mad
|
||||
df[col] = np.clip(df[col], lower, upper) # 截断极值
|
||||
return df
|
||||
|
||||
|
||||
def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
|
||||
for col in features:
|
||||
# 按日期分组计算上下百分位数
|
||||
lower_bound = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.quantile(lower_percentile)
|
||||
)
|
||||
upper_bound = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.quantile(upper_percentile)
|
||||
)
|
||||
# 截断超出范围的值
|
||||
df[col] = np.clip(df[col], lower_bound, upper_bound)
|
||||
return df
|
||||
|
||||
|
||||
from scipy.stats import iqr
|
||||
|
||||
|
||||
def iqr_filter(df, features):
|
||||
for col in features:
|
||||
df[col] = df.groupby('trade_date')[col].transform(
|
||||
lambda x: (x - x.median()) / iqr(x) if iqr(x) != 0 else x
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
|
||||
df = df.copy()
|
||||
for col in features:
|
||||
# 计算 rolling 统计量,需要按日期进行 groupby
|
||||
rolling_lower = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.rolling(window=min(len(x), window)).quantile(lower_quantile))
|
||||
rolling_upper = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.rolling(window=min(len(x), window)).quantile(upper_quantile))
|
||||
|
||||
# 对数据进行裁剪
|
||||
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
|
||||
df = df.copy()
|
||||
# 确保按股票和时间排序
|
||||
df = df.sort_values(['ts_code', 'trade_date'])
|
||||
grouped = df.groupby('ts_code')
|
||||
for col in features:
|
||||
# 对每个股票的时间序列计算滚动分位数
|
||||
rolling_lower = grouped[col].rolling(window=window, min_periods=window // 2).quantile(lower_quantile)
|
||||
rolling_upper = grouped[col].rolling(window=window, min_periods=window // 2).quantile(upper_quantile)
|
||||
# rolling结果带有多重索引,需要对齐
|
||||
rolling_lower = rolling_lower.reset_index(level=0, drop=True)
|
||||
rolling_upper = rolling_upper.reset_index(level=0, drop=True)
|
||||
# 应用 clip
|
||||
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
|
||||
return df
|
||||
|
||||
|
||||
def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
|
||||
df = df.copy()
|
||||
grouped = df.groupby('trade_date')
|
||||
for col in features:
|
||||
# 计算每日截面的分位数边界
|
||||
lower_bound = grouped[col].transform(lambda x: x.quantile(lower_quantile))
|
||||
upper_bound = grouped[col].transform(lambda x: x.quantile(upper_quantile))
|
||||
# 应用 clip
|
||||
df[col] = np.clip(df[col], lower_bound, upper_bound)
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from scipy.stats import ks_2samp
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
|
||||
def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05, wasserstein_threshold=0.1, size=0.8,
|
||||
log=True, val_data=None):
|
||||
dropped_features = []
|
||||
|
||||
if val_data is None:
|
||||
all_dates = sorted(train_data['trade_date'].unique().tolist()) # 获取所有唯一的 trade_date
|
||||
split_date = all_dates[int(len(all_dates) * size)] # 划分点为倒数第 validation_days 天
|
||||
train_data_split = train_data[train_data['trade_date'] < split_date] # 训练集
|
||||
val_data_split = train_data[train_data['trade_date'] >= split_date] # 验证集
|
||||
else:
|
||||
train_data_split = train_data
|
||||
val_data_split = val_data
|
||||
|
||||
# **统计数据漂移**
|
||||
numeric_columns = train_data_split.select_dtypes(include=['float64', 'int64']).columns
|
||||
numeric_columns = [col for col in numeric_columns if col in feature_columns]
|
||||
for feature in numeric_columns:
|
||||
ks_stat, p_value = ks_2samp(train_data_split[feature], val_data_split[feature])
|
||||
# wasserstein_dist = wasserstein_distance(train_data_split[feature], val_data_split[feature])
|
||||
|
||||
# if p_value < ks_threshold or wasserstein_dist > wasserstein_threshold:
|
||||
if p_value < ks_threshold:
|
||||
dropped_features.append(feature)
|
||||
if log:
|
||||
print(f"检测到 {len(dropped_features)} 个可能漂移的特征: {dropped_features}")
|
||||
|
||||
# **应用阈值进行最终筛选**
|
||||
filtered_features = [f for f in feature_columns if f not in dropped_features]
|
||||
|
||||
return filtered_features, dropped_features
|
||||
|
||||
|
||||
def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01, upper_percentile: float = 0.99,
|
||||
log=True):
|
||||
if not (0 <= lower_percentile < upper_percentile <= 1):
|
||||
raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")
|
||||
|
||||
# Calculate lower and upper bounds based on percentiles
|
||||
lower_bound = label.quantile(lower_percentile)
|
||||
upper_bound = label.quantile(upper_percentile)
|
||||
|
||||
# Filter out values outside the bounds
|
||||
filtered_label = label[(label >= lower_bound) & (label <= upper_bound)]
|
||||
|
||||
# Print the number of removed outliers
|
||||
if log:
|
||||
print(f"Removed {len(label) - len(filtered_label)} outliers.")
|
||||
return filtered_label
|
||||
|
||||
|
||||
def calculate_risk_adjusted_target(df, days=5):
|
||||
df = df.sort_values(by=['ts_code', 'trade_date'])
|
||||
|
||||
df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
|
||||
df['future_open'] = df.groupby('ts_code')['open'].shift(-1)
|
||||
df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']
|
||||
|
||||
df['future_volatility'] = df.groupby('ts_code')['future_return'].rolling(days, min_periods=1).std().reset_index(
|
||||
level=0, drop=True)
|
||||
sharpe_ratio = df['future_return'] * df['future_volatility']
|
||||
sharpe_ratio.replace([np.inf, -np.inf], np.nan, inplace=True)
|
||||
|
||||
return sharpe_ratio
|
||||
|
||||
|
||||
def calculate_score(df, days=5, lambda_param=1.0):
|
||||
def calculate_max_drawdown(prices):
|
||||
peak = prices.iloc[0] # 初始化峰值
|
||||
max_drawdown = 0 # 初始化最大回撤
|
||||
|
||||
for price in prices:
|
||||
if price > peak:
|
||||
peak = price # 更新峰值
|
||||
else:
|
||||
drawdown = (peak - price) / peak # 计算当前回撤
|
||||
max_drawdown = max(max_drawdown, drawdown) # 更新最大回撤
|
||||
|
||||
return max_drawdown
|
||||
|
||||
def compute_stock_score(stock_df):
|
||||
stock_df = stock_df.sort_values(by=['trade_date'])
|
||||
future_return = stock_df['future_return']
|
||||
# 使用已有的 pct_chg 字段计算波动率
|
||||
volatility = stock_df['pct_chg'].rolling(days).std().shift(-days)
|
||||
max_drawdown = stock_df['close'].rolling(days).apply(calculate_max_drawdown, raw=False).shift(-days)
|
||||
score = future_return - lambda_param * max_drawdown
|
||||
return score
|
||||
|
||||
# # 确保 DataFrame 按照股票代码和交易日期排序
|
||||
# df = df.sort_values(by=['ts_code', 'trade_date'])
|
||||
|
||||
# 对每个股票分别计算 score
|
||||
df['score'] = df.groupby('ts_code').apply(compute_stock_score).reset_index(level=0, drop=True)
|
||||
|
||||
return df['score']
|
||||
|
||||
|
||||
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
|
||||
numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
|
||||
if not numeric_features:
|
||||
raise ValueError("No numeric features found in the provided data.")
|
||||
|
||||
corr_matrix = df[numeric_features].corr().abs()
|
||||
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
||||
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
|
||||
remaining_features = [col for col in feature_columns if col not in to_drop
|
||||
or 'act' in col or 'af' in col]
|
||||
return remaining_features
|
||||
|
||||
|
||||
def cross_sectional_standardization(df, features):
|
||||
df_sorted = df.sort_values(by='trade_date') # 按时间排序
|
||||
df_standardized = df_sorted.copy()
|
||||
|
||||
for date in df_sorted['trade_date'].unique():
|
||||
# 获取当前时间点的数据
|
||||
current_data = df_standardized[df_standardized['trade_date'] == date]
|
||||
|
||||
# 只对指定特征进行标准化
|
||||
scaler = StandardScaler()
|
||||
standardized_values = scaler.fit_transform(current_data[features])
|
||||
|
||||
# 将标准化结果重新赋值回去
|
||||
df_standardized.loc[df_standardized['trade_date'] == date, features] = standardized_values
|
||||
|
||||
return df_standardized
|
||||
|
||||
|
||||
def neutralize_manual(df, features, industry_col, mkt_cap_col):
|
||||
""" 手动实现简单回归以提升速度 """
|
||||
|
||||
for col in features:
|
||||
residuals = []
|
||||
for _, group in df.groupby(industry_col):
|
||||
if len(group) > 1:
|
||||
x = np.log(group[mkt_cap_col]) # 市值对数
|
||||
y = group[col] # 因子值
|
||||
beta = np.cov(y, x)[0, 1] / np.var(x) # 计算斜率
|
||||
alpha = np.mean(y) - beta * np.mean(x) # 计算截距
|
||||
resid = y - (alpha + beta * x) # 计算残差
|
||||
residuals.extend(resid)
|
||||
else:
|
||||
residuals.extend(group[col]) # 样本不足时保留原值
|
||||
|
||||
df[col] = residuals
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def mad_filter(df, features, n=3):
|
||||
for col in features:
|
||||
median = df[col].median()
|
||||
mad = np.median(np.abs(df[col] - median))
|
||||
upper = median + n * mad
|
||||
lower = median - n * mad
|
||||
df[col] = np.clip(df[col], lower, upper) # 截断极值
|
||||
return df
|
||||
|
||||
|
||||
def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
|
||||
for col in features:
|
||||
# 按日期分组计算上下百分位数
|
||||
lower_bound = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.quantile(lower_percentile)
|
||||
)
|
||||
upper_bound = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.quantile(upper_percentile)
|
||||
)
|
||||
# 截断超出范围的值
|
||||
df[col] = np.clip(df[col], lower_bound, upper_bound)
|
||||
return df
|
||||
|
||||
|
||||
from scipy.stats import iqr
|
||||
|
||||
|
||||
def iqr_filter(df, features):
|
||||
for col in features:
|
||||
df[col] = df.groupby('trade_date')[col].transform(
|
||||
lambda x: (x - x.median()) / iqr(x) if iqr(x) != 0 else x
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
|
||||
df = df.copy()
|
||||
for col in features:
|
||||
# 计算 rolling 统计量,需要按日期进行 groupby
|
||||
rolling_lower = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.rolling(window=min(len(x), window)).quantile(lower_quantile))
|
||||
rolling_upper = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.rolling(window=min(len(x), window)).quantile(upper_quantile))
|
||||
|
||||
# 对数据进行裁剪
|
||||
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
|
||||
df = df.copy()
|
||||
# 确保按股票和时间排序
|
||||
df = df.sort_values(['ts_code', 'trade_date'])
|
||||
grouped = df.groupby('ts_code')
|
||||
for col in features:
|
||||
# 对每个股票的时间序列计算滚动分位数
|
||||
rolling_lower = grouped[col].rolling(window=window, min_periods=window // 2).quantile(lower_quantile)
|
||||
rolling_upper = grouped[col].rolling(window=window, min_periods=window // 2).quantile(upper_quantile)
|
||||
# rolling结果带有多重索引,需要对齐
|
||||
rolling_lower = rolling_lower.reset_index(level=0, drop=True)
|
||||
rolling_upper = rolling_upper.reset_index(level=0, drop=True)
|
||||
# 应用 clip
|
||||
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
|
||||
return df
|
||||
|
||||
|
||||
def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
|
||||
df = df.copy()
|
||||
grouped = df.groupby('trade_date')
|
||||
for col in features:
|
||||
# 计算每日截面的分位数边界
|
||||
lower_bound = grouped[col].transform(lambda x: x.quantile(lower_quantile))
|
||||
upper_bound = grouped[col].transform(lambda x: x.quantile(upper_quantile))
|
||||
# 应用 clip
|
||||
df[col] = np.clip(df[col], lower_bound, upper_bound)
|
||||
return df
|
||||
@@ -1,154 +1,154 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def read_and_merge_h5_data(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date'], prefix=None):
|
||||
processed_columns = []
|
||||
for col in columns:
|
||||
if col.startswith('_'):
|
||||
processed_columns.append(col[1:]) # 去掉下划线
|
||||
else:
|
||||
processed_columns.append(col)
|
||||
|
||||
# 从 HDF5 文件读取数据,选择需要的列
|
||||
data = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
|
||||
|
||||
# 修改列名,如果列名以前有 _,加上 _
|
||||
for col in data.columns:
|
||||
if col not in columns: # 只有不在 columns 中的列才需要加下划线
|
||||
new_col = f'_{col}'
|
||||
data.rename(columns={col: new_col}, inplace=True)
|
||||
|
||||
if prefix is not None:
|
||||
for col in data.columns:
|
||||
if col not in ['ts_code', 'trade_date']: # 只有不在 columns 中的列才需要加下划线
|
||||
new_col = f'{prefix}_{col}'
|
||||
data.rename(columns={col: new_col}, inplace=True)
|
||||
|
||||
# 如果传入的 df 不为空,则进行合并
|
||||
if df is not None and not df.empty:
|
||||
print(f'{join} merge on {on}')
|
||||
if 'trade_date' in on:
|
||||
# 确保两个 DataFrame 都有 ts_code 和 trade_date 列
|
||||
df['trade_date'] = pd.to_datetime(df['trade_date'], format='%Y%m%d')
|
||||
data['trade_date'] = pd.to_datetime(data['trade_date'], format='%Y%m%d')
|
||||
|
||||
# 根据 ts_code 和 trade_date 合并
|
||||
merged_df = pd.merge(df, data, on=on, how=join)
|
||||
else:
|
||||
# 如果 df 为空,则直接返回读取的数据
|
||||
merged_df = data
|
||||
|
||||
return merged_df
|
||||
|
||||
|
||||
def merge_with_industry_data(df, industry_df):
|
||||
# 确保日期字段是 datetime 类型
|
||||
df['trade_date'] = pd.to_datetime(df['trade_date'])
|
||||
industry_df['in_date'] = pd.to_datetime(industry_df['in_date'])
|
||||
|
||||
# 对 industry_df 按 ts_code 和 in_date 排序
|
||||
industry_df_sorted = industry_df.sort_values(['in_date', 'ts_code'])
|
||||
|
||||
# 对原始 df 按 ts_code 和 trade_date 排序
|
||||
df_sorted = df.sort_values(['trade_date', 'ts_code'])
|
||||
|
||||
# 使用 merge_asof 进行向后合并
|
||||
merged = pd.merge_asof(
|
||||
df_sorted,
|
||||
industry_df_sorted,
|
||||
by='ts_code', # 按 ts_code 分组
|
||||
left_on='trade_date',
|
||||
right_on='in_date',
|
||||
direction='backward'
|
||||
)
|
||||
|
||||
# 获取每个 ts_code 的最早 in_date 记录
|
||||
min_in_date_per_ts = (industry_df_sorted
|
||||
.groupby('ts_code')
|
||||
.first()
|
||||
.reset_index()[['ts_code', 'l2_code']])
|
||||
|
||||
# 填充未匹配到的记录(trade_date 早于所有 in_date 的情况)
|
||||
merged['l2_code'] = merged['l2_code'].fillna(
|
||||
merged['ts_code'].map(min_in_date_per_ts.set_index('ts_code')['l2_code'])
|
||||
)
|
||||
|
||||
# 保留需要的列并重置索引
|
||||
result = merged.reset_index(drop=True)
|
||||
return result
|
||||
|
||||
|
||||
def calculate_risk_adjusted_return(df, days=1, method='ratio', lambda_=0.5, eps=1e-8):
|
||||
"""
|
||||
计算单只股票的风险调整收益。
|
||||
|
||||
参数:
|
||||
- df: DataFrame,包含 'ts_code' 和 'close' 列,按日期排序(假设 'trade_date' 已排序)。
|
||||
- days: 预测未来多少天的收益,默认1天。
|
||||
- method: 'ratio'(收益/波动率) 或 'difference'(收益 - λ * 波动率)。
|
||||
- lambda_: 风险惩罚系数,仅当 method='difference' 时有效。
|
||||
- eps: 防止除零的小常数。
|
||||
|
||||
返回:
|
||||
- df:添加 'risk_adj_return' 列的 DataFrame,表示风险调整后的收益。
|
||||
"""
|
||||
# 确保数据按 ts_code 和 trade_date 排序
|
||||
df = df.sort_values(by=['ts_code', 'trade_date'])
|
||||
|
||||
# 计算未来的对数收益率
|
||||
df['future_return'] = np.log(df.groupby('ts_code')['close'].shift(-days) / df['close'])
|
||||
|
||||
# 计算历史收益(对数收益率)
|
||||
df['historical_return'] = np.log(df.groupby('ts_code')['close'].shift(1) / df['close'])
|
||||
|
||||
# 计算波动率(历史收益的标准差)
|
||||
df['volatility'] = df.groupby('ts_code')['historical_return'].rolling(window=days).std().reset_index(level=0,
|
||||
drop=True)
|
||||
|
||||
# 根据选择的 method 计算风险调整收益
|
||||
if method == 'ratio':
|
||||
# 收益/波动率(防止除零)
|
||||
df['risk_adj_return'] = df['future_return'] / (df['volatility'] + eps)
|
||||
elif method == 'difference':
|
||||
# 收益 - λ * 波动率
|
||||
df['risk_adj_return'] = df['future_return'] - lambda_ * df['volatility']
|
||||
else:
|
||||
raise ValueError("Invalid method. Use 'ratio' or 'difference'.")
|
||||
|
||||
return df
|
||||
|
||||
# import polars as pl
|
||||
#
|
||||
# def read_and_merge_h5_data_polars(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date']):
|
||||
# processed_columns = []
|
||||
# for col in columns:
|
||||
# if col.startswith('_'):
|
||||
# processed_columns.append(col[1:]) # 去掉下划线
|
||||
# else:
|
||||
# processed_columns.append(col)
|
||||
#
|
||||
# # 从 HDF5 文件读取数据,选择需要的列
|
||||
# pd_df = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
|
||||
#
|
||||
# # 将 Pandas DataFrame 转换为 Polars DataFrame
|
||||
# data = pl.from_pandas(pd_df)
|
||||
#
|
||||
# # 修改列名,如果列名以前有 _,加上 _
|
||||
# data = data.rename({col: f'_{col}' for col in data.columns if col not in columns})
|
||||
#
|
||||
# # 如果传入的 df 不为空,则进行合并
|
||||
# if df is not None and not df.is_empty():
|
||||
# print(f'{join} merge on {on}')
|
||||
#
|
||||
# # 确保两个 DataFrame 都有 ts_code 和 trade_date 列
|
||||
# # df = df.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
|
||||
# # data = data.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
|
||||
#
|
||||
# # 根据 ts_code 和 trade_date 合并
|
||||
# merged_df = df.join(data, on=on, how=join)
|
||||
# else:
|
||||
# # 如果 df 为空,则直接返回读取的数据
|
||||
# merged_df = data
|
||||
#
|
||||
# return merged_df
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def read_and_merge_h5_data(h5_filename, key, columns, df=None, join='left', on=None, prefix=None):
    """Read selected columns from an HDF5 store and optionally merge them into *df*.

    Columns listed with a leading underscore in *columns* are read from the
    store under their bare name and then re-prefixed with ``_`` after loading,
    so callers can mark "auxiliary" columns while still selecting them.

    Parameters
    ----------
    h5_filename : str
        Path to the HDF5 file (read via ``pd.read_hdf``).
    key : str
        Key of the dataset inside the HDF5 store.
    columns : list[str]
        Columns to load; a leading ``_`` is stripped before reading and
        restored on the loaded frame.
    df : pd.DataFrame | None
        Existing frame to merge into; when ``None`` or empty, the loaded data
        is returned as-is.
    join : str
        Merge method passed to ``pd.merge`` (``how=``).
    on : list[str] | None
        Merge keys; defaults to ``['ts_code', 'trade_date']``.
        (``None`` sentinel avoids the shared-mutable-default pitfall.)
    prefix : str | None
        Optional prefix applied to every loaded column except the merge keys
        ``ts_code`` / ``trade_date``.

    Returns
    -------
    pd.DataFrame
        The merged frame (or the loaded data when *df* is empty/None).
        The caller's *df* is NOT mutated.
    """
    if on is None:
        on = ['ts_code', 'trade_date']

    # Strip the leading underscore so the physical column name is queried.
    processed_columns = [col[1:] if col.startswith('_') else col for col in columns]

    # Read only the requested columns from the HDF5 store.
    data = pd.read_hdf(h5_filename, key=key, columns=processed_columns)

    # Restore the underscore on columns that were requested with one
    # (i.e. any loaded column whose name is not literally in *columns*).
    data = data.rename(columns={col: f'_{col}' for col in data.columns
                                if col not in columns})

    if prefix is not None:
        # Prefix everything except the merge keys so several stores can be
        # merged side by side without column-name collisions.
        data = data.rename(columns={col: f'{prefix}_{col}' for col in data.columns
                                    if col not in ('ts_code', 'trade_date')})

    # Merge into the caller's frame when one was supplied.
    if df is not None and not df.empty:
        print(f'{join} merge on {on}')
        if 'trade_date' in on:
            # Normalize the key dtype on local copies — do not mutate the
            # caller's frame in place (the original implementation did).
            df = df.assign(trade_date=pd.to_datetime(df['trade_date'], format='%Y%m%d'))
            data = data.assign(trade_date=pd.to_datetime(data['trade_date'], format='%Y%m%d'))

        merged_df = pd.merge(df, data, on=on, how=join)
    else:
        merged_df = data

    return merged_df
|
||||
|
||||
|
||||
def merge_with_industry_data(df, industry_df):
    """Attach the industry classification (``l2_code``) valid at each trade date.

    Uses ``pd.merge_asof`` (backward direction, grouped by ``ts_code``) so each
    row of *df* picks up the most recent industry record whose ``in_date`` is
    on or before its ``trade_date``.  Rows whose ``trade_date`` precedes every
    ``in_date`` for that stock are backfilled with the stock's earliest known
    ``l2_code``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``ts_code`` and ``trade_date`` columns.
    industry_df : pd.DataFrame
        Must contain ``ts_code``, ``in_date`` and ``l2_code`` columns.

    Returns
    -------
    pd.DataFrame
        *df* rows (sorted by trade_date, ts_code) with the industry columns
        appended and the index reset.  Neither input frame is mutated.
    """
    # Normalize date dtypes on local rebindings — the original implementation
    # wrote the conversion back into the caller's frames, which is a side
    # effect callers do not expect.
    df = df.assign(trade_date=pd.to_datetime(df['trade_date']))
    industry_df = industry_df.assign(in_date=pd.to_datetime(industry_df['in_date']))

    # merge_asof requires both sides sorted on their respective time keys.
    industry_df_sorted = industry_df.sort_values(['in_date', 'ts_code'])
    df_sorted = df.sort_values(['trade_date', 'ts_code'])

    # Backward as-of join: latest industry record at or before each trade date.
    merged = pd.merge_asof(
        df_sorted,
        industry_df_sorted,
        by='ts_code',           # match within each stock
        left_on='trade_date',
        right_on='in_date',
        direction='backward'
    )

    # Earliest l2_code per stock (industry_df_sorted is sorted by in_date,
    # so groupby().first() picks the oldest record).
    earliest_l2 = industry_df_sorted.groupby('ts_code').first()['l2_code']

    # Backfill rows whose trade_date predates every in_date for that stock.
    merged['l2_code'] = merged['l2_code'].fillna(merged['ts_code'].map(earliest_l2))

    return merged.reset_index(drop=True)
|
||||
|
||||
|
||||
def calculate_risk_adjusted_return(df, days=1, method='ratio', lambda_=0.5, eps=1e-8):
    """Compute a risk-adjusted forward return per stock.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'ts_code', 'trade_date' and 'close' columns.
    days : int
        Forward horizon for the return, and rolling window for volatility.
        NOTE: with the default days=1 the rolling std of a single value is
        NaN, so 'risk_adj_return' is NaN — preserved original behavior.
    method : str
        'ratio'      -> future_return / (volatility + eps)
        'difference' -> future_return - lambda_ * volatility
    lambda_ : float
        Risk penalty, used only for method='difference'.
    eps : float
        Small constant guarding against division by zero.

    Returns
    -------
    pd.DataFrame
        Sorted by (ts_code, trade_date) with 'future_return',
        'historical_return', 'volatility' and 'risk_adj_return' added.

    Raises
    ------
    ValueError
        If *method* is neither 'ratio' nor 'difference'.
    """
    # Per-stock chronological order is required for the shifts below.
    df = df.sort_values(by=['ts_code', 'trade_date'])

    # Forward log return over the horizon: log(close[t+days] / close[t]).
    df['future_return'] = np.log(df.groupby('ts_code')['close'].shift(-days) / df['close'])

    # Trailing log return — note the sign convention: log(close[t-1] / close[t]).
    df['historical_return'] = np.log(df.groupby('ts_code')['close'].shift(1) / df['close'])

    # Volatility: rolling std of the trailing returns, computed within each
    # stock; transform keeps the result aligned to the original index.
    df['volatility'] = (
        df.groupby('ts_code')['historical_return']
          .transform(lambda series: series.rolling(window=days).std())
    )

    # Risk adjustment according to the requested scheme.
    if method == 'ratio':
        # Return per unit of risk; eps guards against a zero volatility.
        df['risk_adj_return'] = df['future_return'] / (df['volatility'] + eps)
    elif method == 'difference':
        # Return penalized linearly by risk.
        df['risk_adj_return'] = df['future_return'] - lambda_ * df['volatility']
    else:
        raise ValueError("Invalid method. Use 'ratio' or 'difference'.")

    return df
|
||||
|
||||
# import polars as pl
|
||||
#
|
||||
# def read_and_merge_h5_data_polars(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date']):
|
||||
# processed_columns = []
|
||||
# for col in columns:
|
||||
# if col.startswith('_'):
|
||||
# processed_columns.append(col[1:]) # 去掉下划线
|
||||
# else:
|
||||
# processed_columns.append(col)
|
||||
#
|
||||
# # 从 HDF5 文件读取数据,选择需要的列
|
||||
# pd_df = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
|
||||
#
|
||||
# # 将 Pandas DataFrame 转换为 Polars DataFrame
|
||||
# data = pl.from_pandas(pd_df)
|
||||
#
|
||||
# # 修改列名,如果列名以前有 _,加上 _
|
||||
# data = data.rename({col: f'_{col}' for col in data.columns if col not in columns})
|
||||
#
|
||||
# # 如果传入的 df 不为空,则进行合并
|
||||
# if df is not None and not df.is_empty():
|
||||
# print(f'{join} merge on {on}')
|
||||
#
|
||||
# # 确保两个 DataFrame 都有 ts_code 和 trade_date 列
|
||||
# # df = df.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
|
||||
# # data = data.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
|
||||
#
|
||||
# # 根据 ts_code 和 trade_date 合并
|
||||
# merged_df = df.join(data, on=on, how=join)
|
||||
# else:
|
||||
# # 如果 df 为空,则直接返回读取的数据
|
||||
# merged_df = data
|
||||
#
|
||||
# return merged_df
|
||||
|
||||
Reference in New Issue
Block a user