RollingRank赚钱- Sharp-1.43

This commit is contained in:
liaozhaorun
2025-04-28 11:02:52 +08:00
parent 94cd9aa6c8
commit 9e598d4ed0
93 changed files with 18134 additions and 4342 deletions

0
main/utils/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

780
main/utils/factor.py Normal file
View File

@@ -0,0 +1,780 @@
import numpy as np
import talib
import pandas as pd
def get_technical_factor(df):
    """Append per-stock technical factors: rolling return moments, volume /
    turnover signals, ATR, OBV, RSI, lagged returns and volatility ratios.

    Expects tushare-style daily columns (ts_code, trade_date, pct_chg, vol,
    turnover_rate, volume_ratio, open/high/low/close). Boolean factor columns
    are prefixed ``cat_``. Returns the frame sorted by (ts_code, trade_date)
    with the factor columns added.
    """
    # Sort by stock then date so every rolling window runs forward in time.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)
    # Higher moments of the 5-day return distribution.
    df['return_skew'] = grouped['pct_chg'].rolling(window=5).skew().reset_index(0, drop=True)
    df['return_kurtosis'] = grouped['pct_chg'].rolling(window=5).kurt().reset_index(0, drop=True)
    # Factor 1: short-term volume change rate (2-day vs 5-day average volume).
    df['volume_change_rate'] = (
            grouped['vol'].rolling(window=2).mean() /
            grouped['vol'].rolling(window=5).mean() - 1
    ).reset_index(level=0, drop=True)  # drop the group level so the index realigns with df
    # Factor 2: volume breakout vs the trailing 5-day max.
    # NOTE(review): the rolling window includes today, so vol > max can never
    # be True — a shift(1) on the max was probably intended; confirm.
    max_volume = grouped['vol'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_breakout'] = (df['vol'] > max_volume)
    # Factor 3: turnover deviation from its 3-day mean, in stdev units.
    mean_turnover = grouped['turnover_rate'].rolling(window=3).mean().reset_index(level=0, drop=True)
    std_turnover = grouped['turnover_rate'].rolling(window=3).std().reset_index(level=0, drop=True)
    df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover
    # Factor 4: turnover spike beyond mean + 2 stdev.
    df['cat_turnover_spike'] = (df['turnover_rate'] > mean_turnover + 2 * std_turnover)
    # Factor 5: 3-day average volume ratio.
    df['avg_volume_ratio'] = grouped['volume_ratio'].rolling(window=3).mean().reset_index(level=0, drop=True)
    # Factor 6: volume-ratio breakout vs trailing 5-day max (same caveat as factor 2).
    max_volume_ratio = grouped['volume_ratio'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_ratio_breakout'] = (df['volume_ratio'] > max_volume_ratio)
    # Factor 7: blended momentum of volume change and turnover deviation.
    alpha = 0.5
    df['momentum_factor'] = df['volume_change_rate'] + alpha * df['turnover_deviation']
    # Factor 8: volume/price resonance.
    df['price_change_rate'] = grouped['close'].pct_change()
    df['resonance_factor'] = df['volume_ratio'] * df['price_change_rate']
    # Log price and a volume-spike flag vs twice the 20-day average volume.
    df['log_close'] = np.log(df['close'])
    df['vol_spike'] = grouped.apply(
        lambda x: pd.Series(x['vol'].rolling(20).mean(), index=x.index)
    )
    df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']
    # NOTE(review): computed on the un-grouped frame — windows that straddle a
    # stock boundary mix two stocks; confirm intended.
    df['vol_std_5'] = df['vol'].pct_change().rolling(5).std()
    # Upper/lower candle shadow lengths relative to close.
    df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']
    df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']
    # Average true range at 14- and 6-day horizons, per stock.
    df['atr_14'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=14),
                            index=x.index)
    )
    df['atr_6'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=6),
                            index=x.index)
    )
    # On-balance volume and its 6-day SMA.
    df['obv'] = grouped.apply(
        lambda x: pd.Series(talib.OBV(x['close'].values, x['vol'].values), index=x.index)
    )
    df['maobv_6'] = grouped.apply(
        lambda x: pd.Series(talib.SMA(x['obv'].values, timeperiod=6), index=x.index)
    )
    df['obv-maobv_6'] = df['obv'] - df['maobv_6']
    # RSI at 3/6/9-day horizons.
    df['rsi_3'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=3), index=x.index)
    )
    df['rsi_6'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=6), index=x.index)
    )
    df['rsi_9'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=9), index=x.index)
    )
    # Trailing 5/10/20-day returns.
    df['return_5'] = grouped['close'].apply(lambda x: x / x.shift(5) - 1)
    df['return_10'] = grouped['close'].apply(lambda x: x / x.shift(10) - 1)
    df['return_20'] = grouped['close'].apply(lambda x: x / x.shift(20) - 1)
    # df['avg_close_5'] = grouped['close'].apply(lambda x: x.rolling(window=5).mean() / x)
    # Return volatility (std of daily pct change) over several horizons.
    df['std_return_5'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=5).std())
    df['std_return_15'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=15).std())
    df['std_return_25'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=25).std())
    df['std_return_90'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=90).std())
    # Same 90-day volatility, measured on prices lagged 10 days.
    df['std_return_90_2'] = grouped['close'].apply(lambda x: x.shift(10).pct_change().rolling(window=90).std())
    # Volatility term-structure ratios.
    df['std_return_5 / std_return_90'] = df['std_return_5'] / df['std_return_90']
    df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']
    # Change in 90-day volatility over the last 10 days.
    df['std_return_90 - std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']
    return df
def get_act_factor(df, cat=True):
    """Append EMA-slope ("act") factors per stock.

    act_factor1..4 map the one-day percentage slope of the 5/13/20/60-day
    EMAs through arctan into degrees (×57.3 ≈ 180/π) and divide by a
    per-horizon scale (50/40/21/10). With ``cat=True`` boolean ordering flags
    cat_af1..4 are added as well. act_factor5/6 are composites, and
    rank_act_factor1..3 are descending cross-sectional percentile ranks
    computed per trade_date.
    """
    # Sort so EMAs and shifts run forward in time within each stock.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)
    # EMA levels; the leading underscore marks throwaway intermediates
    # (dropped later by get_simple_factor).
    df['_ema_5'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=5), index=x.index)
    )
    df['_ema_13'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=13), index=x.index)
    )
    df['_ema_20'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=20), index=x.index)
    )
    df['_ema_60'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=60), index=x.index)
    )
    # arctan of the daily EMA % change, converted to degrees and normalised.
    df['act_factor1'] = grouped['_ema_5'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 50
    )
    df['act_factor2'] = grouped['_ema_13'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 40
    )
    df['act_factor3'] = grouped['_ema_20'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 21
    )
    df['act_factor4'] = grouped['_ema_60'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 10
    )
    if cat:
        # Regime flags: sign of the short slope, and the ordering of
        # successive horizons.
        df['cat_af1'] = df['act_factor1'] > 0
        df['cat_af2'] = df['act_factor2'] > df['act_factor1']
        df['cat_af3'] = df['act_factor3'] > df['act_factor2']
        df['cat_af4'] = df['act_factor4'] > df['act_factor3']
    # Composite slope factors: sum, and normalised short-vs-medium spread.
    df['act_factor5'] = df['act_factor1'] + df['act_factor2'] + df['act_factor3'] + df['act_factor4']
    df['act_factor6'] = (df['act_factor1'] - df['act_factor2']) / np.sqrt(
        df['act_factor1'] ** 2 + df['act_factor2'] ** 2)
    # Cross-sectional percentile ranks per trade_date (descending: best = smallest pct).
    df['rank_act_factor1'] = df.groupby('trade_date', group_keys=False)['act_factor1'].rank(ascending=False, pct=True)
    df['rank_act_factor2'] = df.groupby('trade_date', group_keys=False)['act_factor2'].rank(ascending=False, pct=True)
    df['rank_act_factor3'] = df.groupby('trade_date', group_keys=False)['act_factor3'].rank(ascending=False, pct=True)
    return df
def get_money_flow_factor(df):
    """Derive money-flow factors (field names follow the tushare moneyflow spec).

    Each ratio scales a buy/sell volume bucket by the net money-flow volume;
    ``log(circ_mv)`` is the natural log of circulating market cap.
    """
    net = df['net_mf_vol']
    df['active_buy_volume_large'] = df['buy_lg_vol'] / net
    df['active_buy_volume_big'] = df['buy_elg_vol'] / net
    df['active_buy_volume_small'] = df['buy_sm_vol'] / net
    # Net large / extra-large order flow relative to total net flow.
    df['buy_lg_vol_minus_sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / net
    df['buy_elg_vol_minus_sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / net
    df['log(circ_mv)'] = np.log(df['circ_mv'])
    return df
def get_alpha_factor(df):
df = df.sort_values(by=['ts_code', 'trade_date'])
grouped = df.groupby('ts_code')
# alpha_022: 当前 close 与 5 日前 close 差值
# df['alpha_022'] = grouped['close'].transform(lambda x: x - x.shift(5))
def rolling_covariance(x, y, window):
return x.rolling(window).cov(y)
def delta(series, period):
return series.diff(period)
def rank(series):
return series.rank(pct=True)
def stddev(series, window):
return series.rolling(window).std()
# 计算改进后的 Alpha 22 因子
window_high_volume = 5
window_close_stddev = 20
period_delta = 5
df['cov'] = rolling_covariance(df['high'], df['volume'], window_high_volume)
df['delta_cov'] = delta(df['cov'], period_delta)
df['_rank_stddev'] = rank(stddev(df['close'], window_close_stddev))
df['alpha_22_improved'] = -1 * df['delta_cov'] * df['_rank_stddev']
# alpha_003: (close - open) / (high - low)
df['alpha_003'] = np.where(df['high'] != df['low'],
(df['close'] - df['open']) / (df['high'] - df['low']),
0)
# alpha_007: 计算过去5日 close 与 vol 的相关性,并按 trade_date 排名
df['alpha_007'] = grouped.apply(lambda x: x['close'].rolling(5).corr(x['vol'])).reset_index(level=0, drop=True)
df['alpha_007'] = df.groupby('trade_date', group_keys=False)['alpha_007'].rank(ascending=True, pct=True)
# alpha_013: 计算过去5日 close 之和 - 20日 close 之和,并按 trade_date 排名
df['alpha_013'] = grouped['close'].transform(lambda x: x.rolling(5).sum() - x.rolling(20).sum())
df['alpha_013'] = df.groupby('trade_date', group_keys=False)['alpha_013'].rank(ascending=True, pct=True)
return df
def get_limit_factor(df):
    """Compute limit-up / limit-down factors per stock.

    Adds:
      cat_up_limit / cat_down_limit: 1 if the close hit the limit price, else 0.
      up_limit_count_10d / down_limit_count_10d: rolling 10-trading-day hit
        counts (the original comment claimed 20 days; the code uses 10).
      consecutive_up_limit: length of the current run of limit-up days.
    """
    # Sort by stock then date so rolling windows and run-length logic see
    # time order within each stock.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)
    # 1. Did today's close hit the limit price? (1 = yes, 0 = no)
    df['cat_up_limit'] = (df['close'] == df['up_limit']).astype(int)
    df['cat_down_limit'] = (df['close'] == df['down_limit']).astype(int)
    # 2. Number of limit hits over the past 10 trading days.
    df['up_limit_count_10d'] = grouped['cat_up_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0,
                                                                                                           drop=True)
    df['down_limit_count_10d'] = grouped['cat_down_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0,
                                                                                                               drop=True)

    # 3. Current consecutive streak length of a 0/1 flag series.
    # (The original computed the same expression twice and returned a
    # duplicate tuple; it is deduplicated here with identical results.)
    def calculate_consecutive_limits(series):
        """Running length of the current run; zero wherever the flag is 0."""
        run_id = (series != series.shift()).cumsum()
        return series * (series.groupby(run_id).cumcount() + 1)

    df['consecutive_up_limit'] = grouped['cat_up_limit'].apply(
        calculate_consecutive_limits
    ).reset_index(level=0, drop=True)
    return df
def get_cyp_perf_factor(df):
    """Append chip-distribution ("cyq_perf") factors per stock.

    Relies on tushare cyq_perf columns (cost_5pct..cost_95pct, weight_avg,
    winner_rate, his_high/his_low) plus daily bar columns.
    NOTE(review): the cost_* percentile semantics are assumed from the
    tushare docs — confirm against the data source.
    """
    # Sort by stock then date so rolling stats run forward in time.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)
    # Chip concentration: width of the 15%-85% cost band vs historical range.
    df['ctrl_strength'] = (df['cost_85pct'] - df['cost_15pct']) / (df['his_high'] - df['his_low'])
    # Position of price between the 5% and 50% cost levels.
    df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / (df['cost_50pct'] - df['cost_5pct'])
    # Skew of the cost distribution around its median.
    df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / (df['cost_50pct'] - df['cost_5pct'])
    # Turnover damped by how wide the cost band is.
    df['lock_factor'] = df['turnover_rate'] * (
            1 - (df['cost_95pct'] - df['cost_5pct']) / (df['his_high'] - df['his_low']))
    # Breakout above the 85% cost level on heavy relative volume (0/1 flag).
    df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)
    # 5-day rate of change of the weighted-average cost.
    df['weight_roc5'] = grouped['weight_avg'].apply(lambda x: x.pct_change(5))

    def rolling_corr(group):
        # 10-day correlation between price returns and avg-cost returns.
        roc_close = group['close'].pct_change()
        roc_weight = group['weight_avg'].pct_change()
        return roc_close.rolling(10).corr(roc_weight)

    df['price_cost_divergence'] = grouped.apply(rolling_corr)

    def calc_atr(group):
        # Hand-rolled 14-day ATR: simple mean of the true range.
        high, low, close = group['high'], group['low'], group['close']
        tr = np.maximum(high - low,
                        np.maximum(abs(high - close.shift()),
                                   abs(low - close.shift())))
        return tr.rolling(14).mean()

    df['atr_14'] = grouped.apply(calc_atr)
    # Cost band width measured in ATR units.
    df['cost_atr_adj'] = (df['cost_95pct'] - df['cost_5pct']) / df['atr_14']
    # 12. Chip concentration scaled by inverse float market cap.
    df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])
    # 16. Chip stability: 20-day coefficient of variation of the avg cost.
    df['weight_std20'] = grouped['weight_avg'].apply(lambda x: x.rolling(20).std())
    df['cost_stability'] = df['weight_std20'] / grouped['weight_avg'].transform(lambda x: x.rolling(20).mean())
    # 17. Days in the last 5 the close held above the 95% cost level.
    df['high_cost_break_days'] = grouped.apply(lambda g: g['close'].gt(g['cost_95pct']).rolling(5).sum())
    # 18. "Golden resonance": price above avg cost, volume surge, mostly winners.
    df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &
                                  (df['volume_ratio'] > 1.5) &
                                  (df['winner_rate'] > 0.7))
    # 20. Cost band width per unit of 10-day average volume.
    df['liquidity_risk'] = (df['cost_95pct'] - df['cost_5pct']) * (
            1 / grouped['vol'].transform(lambda x: x.rolling(10).mean()))
    # Drop the throwaway intermediate.
    df.drop(columns=['weight_std20'], inplace=True, errors='ignore')
    return df
def get_mv_factors(df):
    """Compute market-cap-scaled liquidity factors.

    Parameters:
        df (pd.DataFrame): needs ts_code, trade_date, turnover_rate, circ_mv,
            volume_ratio and vol columns.

    Returns:
        pd.DataFrame: the input frame with the mv_* factor columns added.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)
    # 1. Turnover per unit of float market cap.
    df['mv_turnover_ratio'] = df['turnover_rate'] / df['circ_mv']
    # 2. Volume per unit of float market cap.
    df['mv_adjusted_volume'] = df['vol'] / df['circ_mv']
    # 3. Numerically identical to factor 1, kept under a separate name.
    df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / df['circ_mv'])
    # 4. NOTE(review): identical to mv_adjusted_volume — no nonlinearity is
    #    actually applied despite the name; confirm intent.
    df['nonlinear_mv_volume'] = df['vol'] / df['circ_mv']
    # 5. Volume ratio per unit of float market cap.
    df['mv_volume_ratio'] = df['volume_ratio'] / df['circ_mv']
    # 6. Turnover x volume-ratio momentum, cap-scaled.
    df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / df['circ_mv']
    # 7. 20-day turnover volatility, cap-scaled.
    df['turnover_std'] = grouped['turnover_rate'].rolling(window=20).std().reset_index(level=0, drop=True)
    df['mv_volatility'] = grouped.apply(lambda x: x['turnover_std'] / x['circ_mv']).reset_index(level=0, drop=True)
    # 8. 20-day volume growth, cap-scaled.
    df['volume_growth'] = grouped['vol'].pct_change(periods=20).reset_index(level=0, drop=True)
    df['mv_growth'] = grouped.apply(lambda x: x['volume_growth'] / x['circ_mv']).reset_index(level=0, drop=True)
    # # Standardize and blend into one composite factor (disabled):
    # factor_columns = [
    #     'mv_turnover_ratio', 'mv_adjusted_volume', 'mv_weighted_turnover',
    #     'nonlinear_mv_volume', 'mv_volume_ratio', 'mv_momentum',
    #     'mv_volatility', 'mv_growth'
    # ]
    # scaler = StandardScaler()
    # df[factor_columns] = scaler.fit_transform(df[factor_columns])
    #
    # # weighted composite of the factors above
    # weights = [0.2, 0.15, 0.15, 0.1, 0.1, 0.1, 0.1, 0.1]
    # df['final_combined_factor'] = df[factor_columns].dot(weights)
    return df
import numpy as np
import talib
def get_rolling_factor(df):
    """Compute all rolling / grouped factors in one pass per stock.

    This is a consolidated superset of get_technical_factor, get_act_factor,
    get_alpha_factor, get_limit_factor, get_cyp_perf_factor and
    get_mv_factors, restricted to window-based computations; the purely
    element-wise factors live in get_simple_factor.

    Returns (df, new_column_names) where new_column_names lists the columns
    added by this call.
    """
    old_columns = df.columns.tolist()[:]
    # Sort so every rolling window runs forward in time within each stock.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)
    # Overnight gap to tomorrow's open. NOTE(review): shift(-1) is applied to
    # the un-grouped frame, so the last row of each stock reads the next
    # stock's open — confirm intended.
    df["gap_next_open"] = (df["open"].shift(-1) - df["close"]) / df["close"]
    # Higher moments of the 5-day return distribution.
    df['return_skew'] = grouped['pct_chg'].rolling(window=5).skew().reset_index(0, drop=True)
    df['return_kurtosis'] = grouped['pct_chg'].rolling(window=5).kurt().reset_index(0, drop=True)
    # Short-term volume change rate: 2-day vs 10-day average volume.
    # (The get_technical_factor variant uses a 5-day denominator.)
    df['volume_change_rate'] = (
            grouped['vol'].rolling(window=2).mean() /
            grouped['vol'].rolling(window=10).mean() - 1
    ).reset_index(level=0, drop=True)  # drop the group level so the index realigns
    # Volume breakout vs trailing 5-day max. NOTE(review): the window includes
    # today, so the flag can never be True — a shift(1) was probably intended.
    max_volume = grouped['vol'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_breakout'] = (df['vol'] > max_volume)
    # Turnover deviation from its 3-day mean, in stdev units.
    mean_turnover = grouped['turnover_rate'].rolling(window=3).mean().reset_index(level=0, drop=True)
    std_turnover = grouped['turnover_rate'].rolling(window=3).std().reset_index(level=0, drop=True)
    df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover
    # Turnover spike beyond mean + 2 stdev.
    df['cat_turnover_spike'] = (df['turnover_rate'] > mean_turnover + 2 * std_turnover)
    # 3-day average volume ratio.
    df['avg_volume_ratio'] = grouped['volume_ratio'].rolling(window=3).mean().reset_index(level=0, drop=True)
    # Volume-ratio breakout vs trailing 5-day max (same caveat as above).
    max_volume_ratio = grouped['volume_ratio'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_ratio_breakout'] = (df['volume_ratio'] > max_volume_ratio)
    # 20-day average volume (spike reference used by get_simple_factor).
    df['vol_spike'] = grouped.apply(
        lambda x: pd.Series(x['vol'].rolling(20).mean(), index=x.index)
    )
    # NOTE(review): un-grouped — windows straddling a stock boundary mix stocks.
    df['vol_std_5'] = df['vol'].pct_change().rolling(5).std()
    # Average true range at 14/6-day horizons.
    df['atr_14'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=14),
                            index=x.index)
    )
    df['atr_6'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=6),
                            index=x.index)
    )
    # On-balance volume and its 6-day SMA.
    df['obv'] = grouped.apply(
        lambda x: pd.Series(talib.OBV(x['close'].values, x['vol'].values), index=x.index)
    )
    df['maobv_6'] = grouped.apply(
        lambda x: pd.Series(talib.SMA(x['obv'].values, timeperiod=6), index=x.index)
    )
    # RSI at 3/6/9-day horizons.
    df['rsi_3'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=3), index=x.index)
    )
    df['rsi_6'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=6), index=x.index)
    )
    df['rsi_9'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=9), index=x.index)
    )
    # Trailing 5/10/20-day returns.
    df['return_5'] = grouped['close'].apply(lambda x: x / x.shift(5) - 1)
    df['return_10'] = grouped['close'].apply(lambda x: x / x.shift(10) - 1)
    df['return_20'] = grouped['close'].apply(lambda x: x / x.shift(20) - 1)
    # df['avg_close_5'] = grouped['close'].apply(lambda x: x.rolling(window=5).mean() / x)
    # Return volatility (std of daily pct change) over several horizons.
    df['std_return_5'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=5).std())
    df['std_return_15'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=15).std())
    df['std_return_25'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=25).std())
    df['std_return_90'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=90).std())
    # Same 90-day volatility measured on prices lagged 10 days.
    df['std_return_90_2'] = grouped['close'].apply(lambda x: x.shift(10).pct_change().rolling(window=90).std())
    # EMA levels; underscore-prefixed intermediates are dropped downstream.
    df['_ema_5'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=5), index=x.index)
    )
    df['_ema_13'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=13), index=x.index)
    )
    df['_ema_20'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=20), index=x.index)
    )
    df['_ema_60'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=60), index=x.index)
    )
    # EMA slope factors: arctan of the daily EMA % change in degrees,
    # normalised per horizon (see get_act_factor).
    df['act_factor1'] = grouped['_ema_5'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 50
    )
    df['act_factor2'] = grouped['_ema_13'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 40
    )
    df['act_factor3'] = grouped['_ema_20'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 21
    )
    df['act_factor4'] = grouped['_ema_60'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 10
    )
    # Cross-sectional percentile ranks per trade_date (descending).
    df['rank_act_factor1'] = df.groupby('trade_date', group_keys=False)['act_factor1'].rank(ascending=False, pct=True)
    df['rank_act_factor2'] = df.groupby('trade_date', group_keys=False)['act_factor2'].rank(ascending=False, pct=True)
    df['rank_act_factor3'] = df.groupby('trade_date', group_keys=False)['act_factor3'].rank(ascending=False, pct=True)
    df['log(circ_mv)'] = np.log(df['circ_mv'])

    # Alpha operator helpers (WorldQuant vocabulary).
    def rolling_covariance(x, y, window):
        return x.rolling(window).cov(y)

    def delta(series, period):
        return series.diff(period)

    def rank(series):
        return series.rank(pct=True)

    def stddev(series, window):
        return series.rolling(window).std()

    # Improved alpha #22: -delta(cov(high, vol, 5), 5) * rank(std(close, 20)).
    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5
    df['cov'] = rolling_covariance(df['high'], df['vol'], window_high_volume)
    df['delta_cov'] = delta(df['cov'], period_delta)
    df['_rank_stddev'] = rank(stddev(df['close'], window_close_stddev))
    df['alpha_22_improved'] = -1 * df['delta_cov'] * df['_rank_stddev']
    # alpha_003: candle body relative to the day's range; 0 when high == low.
    df['alpha_003'] = np.where(df['high'] != df['low'],
                               (df['close'] - df['open']) / (df['high'] - df['low']),
                               0)
    # alpha_007: 5-day close/vol correlation, ranked cross-sectionally per day.
    df['alpha_007'] = grouped.apply(lambda x: x['close'].rolling(5).corr(x['vol'])).reset_index(level=0, drop=True)
    df['alpha_007'] = df.groupby('trade_date', group_keys=False)['alpha_007'].rank(ascending=True, pct=True)
    # alpha_013: 5-day close sum minus 20-day close sum, ranked per day.
    df['alpha_013'] = grouped['close'].transform(lambda x: x.rolling(5).sum() - x.rolling(20).sum())
    df['alpha_013'] = df.groupby('trade_date', group_keys=False)['alpha_013'].rank(ascending=True, pct=True)
    # Limit hit flags (boolean here, unlike get_limit_factor's ints).
    df['cat_up_limit'] = (df['close'] == df['up_limit'])
    df['cat_down_limit'] = (df['close'] == df['down_limit'])
    # Limit hit counts over the past 10 trading days.
    df['up_limit_count_10d'] = grouped['cat_up_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0,
                                                                                                           drop=True)
    df['down_limit_count_10d'] = grouped['cat_down_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0,
                                                                                                               drop=True)

    # Current consecutive limit-up streak length.
    def calculate_consecutive_limits(series):
        """
        Running length of the current run of a 0/1 flag series; zero where 0.
        """
        consecutive_up = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)
        consecutive_down = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)
        return consecutive_up, consecutive_down

    df['consecutive_up_limit'] = grouped['cat_up_limit'].apply(
        lambda x: calculate_consecutive_limits(x)[0]
    ).reset_index(level=0, drop=True)
    # Breakout above the 85% chip cost level on heavy relative volume.
    df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)
    # 5-day rate of change of the weighted-average chip cost.
    df['weight_roc5'] = grouped['weight_avg'].apply(lambda x: x.pct_change(5))

    def rolling_corr(group):
        # 10-day correlation between price returns and avg-cost returns.
        roc_close = group['close'].pct_change()
        roc_weight = group['weight_avg'].pct_change()
        return roc_close.rolling(10).corr(roc_weight)

    df['price_cost_divergence'] = grouped.apply(rolling_corr)
    # Chip concentration scaled by inverse float market cap.
    df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])
    # 16. Chip stability: 20-day coefficient of variation of the avg cost.
    df['weight_std20'] = grouped['weight_avg'].apply(lambda x: x.rolling(20).std())
    df['cost_stability'] = df['weight_std20'] / grouped['weight_avg'].transform(lambda x: x.rolling(20).mean())
    # 17. Days in the last 5 the close held above the 95% cost level.
    df['high_cost_break_days'] = grouped.apply(lambda g: g['close'].gt(g['cost_95pct']).rolling(5).sum())
    # 20. Cost band width per unit of 10-day average volume.
    df['liquidity_risk'] = (df['cost_95pct'] - df['cost_5pct']) * (
            1 / grouped['vol'].transform(lambda x: x.rolling(10).mean()))
    # 7. 20-day turnover volatility, cap-scaled.
    df['turnover_std'] = grouped['turnover_rate'].rolling(window=20).std().reset_index(level=0, drop=True)
    df['mv_volatility'] = grouped.apply(lambda x: x['turnover_std'] / x['circ_mv']).reset_index(level=0, drop=True)
    # 8. 20-day volume growth, cap-scaled.
    df['volume_growth'] = grouped['vol'].pct_change(periods=20).reset_index(level=0, drop=True)
    df['mv_growth'] = grouped.apply(lambda x: x['volume_growth'] / x['circ_mv']).reset_index(level=0, drop=True)
    # Drop the throwaway intermediate and report what was added.
    df.drop(columns=['weight_std20'], inplace=True, errors='ignore')
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
    return df, new_columns
def get_simple_factor(df):
    """Derive cheap element-wise factors from columns produced upstream by
    get_rolling_factor (plus raw tushare daily / moneyflow / cyq columns).

    No rolling windows — every line is row-wise arithmetic. Drops all
    underscore-prefixed intermediates and returns (df, new_column_names).
    """
    old_columns = df.columns.tolist()[:]
    df = df.sort_values(by=['ts_code', 'trade_date'])
    # Blend of volume change and turnover deviation.
    alpha = 0.5
    df['momentum_factor'] = df['volume_change_rate'] + alpha * df['turnover_deviation']
    # Volume/price resonance on the daily return.
    df['resonance_factor'] = df['volume_ratio'] * df['pct_chg']
    df['log_close'] = np.log(df['close'])
    # Volume spike vs twice the 20-day average (vol_spike from upstream).
    df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']
    # Upper/lower candle shadow lengths relative to close.
    df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']
    df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']
    # OBV distance from its 6-day SMA.
    df['obv-maobv_6'] = df['obv'] - df['maobv_6']
    # Volatility term-structure ratios.
    df['std_return_5 / std_return_90'] = df['std_return_5'] / df['std_return_90']
    df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']
    # Change in 90-day volatility over the last 10 days.
    df['std_return_90 - std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']
    # EMA-slope regime flags (see get_act_factor).
    df['cat_af1'] = df['act_factor1'] > 0
    df['cat_af2'] = df['act_factor2'] > df['act_factor1']
    df['cat_af3'] = df['act_factor3'] > df['act_factor2']
    df['cat_af4'] = df['act_factor4'] > df['act_factor3']
    # Composite slope factors: sum, and normalised short-vs-medium spread.
    df['act_factor5'] = df['act_factor1'] + df['act_factor2'] + df['act_factor3'] + df['act_factor4']
    df['act_factor6'] = (df['act_factor1'] - df['act_factor2']) / np.sqrt(
        df['act_factor1'] ** 2 + df['act_factor2'] ** 2)
    # Money-flow buckets relative to total net money flow.
    df['active_buy_volume_large'] = df['buy_lg_vol'] / df['net_mf_vol']
    df['active_buy_volume_big'] = df['buy_elg_vol'] / df['net_mf_vol']
    df['active_buy_volume_small'] = df['buy_sm_vol'] / df['net_mf_vol']
    df['buy_lg_vol_minus_sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / df['net_mf_vol']
    df['buy_elg_vol_minus_sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / df['net_mf_vol']
    df['log(circ_mv)'] = np.log(df['circ_mv'])
    # Chip-distribution (cyq) element-wise factors.
    df['ctrl_strength'] = (df['cost_85pct'] - df['cost_15pct']) / (df['his_high'] - df['his_low'])
    df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / (df['cost_50pct'] - df['cost_5pct'])
    df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / (df['cost_50pct'] - df['cost_5pct'])
    df['lock_factor'] = df['turnover_rate'] * (
            1 - (df['cost_95pct'] - df['cost_5pct']) / (df['his_high'] - df['his_low']))
    # Boolean variant of the vol_break breakout flag.
    df['cat_vol_break'] = (df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2)
    # Cost band width measured in ATR units.
    df['cost_atr_adj'] = (df['cost_95pct'] - df['cost_5pct']) / df['atr_14']
    # 12. Chip concentration scaled by inverse float market cap.
    df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])
    # "Golden resonance": price above avg cost, volume surge, mostly winners.
    df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &
                                  (df['volume_ratio'] > 1.5) &
                                  (df['winner_rate'] > 0.7))
    # Market-cap-scaled liquidity factors (see get_mv_factors).
    df['mv_turnover_ratio'] = df['turnover_rate'] / df['circ_mv']
    df['mv_adjusted_volume'] = df['vol'] / df['circ_mv']
    df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / df['circ_mv'])
    df['nonlinear_mv_volume'] = df['vol'] / df['circ_mv']
    df['mv_volume_ratio'] = df['volume_ratio'] / df['circ_mv']
    df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / df['circ_mv']
    # Drop underscore-prefixed intermediates and report what was added.
    drop_columns = [col for col in df.columns if col.startswith('_')]
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
    return df, new_columns
def calculate_indicators(df):
    """Compute technical indicators on a single-code daily frame.

    Adds the daily return, a 14-day RSI, MACD (12/26/9) with signal line and
    histogram, and four sentiment factors: 20-day up ratio, volume change
    rate, return volatility and amount change rate.
    """
    df = df.sort_values('trade_date')
    # Daily percentage return versus the previous close.
    df['daily_return'] = (df['close'] - df['pre_close']) / df['pre_close'] * 100
    # 14-day RSI from average gains / average losses.
    price_diff = df['close'].diff()
    gains = price_diff.where(price_diff > 0, 0)
    losses = -price_diff.where(price_diff < 0, 0)
    rs = gains.rolling(window=14).mean() / losses.rolling(window=14).mean()
    df['RSI'] = 100 - (100 / (1 + rs))
    # MACD: fast EMA minus slow EMA, signal line, histogram.
    fast_ema = df['close'].ewm(span=12, adjust=False).mean()
    slow_ema = df['close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = fast_ema - slow_ema
    df['Signal_line'] = df['MACD'].ewm(span=9, adjust=False).mean()
    df['MACD_hist'] = df['MACD'] - df['Signal_line']
    # Sentiment 1: share of up days over the past 20 sessions.
    df['up_ratio'] = df['daily_return'].apply(lambda x: 1 if x > 0 else 0)
    df['up_ratio_20d'] = df['up_ratio'].rolling(window=20).mean()
    # Sentiment 2: volume deviation from its 20-day mean, in percent.
    df['volume_mean'] = df['vol'].rolling(window=20).mean()
    df['volume_change_rate'] = (df['vol'] - df['volume_mean']) / df['volume_mean'] * 100
    # Sentiment 3: 20-day standard deviation of daily returns.
    df['volatility'] = df['daily_return'].rolling(window=20).std()
    # Sentiment 4: amount deviation from its 20-day mean, in percent.
    df['amount_mean'] = df['amount'].rolling(window=20).mean()
    df['amount_change_rate'] = (df['amount'] - df['amount_mean']) / df['amount_mean'] * 100
    return df
def generate_index_indicators(h5_filename):
    """Build a wide per-date indicator table for every index in an HDF store.

    Reads key 'index_data', runs calculate_indicators() for each ts_code, then
    pivots so each (ts_code, indicator) pair becomes a '<code>_<name>' column,
    one row per trade_date.
    """
    raw = pd.read_hdf(h5_filename, key='index_data')
    raw['trade_date'] = pd.to_datetime(raw['trade_date'], format='%Y%m%d')
    raw = raw.sort_values('trade_date')
    # Indicator computation is per index code.
    per_code = [calculate_indicators(raw[raw['ts_code'] == code].copy())
                for code in raw['ts_code'].unique()]
    combined = pd.concat(per_code, ignore_index=True)
    # Keep trade_date as the row key; spread (indicator, ts_code) into columns.
    wide = combined.pivot_table(
        index='trade_date',
        columns='ts_code',
        values=['daily_return', 'RSI', 'MACD', 'Signal_line',
                'MACD_hist', 'up_ratio_20d', 'volume_change_rate', 'volatility',
                'amount_change_rate', 'amount_mean'],
        aggfunc='last'
    )
    # Flatten the (indicator, ts_code) MultiIndex into '<code>_<indicator>'.
    wide.columns = [f"{col[1]}_{col[0]}" for col in wide.columns]
    return wide.reset_index()
def read_industry_data(h5_filename):
    """Load Shenwan industry daily bars from an HDF store and derive
    industry-level factors (OBV, 5/20-day returns, EMA-slope act factors and
    daily cross-sectional return percentiles).

    Output factor columns are prefixed 'industry_' and ts_code is renamed to
    cat_l2_code so the frame can be merged onto stocks by industry code.
    """
    # Only the columns needed downstream are loaded from key 'sw_daily'.
    industry_data = pd.read_hdf(h5_filename, key='sw_daily', columns=[
        'ts_code', 'trade_date', 'open', 'close', 'high', 'low', 'pe', 'pb', 'vol'
    ])
    industry_data = industry_data.sort_values(by=['ts_code', 'trade_date'])
    # NOTE(review): reindex() with no arguments is a no-op — was
    # reset_index(drop=True) intended here? Confirm.
    industry_data = industry_data.reindex()
    industry_data['trade_date'] = pd.to_datetime(industry_data['trade_date'], format='%Y%m%d')
    grouped = industry_data.groupby('ts_code', group_keys=False)
    # On-balance volume per industry index.
    industry_data['obv'] = grouped.apply(
        lambda x: pd.Series(talib.OBV(x['close'].values, x['vol'].values), index=x.index)
    )
    # Trailing 5- and 20-day returns.
    industry_data['return_5'] = grouped['close'].apply(lambda x: x / x.shift(5) - 1)
    industry_data['return_20'] = grouped['close'].apply(lambda x: x / x.shift(20) - 1)
    # EMA-slope factors without the boolean cat_ flags.
    industry_data = get_act_factor(industry_data, cat=False)
    industry_data = industry_data.sort_values(by=['trade_date', 'ts_code'])
    # # Per-day deviation of each industry's factor from the daily mean (disabled):
    # factor_columns = ['obv', 'return_5', 'return_20', 'act_factor1', 'act_factor2', 'act_factor3', 'act_factor4']
    #
    # for factor in factor_columns:
    #     if factor in industry_data.columns:
    #         industry_data[f'{factor}_deviation'] = industry_data.groupby('trade_date')[factor].transform(
    #             lambda x: x - x.mean())
    # Daily cross-sectional percentile rank of industry returns.
    industry_data['return_5_percentile'] = industry_data.groupby('trade_date')['return_5'].transform(
        lambda x: x.rank(pct=True))
    industry_data['return_20_percentile'] = industry_data.groupby('trade_date')['return_20'].transform(
        lambda x: x.rank(pct=True))
    # Drop raw price columns; prefix the rest and rename the merge key.
    industry_data = industry_data.drop(columns=['open', 'close', 'high', 'low', 'pe', 'pb', 'vol'])
    industry_data = industry_data.rename(
        columns={col: f'industry_{col}' for col in industry_data.columns if col not in ['ts_code', 'trade_date']})
    industry_data = industry_data.rename(columns={'ts_code': 'cat_l2_code'})
    return industry_data

View File

@@ -0,0 +1,233 @@
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.preprocessing import StandardScaler
def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05, wasserstein_threshold=0.1, size=0.8,
                            log=True, val_data=None):
    """Drop features whose distribution shifts between train and validation.

    When ``val_data`` is None the frame is split chronologically at the
    ``size`` fraction of unique trade dates; otherwise the two frames are
    compared directly. A feature is flagged when the two-sample KS test
    p-value falls below ``ks_threshold``. (``wasserstein_threshold`` is kept
    only for interface compatibility; the Wasserstein check is disabled.)

    Returns (kept_features, dropped_features).
    """
    if val_data is None:
        # Chronological split on unique trade dates.
        all_days = sorted(train_data['trade_date'].unique().tolist())
        cutoff = all_days[int(len(all_days) * size)]
        earlier = train_data[train_data['trade_date'] < cutoff]
        later = train_data[train_data['trade_date'] >= cutoff]
    else:
        earlier, later = train_data, val_data
    # Only numeric columns that were also requested are testable.
    candidates = [c for c in earlier.select_dtypes(include=['float64', 'int64']).columns
                  if c in feature_columns]
    # Flag drift via the KS p-value; index [1] of ks_2samp is the p-value.
    dropped_features = [f for f in candidates
                        if ks_2samp(earlier[f], later[f])[1] < ks_threshold]
    if log:
        print(f"检测到 {len(dropped_features)} 个可能漂移的特征: {dropped_features}")
    filtered_features = [f for f in feature_columns if f not in dropped_features]
    return filtered_features, dropped_features
def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01, upper_percentile: float = 0.99,
                                     log=True):
    """Filter a label series to its [lower, upper] sample-percentile range.

    Rows outside the inclusive percentile bounds are removed (not clipped).
    Raises ValueError when the percentile pair is not ordered within [0, 1].
    """
    if not (0 <= lower_percentile < upper_percentile <= 1):
        raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")
    # Inclusive bounds taken from the empirical quantiles.
    low = label.quantile(lower_percentile)
    high = label.quantile(upper_percentile)
    kept = label[label.between(low, high)]
    if log:
        print(f"Removed {len(label) - len(kept)} outliers.")
    return kept
def calculate_risk_adjusted_target(df, days=5):
    """Build a risk-adjusted forward-return target per stock.

    future_return is (close[t+days] - open[t+1]) / open[t+1]: buy at the next
    open, sell at the close `days` later. future_volatility is the rolling
    std (window=days, min_periods=1) of that return within each stock.

    NOTE(review): despite the variable name, the target MULTIPLIES return by
    volatility instead of dividing — a Sharpe ratio would divide. Confirm
    whether this weighting is intentional.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    # Forward-looking prices via negative shifts within each stock.
    df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
    df['future_open'] = df.groupby('ts_code')['open'].shift(-1)
    df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']
    df['future_volatility'] = df.groupby('ts_code')['future_return'].rolling(days, min_periods=1).std().reset_index(
        level=0, drop=True)
    sharpe_ratio = df['future_return'] * df['future_volatility']
    # Scrub infinities produced by zero future_open divisions.
    sharpe_ratio.replace([np.inf, -np.inf], np.nan, inplace=True)
    return sharpe_ratio
def calculate_score(df, days=5, lambda_param=1.0):
    """Per-stock score: forward return penalized by forward max drawdown.

    Parameters:
    - df: DataFrame with 'ts_code', 'trade_date', 'close' and a precomputed
      'future_return' column.
    - days: window length for the forward max-drawdown.
    - lambda_param: drawdown penalty coefficient.

    Returns:
    - The 'score' Series (also written onto df): future_return
      - lambda_param * max_drawdown over the next `days` closes.

    Fix: removed an unused `volatility` computation that was dead code in the
    previous version.
    """
    def _max_drawdown(prices):
        # Largest peak-to-trough decline within the price window.
        peak = prices.iloc[0]
        worst = 0
        for price in prices:
            if price > peak:
                peak = price
            else:
                worst = max(worst, (peak - price) / peak)
        return worst

    def _stock_score(stock_df):
        stock_df = stock_df.sort_values(by=['trade_date'])
        future_return = stock_df['future_return']
        # shift(-days) makes the rolling drawdown forward-looking.
        max_drawdown = stock_df['close'].rolling(days).apply(_max_drawdown, raw=False).shift(-days)
        return future_return - lambda_param * max_drawdown

    # Compute independently per stock; drop the group level to realign indices.
    df['score'] = df.groupby('ts_code').apply(_stock_score).reset_index(level=0, drop=True)
    return df['score']
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
    """From each highly-correlated pair, drop the later column.

    Scans the upper triangle of the absolute correlation matrix of the numeric
    features; any column correlated above `threshold` with an earlier one is
    dropped — except columns whose name contains 'act' or 'af', which are
    always retained.
    """
    numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_features:
        raise ValueError("No numeric features found in the provided data.")
    abs_corr = df[numeric_features].corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once.
    triu_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_tri = abs_corr.where(triu_mask)
    to_drop = set()
    for column in upper_tri.columns:
        if (upper_tri[column] > threshold).any():
            to_drop.add(column)
    # 'act'/'af' columns are protected regardless of correlation.
    return [c for c in feature_columns if c not in to_drop or 'act' in c or 'af' in c]
def cross_sectional_standardization(df, features):
    """Z-score the given features within each trade_date cross-section.

    Parameters:
    - df: DataFrame with a 'trade_date' column and the feature columns.
    - features: list of column names to standardize.

    Returns:
    - A sorted copy of df with each feature replaced by its per-date z-score.

    Fix: replaces the per-date Python loop with sklearn's StandardScaler by a
    single vectorized groupby-transform (no third-party dependency, one pass).
    Semantics match StandardScaler: population std (ddof=0), and zero-variance
    groups map to their demeaned values (all zeros, since scale falls back
    to 1). Unlike StandardScaler, NaNs propagate instead of raising.
    """
    df_standardized = df.sort_values(by='trade_date').copy()

    def _zscore(col):
        centered = col - col.mean()
        scale = col.std(ddof=0)  # population std, matching StandardScaler
        if scale == 0 or pd.isna(scale):
            # Constant (or empty) cross-section: demeaned values, i.e. zeros.
            return centered
        return centered / scale

    df_standardized[features] = df_standardized.groupby('trade_date')[features].transform(_zscore)
    return df_standardized
def neutralize_manual(df, features, industry_col, mkt_cap_col):
    """Neutralize factors by industry and size via per-industry simple OLS on log market cap.

    For each feature and each industry group, regress the factor on
    log(market cap) and keep the residual; singleton groups keep their raw
    value. Mutates and returns df.

    Fixes over the previous version:
    1) Residuals were collected in groupby (sorted-key) order and written back
       *positionally* via `df[col] = residuals`, mis-assigning values whenever
       industries interleave in row order. They are now index-aligned.
    2) The slope mixed np.cov (ddof=1) with np.var (ddof=0), scaling beta by
       n/(n-1); both now use ddof=1, giving the true OLS slope Sxy/Sxx.
    """
    for col in features:
        residual_parts = []
        for _, group in df.groupby(industry_col):
            if len(group) > 1:
                x = np.log(group[mkt_cap_col])  # log market cap regressor
                y = group[col]
                # OLS slope = sample cov / sample var (consistent ddof).
                beta = np.cov(y, x)[0, 1] / np.var(x, ddof=1)
                alpha = y.mean() - beta * x.mean()
                residual_parts.append(y - (alpha + beta * x))
            else:
                residual_parts.append(group[col])  # too few samples: keep raw value
        # Concatenated Series carries the original row index, so assignment
        # aligns by index regardless of group iteration order.
        df[col] = pd.concat(residual_parts)
    return df
def mad_filter(df, features, n=3):
    """Winsorize each feature to median +/- n * MAD (median absolute deviation).

    Mutates and returns df.
    """
    for feature in features:
        center = df[feature].median()
        mad = np.median(np.abs(df[feature] - center))
        # Clip extremes to the symmetric MAD band around the median.
        df[feature] = df[feature].clip(lower=center - n * mad, upper=center + n * mad)
    return df
def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
    """Winsorize each feature within its daily cross-section.

    Bounds are the per-trade_date quantiles; values outside are clipped.
    Mutates and returns df.
    """
    for feature in features:
        daily = df.groupby('trade_date')[feature]
        # Per-date quantile bounds, broadcast back to every row of the date.
        lo = daily.transform(lambda s: s.quantile(lower_percentile))
        hi = daily.transform(lambda s: s.quantile(upper_percentile))
        df[feature] = np.clip(df[feature], lo, hi)
    return df
from scipy.stats import iqr
def iqr_filter(df, features):
    """Robust-scale each feature within its daily cross-section: (x - median) / IQR.

    Cross-sections with zero IQR are left unscaled. Mutates and returns df.
    """
    def _robust_scale(s):
        spread = iqr(s)
        # Avoid dividing a degenerate (constant) cross-section by zero.
        return (s - s.median()) / spread if spread != 0 else s

    for feature in features:
        df[feature] = df.groupby('trade_date')[feature].transform(_robust_scale)
    return df
def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
df = df.copy()
for col in features:
# 计算 rolling 统计量,需要按日期进行 groupby
rolling_lower = df.groupby('trade_date')[col].transform(
lambda x: x.rolling(window=min(len(x), window)).quantile(lower_quantile))
rolling_upper = df.groupby('trade_date')[col].transform(
lambda x: x.rolling(window=min(len(x), window)).quantile(upper_quantile))
# 对数据进行裁剪
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
return df
def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
df = df.copy()
# 确保按股票和时间排序
df = df.sort_values(['ts_code', 'trade_date'])
grouped = df.groupby('ts_code')
for col in features:
# 对每个股票的时间序列计算滚动分位数
rolling_lower = grouped[col].rolling(window=window, min_periods=window // 2).quantile(lower_quantile)
rolling_upper = grouped[col].rolling(window=window, min_periods=window // 2).quantile(upper_quantile)
# rolling结果带有多重索引需要对齐
rolling_lower = rolling_lower.reset_index(level=0, drop=True)
rolling_upper = rolling_upper.reset_index(level=0, drop=True)
# 应用 clip
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
return df
def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
    """Winsorize each feature at daily cross-sectional quantile bounds.

    Works on a copy of df; returns the clipped frame.
    """
    result = df.copy()
    daily = result.groupby('trade_date')
    for feature in features:
        # Per-date bounds broadcast back onto every row of that date.
        lo = daily[feature].transform(lambda s: s.quantile(lower_quantile))
        hi = daily[feature].transform(lambda s: s.quantile(upper_quantile))
        result[feature] = np.clip(result[feature], lo, hi)
    return result

154
main/utils/utils.py Normal file
View File

@@ -0,0 +1,154 @@
import numpy as np
import pandas as pd
def read_and_merge_h5_data(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date'], prefix=None):
    """Read selected columns from an HDF5 store and optionally merge them into df.

    Columns requested with a leading underscore ('_x') are read from the store
    as 'x' and renamed back to '_x' after loading. When `prefix` is given, all
    non-key columns are additionally prefixed '{prefix}_'. If `df` is provided
    and non-empty, the data is merged into it on `on` with how=`join`;
    otherwise the freshly read data is returned as-is.

    NOTE(review): `on` uses a mutable default list — safe only while no caller
    mutates it; also the 'trade_date' branch converts both frames' trade_date
    with format='%Y%m%d', which assumes string/int yyyymmdd dates — confirm
    against callers.
    """
    processed_columns = []
    for col in columns:
        if col.startswith('_'):
            processed_columns.append(col[1:])  # strip the underscore to get the real store column name
        else:
            processed_columns.append(col)
    # Read only the required columns from the HDF5 file.
    data = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
    # Restore the underscore prefix on columns that were requested as '_col'.
    for col in data.columns:
        if col not in columns:  # only columns not requested verbatim get the underscore back
            new_col = f'_{col}'
            data.rename(columns={col: new_col}, inplace=True)
    if prefix is not None:
        for col in data.columns:
            if col not in ['ts_code', 'trade_date']:  # never prefix the merge keys
                new_col = f'{prefix}_{col}'
                data.rename(columns={col: new_col}, inplace=True)
    # Merge into the caller's frame when one was provided.
    if df is not None and not df.empty:
        print(f'{join} merge on {on}')
        if 'trade_date' in on:
            # Normalize both sides' trade_date to datetime before the keyed merge.
            df['trade_date'] = pd.to_datetime(df['trade_date'], format='%Y%m%d')
            data['trade_date'] = pd.to_datetime(data['trade_date'], format='%Y%m%d')
        # Merge on the requested keys (ts_code / trade_date by default).
        merged_df = pd.merge(df, data, on=on, how=join)
    else:
        # No input frame: return the freshly read data directly.
        merged_df = data
    return merged_df
def merge_with_industry_data(df, industry_df):
    """Attach the point-in-time industry code (l2_code) to each daily row.

    Uses a per-stock backward as-of join on trade_date vs in_date; rows whose
    trade_date precedes the stock's first in_date fall back to that stock's
    earliest industry record.
    """
    # merge_asof needs datetime keys on both sides.
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    industry_df['in_date'] = pd.to_datetime(industry_df['in_date'])
    # Both inputs must be sorted on their as-of key.
    industry_sorted = industry_df.sort_values(['in_date', 'ts_code'])
    daily_sorted = df.sort_values(['trade_date', 'ts_code'])
    # Latest industry record with in_date <= trade_date, per stock.
    merged = pd.merge_asof(
        daily_sorted,
        industry_sorted,
        by='ts_code',
        left_on='trade_date',
        right_on='in_date',
        direction='backward'
    )
    # Earliest industry record per stock, used as the pre-listing fallback.
    earliest = (industry_sorted
                .groupby('ts_code')
                .first()
                .reset_index()[['ts_code', 'l2_code']])
    fallback = merged['ts_code'].map(earliest.set_index('ts_code')['l2_code'])
    merged['l2_code'] = merged['l2_code'].fillna(fallback)
    return merged.reset_index(drop=True)
def calculate_risk_adjusted_return(df, days=1, method='ratio', lambda_=0.5, eps=1e-8):
    """
    Compute a per-stock risk-adjusted forward return.

    Parameters:
    - df: DataFrame with 'ts_code', 'trade_date' and 'close' columns.
    - days: horizon of the forward return in trading days (default 1).
    - method: 'ratio' (return / volatility) or 'difference' (return - lambda_ * volatility).
    - lambda_: risk-penalty coefficient, used only when method='difference'.
    - eps: small constant guarding the division against zero volatility.

    Returns:
    - df with a 'risk_adj_return' column added ('future_return',
      'historical_return' and 'volatility' are added as intermediates).

    NOTE(review): with the default days=1 the rolling window holds a single
    observation, so .std() (ddof=1) is NaN everywhere and 'risk_adj_return'
    comes out all-NaN — confirm callers pass days >= 2.
    NOTE(review): 'historical_return' is log(prev/curr), i.e. the *negated*
    one-day log return; the sign cancels inside the std, but the column name
    is misleading if the column is consumed elsewhere.
    """
    # Sort so shift/rolling operate on contiguous per-stock time series.
    df = df.sort_values(by=['ts_code', 'trade_date'])
    # Forward log return over `days` days.
    df['future_return'] = np.log(df.groupby('ts_code')['close'].shift(-days) / df['close'])
    # Trailing (negated) one-day log return, used only to estimate volatility.
    df['historical_return'] = np.log(df.groupby('ts_code')['close'].shift(1) / df['close'])
    # Rolling std of historical returns as the volatility proxy; drop the
    # group level so the result aligns with df's row index.
    df['volatility'] = df.groupby('ts_code')['historical_return'].rolling(window=days).std().reset_index(level=0,
                                                                                                         drop=True)
    # Combine return and risk according to the chosen method.
    if method == 'ratio':
        # Return per unit of risk (eps prevents division by zero).
        df['risk_adj_return'] = df['future_return'] / (df['volatility'] + eps)
    elif method == 'difference':
        # Return penalized linearly by risk.
        df['risk_adj_return'] = df['future_return'] - lambda_ * df['volatility']
    else:
        raise ValueError("Invalid method. Use 'ratio' or 'difference'.")
    return df
# import polars as pl
#
# def read_and_merge_h5_data_polars(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date']):
# processed_columns = []
# for col in columns:
# if col.startswith('_'):
# processed_columns.append(col[1:]) # 去掉下划线
# else:
# processed_columns.append(col)
#
# # 从 HDF5 文件读取数据,选择需要的列
# pd_df = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
#
# # 将 Pandas DataFrame 转换为 Polars DataFrame
# data = pl.from_pandas(pd_df)
#
# # 修改列名,如果列名以前有 _加上 _
# data = data.rename({col: f'_{col}' for col in data.columns if col not in columns})
#
# # 如果传入的 df 不为空,则进行合并
# if df is not None and not df.is_empty():
# print(f'{join} merge on {on}')
#
# # 确保两个 DataFrame 都有 ts_code 和 trade_date 列
# # df = df.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
# # data = data.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
#
# # 根据 ts_code 和 trade_date 合并
# merged_df = df.join(data, on=on, how=join)
# else:
# # 如果 df 为空,则直接返回读取的数据
# merged_df = data
#
# return merged_df