Files
NewStock/code/utils/factor.py
2025-04-28 11:01:47 +08:00

738 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import numpy as np
import talib
import pandas as pd
def get_technical_factor(df):
    """Append per-stock volume, turnover, volatility and momentum factors.

    The frame is sorted by ``ts_code`` and ``trade_date`` and every rolling
    statistic is evaluated inside one ``ts_code`` group, so no window spans
    two stocks. Results are written back by index label.

    Args:
        df: DataFrame containing ``ts_code``, ``trade_date``, ``pct_chg``,
            ``vol``, ``turnover_rate``, ``volume_ratio``, ``open``, ``high``,
            ``low`` and ``close``.

    Returns:
        A sorted copy of ``df`` with the factor columns appended.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])

    def by_stock(column):
        # Group lazily so that columns created further down are visible.
        return df.groupby('ts_code', group_keys=False)[column]

    def rolling_stat(column, window, stat):
        # transform() keeps the original row index, so assignment aligns.
        return by_stock(column).transform(
            lambda s: getattr(s.rolling(window=window), stat)()
        )

    def per_stock(func):
        # Multi-column per-stock computation; explicit concat avoids
        # DataFrameGroupBy.apply collapsing to a wide frame for one stock.
        pieces = [func(part) for _, part in df.groupby('ts_code', sort=False)]
        if not pieces:
            return pd.Series(index=df.index, dtype='float64')
        return pd.concat(pieces)

    def as_float(series):
        # TA-Lib only accepts float64 ("double") arrays.
        return series.to_numpy(dtype=float)

    def talib_single(column, func, period):
        return by_stock(column).transform(
            lambda s: pd.Series(func(as_float(s), timeperiod=period), index=s.index)
        )

    def atr(period):
        return per_stock(lambda g: pd.Series(
            talib.ATR(as_float(g['high']), as_float(g['low']), as_float(g['close']),
                      timeperiod=period),
            index=g.index,
        ))

    def close_transform(func):
        return by_stock('close').transform(func)

    df['return_skew'] = rolling_stat('pct_chg', 5, 'skew')
    df['return_kurtosis'] = rolling_stat('pct_chg', 5, 'kurt')

    # Factor 1: short-term volume change rate (2-day mean vs 5-day mean).
    df['volume_change_rate'] = rolling_stat('vol', 2, 'mean') / rolling_stat('vol', 5, 'mean') - 1

    # Factor 2: volume breakout signal.
    # NOTE(review): the 5-day max includes today's volume, so this comparison
    # can never be True; kept as-is to preserve the existing feature.
    max_volume = rolling_stat('vol', 5, 'max')
    df['cat_volume_breakout'] = df['vol'] > max_volume

    # Factor 3: turnover deviation from its 3-day mean, in standard deviations.
    mean_turnover = rolling_stat('turnover_rate', 3, 'mean')
    std_turnover = rolling_stat('turnover_rate', 3, 'std')
    df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover

    # Factor 4: turnover spike signal.
    # NOTE(review): within a 3-point window that includes today, a value is at
    # most ~1.15 sample standard deviations from the mean, so "> 2 std" never
    # fires; kept as-is to preserve the existing feature.
    df['cat_turnover_spike'] = df['turnover_rate'] > mean_turnover + 2 * std_turnover

    # Factor 5: 3-day mean of the volume ratio.
    df['avg_volume_ratio'] = rolling_stat('volume_ratio', 3, 'mean')

    # Factor 6: volume-ratio breakout (same caveat as factor 2).
    max_volume_ratio = rolling_stat('volume_ratio', 5, 'max')
    df['cat_volume_ratio_breakout'] = df['volume_ratio'] > max_volume_ratio

    # Factor 7: combined volume / turnover momentum.
    alpha = 0.5
    df['momentum_factor'] = df['volume_change_rate'] + alpha * df['turnover_deviation']

    # Factor 8: volume-price resonance.
    df['price_change_rate'] = by_stock('close').pct_change()
    df['resonance_factor'] = df['volume_ratio'] * df['price_change_rate']

    df['log_close'] = np.log(df['close'])
    df['vol_spike'] = rolling_stat('vol', 20, 'mean')
    df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']
    # Computed per stock: an ungrouped pct_change/rolling would mix the last
    # rows of one stock into the first rows of the next.
    df['vol_std_5'] = by_stock('vol').transform(lambda s: s.pct_change().rolling(5).std())

    # Upper and lower candle shadows relative to the close.
    df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']
    df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']

    # ATR
    df['atr_14'] = atr(14)
    df['atr_6'] = atr(6)

    # OBV and its 6-day moving average.
    df['obv'] = per_stock(lambda g: pd.Series(
        talib.OBV(as_float(g['close']), as_float(g['vol'])), index=g.index))
    df['maobv_6'] = talib_single('obv', talib.SMA, 6)
    df['obv-maobv_6'] = df['obv'] - df['maobv_6']

    # RSI
    df['rsi_3'] = talib_single('close', talib.RSI, 3)
    df['rsi_6'] = talib_single('close', talib.RSI, 6)
    df['rsi_9'] = talib_single('close', talib.RSI, 9)

    # Trailing returns.
    df['return_5'] = close_transform(lambda s: s / s.shift(5) - 1)
    df['return_10'] = close_transform(lambda s: s / s.shift(10) - 1)
    df['return_20'] = close_transform(lambda s: s / s.shift(20) - 1)

    # Rolling standard deviation of daily returns.
    df['std_return_5'] = close_transform(lambda s: s.pct_change().rolling(window=5).std())
    df['std_return_15'] = close_transform(lambda s: s.pct_change().rolling(window=15).std())
    df['std_return_25'] = close_transform(lambda s: s.pct_change().rolling(window=25).std())
    df['std_return_90'] = close_transform(lambda s: s.pct_change().rolling(window=90).std())
    # Same 90-day volatility, measured 10 trading days earlier.
    df['std_return_90_2'] = close_transform(
        lambda s: s.shift(10).pct_change().rolling(window=90).std())

    # Volatility ratios and difference.
    df['std_return_5 / std_return_90'] = df['std_return_5'] / df['std_return_90']
    df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']
    df['std_return_90 - std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']
    return df
def get_act_factor(df, cat=True):
    """Append EMA-slope ("act") factors and their cross-sectional ranks.

    For each stock, EMAs of ``close`` over 5/13/20/60 periods are computed,
    and each EMA's one-day relative change is mapped through ``arctan`` and
    scaled (57.3 is presumably the radians-to-degrees factor) by a
    per-horizon divisor.

    Args:
        df: DataFrame with ``ts_code``, ``trade_date`` and ``close``.
        cat: When true, also add the boolean ``cat_af1`` .. ``cat_af4``
            comparison columns.

    Returns:
        A sorted copy of ``df`` with the EMA, factor and rank columns added.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    per_stock = df.groupby('ts_code', group_keys=False)

    # (EMA column, EMA period, slope-factor column, slope divisor)
    ema_specs = (
        ('_ema_5', 5, 'act_factor1', 50),
        ('_ema_13', 13, 'act_factor2', 40),
        ('_ema_20', 20, 'act_factor3', 21),
        ('_ema_60', 60, 'act_factor4', 10),
    )

    # EMA columns first, then the slope factors, to keep the column order.
    for ema_col, period, _, _ in ema_specs:
        df[ema_col] = per_stock['close'].apply(
            lambda prices, period=period: pd.Series(
                talib.EMA(prices.values, timeperiod=period), index=prices.index)
        )
    for ema_col, _, factor_col, divisor in ema_specs:
        df[factor_col] = per_stock[ema_col].apply(
            lambda ema, divisor=divisor:
                np.arctan((ema / ema.shift(1) - 1) * 100) * 57.3 / divisor
        )

    if cat:
        df['cat_af1'] = df['act_factor1'] > 0
        # Each longer-horizon slope is compared with the next shorter one.
        for flag_col, slow_col, fast_col in (
            ('cat_af2', 'act_factor2', 'act_factor1'),
            ('cat_af3', 'act_factor3', 'act_factor2'),
            ('cat_af4', 'act_factor4', 'act_factor3'),
        ):
            df[flag_col] = df[slow_col] > df[fast_col]

    slope1 = df['act_factor1']
    slope2 = df['act_factor2']
    # Sum of all four slopes, and the normalised spread of the two fastest.
    df['act_factor5'] = slope1 + slope2 + df['act_factor3'] + df['act_factor4']
    df['act_factor6'] = (slope1 - slope2) / np.sqrt(slope1 ** 2 + slope2 ** 2)

    # Percentile rank within each trade_date (largest value ranks first).
    for rank_col, factor_col in (
        ('rank_act_factor1', 'act_factor1'),
        ('rank_act_factor2', 'act_factor2'),
        ('rank_act_factor3', 'act_factor3'),
    ):
        by_date = df.groupby('trade_date', group_keys=False)
        df[rank_col] = by_date[factor_col].rank(ascending=False, pct=True)
    return df
def get_money_flow_factor(df):
    """Add money-flow ratio columns to ``df`` in place and return it.

    Every ratio is divided by ``net_mf_vol``; rows where that is zero yield
    inf/NaN. Also adds the natural log of ``circ_mv``.
    """
    net_flow = df['net_mf_vol']

    # Buy volume of each order-size bucket relative to net flow.
    for target, source in (
        ('active_buy_volume_large', 'buy_lg_vol'),
        ('active_buy_volume_big', 'buy_elg_vol'),
        ('active_buy_volume_small', 'buy_sm_vol'),
    ):
        df[target] = df[source] / net_flow

    # Buy-minus-sell imbalance for large and extra-large orders.
    for target, buy_col, sell_col in (
        ('buy_lg_vol_minus_sell_lg_vol', 'buy_lg_vol', 'sell_lg_vol'),
        ('buy_elg_vol_minus_sell_elg_vol', 'buy_elg_vol', 'sell_elg_vol'),
    ):
        df[target] = (df[buy_col] - df[sell_col]) / net_flow

    df['log(circ_mv)'] = np.log(df['circ_mv'])
    return df
def get_alpha_factor(df):
    """Append the alpha-style factors (alpha 22 variant, 003, 007, 013).

    All rolling statistics are computed per ``ts_code``; ``alpha_007`` and
    ``alpha_013`` are then converted to percentile ranks within each
    ``trade_date``.

    Args:
        df: DataFrame with ``ts_code``, ``trade_date``, ``open``, ``high``,
            ``low``, ``close`` and ``vol``. A ``volume`` column, when
            present, is used for the alpha-22 covariance (the column the
            original code asked for); otherwise ``vol`` is used.

    Returns:
        A sorted copy of ``df`` with ``cov``, ``delta_cov``, ``_rank_stddev``,
        ``alpha_22_improved``, ``alpha_003``, ``alpha_007`` and ``alpha_013``.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    # The rest of this module names the traded-volume column 'vol'.
    volume_col = 'volume' if 'volume' in df.columns else 'vol'

    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5

    def by_stock(column):
        # Group lazily so that columns created further down are visible.
        return df.groupby('ts_code', group_keys=False)[column]

    def per_stock(func):
        # Two-column per-stock computation, stitched back by row index.
        pieces = [func(part) for _, part in df.groupby('ts_code', sort=False)]
        if not pieces:
            return pd.Series(index=df.index, dtype='float64')
        return pd.concat(pieces)

    def cross_section_rank(column):
        return df.groupby('trade_date', group_keys=False)[column].rank(
            ascending=True, pct=True)

    # Alpha 22 variant: change in the high/volume covariance, weighted by the
    # rank of close volatility.
    df['cov'] = per_stock(
        lambda g: g['high'].rolling(window_high_volume).cov(g[volume_col]))
    df['delta_cov'] = by_stock('cov').transform(lambda s: s.diff(period_delta))
    close_std = by_stock('close').transform(
        lambda s: s.rolling(window_close_stddev).std())
    # NOTE(review): this rank is taken over the whole frame, not per
    # trade_date, as in the original code - confirm that is intended.
    df['_rank_stddev'] = close_std.rank(pct=True)
    df['alpha_22_improved'] = -1 * df['delta_cov'] * df['_rank_stddev']

    # alpha_003: (close - open) / (high - low), 0 when the bar has no range.
    df['alpha_003'] = np.where(df['high'] != df['low'],
                               (df['close'] - df['open']) / (df['high'] - df['low']),
                               0)

    # alpha_007: 5-day close/vol correlation, ranked within each trade_date.
    df['alpha_007'] = per_stock(lambda g: g['close'].rolling(5).corr(g['vol']))
    df['alpha_007'] = cross_section_rank('alpha_007')

    # alpha_013: 5-day close sum minus 20-day close sum, ranked per trade_date.
    df['alpha_013'] = by_stock('close').transform(
        lambda s: s.rolling(5).sum() - s.rolling(20).sum())
    df['alpha_013'] = cross_section_rank('alpha_013')
    return df
def get_limit_factor(df):
    """Append limit-up / limit-down flags, 10-day counts and limit-up streaks.

    Args:
        df: DataFrame with ``ts_code``, ``trade_date``, ``close``,
            ``up_limit`` and ``down_limit``.

    Returns:
        A sorted copy of ``df`` with ``cat_up_limit``, ``cat_down_limit``,
        ``up_limit_count_10d``, ``down_limit_count_10d`` and
        ``consecutive_up_limit``. The consecutive limit-down streak is not
        emitted (it was disabled in the original code).
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])

    # 1. Did today's close hit the limit price? (1 = yes, 0 = no)
    df['cat_up_limit'] = (df['close'] == df['up_limit']).astype(int)
    df['cat_down_limit'] = (df['close'] == df['down_limit']).astype(int)

    # Grouped after the flag columns exist so they can be selected safely.
    grouped = df.groupby('ts_code', group_keys=False)

    def trailing_count(column):
        # Hits over the last 10 trading days (fewer at the start of history).
        return grouped[column].transform(
            lambda s: s.rolling(window=10, min_periods=1).sum())

    def streak_length(flags):
        # A new run starts whenever the flag changes; number the rows of each
        # run from 1 and zero out the runs where the flag is 0.
        run_id = (flags != flags.shift()).cumsum()
        return flags * (flags.groupby(run_id).cumcount() + 1)

    # 2. Limit hits in the trailing 10-day window.
    df['up_limit_count_10d'] = trailing_count('cat_up_limit')
    df['down_limit_count_10d'] = trailing_count('cat_down_limit')

    # 3. Current run of consecutive limit-up days. transform() keeps the row
    # index, so values stay aligned even if the input was unsorted.
    df['consecutive_up_limit'] = grouped['cat_up_limit'].transform(streak_length)
    return df
def get_cyp_perf_factor(df):
    """Append chip-distribution (cost percentile) factors.

    Args:
        df: DataFrame with ``ts_code``, ``trade_date``, ``close``, ``high``,
            ``low``, ``vol``, ``turnover_rate``, ``volume_ratio``,
            ``circ_mv``, ``winner_rate``, ``weight_avg``, ``his_high``,
            ``his_low`` and the ``cost_5pct`` / ``cost_15pct`` /
            ``cost_50pct`` / ``cost_85pct`` / ``cost_95pct`` columns.

    Returns:
        A sorted copy of ``df`` with the factor columns appended. Note that
        ``atr_14`` here is a simple 14-day mean of the true range and
        overwrites any existing ``atr_14`` column.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)

    def per_stock(func):
        # Explicit loop + concat instead of DataFrameGroupBy.apply: apply
        # returns a 1 x N frame when only one ts_code is present, which
        # cannot be assigned to a column.
        pieces = [func(part) for _, part in df.groupby('ts_code', sort=False)]
        if not pieces:
            return pd.Series(index=df.index, dtype='float64')
        return pd.concat(pieces)

    def rolling_corr(group):
        # 10-day correlation between price returns and average-cost returns.
        roc_close = group['close'].pct_change()
        roc_weight = group['weight_avg'].pct_change()
        return roc_close.rolling(10).corr(roc_weight)

    def calc_atr(group):
        high, low, close = group['high'], group['low'], group['close']
        prev_close = close.shift()
        # np.maximum propagates NaN, so the first row of each stock (which
        # has no previous close) has no true range.
        tr = np.maximum(high - low,
                        np.maximum(abs(high - prev_close), abs(low - prev_close)))
        return tr.rolling(14).mean()

    df['ctrl_strength'] = (df['cost_85pct'] - df['cost_15pct']) / (df['his_high'] - df['his_low'])
    df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / (df['cost_50pct'] - df['cost_5pct'])
    df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / (df['cost_50pct'] - df['cost_5pct'])
    df['lock_factor'] = df['turnover_rate'] * (
        1 - (df['cost_95pct'] - df['cost_5pct']) / (df['his_high'] - df['his_low']))
    df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)
    df['weight_roc5'] = grouped['weight_avg'].transform(lambda s: s.pct_change(5))
    df['price_cost_divergence'] = per_stock(rolling_corr)
    df['atr_14'] = per_stock(calc_atr)
    df['cost_atr_adj'] = (df['cost_95pct'] - df['cost_5pct']) / df['atr_14']

    # 12. Small-cap chip concentration.
    df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])

    # 16. Chip stability index: 20-day std of the average cost over its mean.
    df['weight_std20'] = grouped['weight_avg'].transform(lambda s: s.rolling(20).std())
    df['cost_stability'] = df['weight_std20'] / grouped['weight_avg'].transform(
        lambda s: s.rolling(20).mean())

    # 17. Days in the last 5 on which close was above the 95% cost level.
    df['high_cost_break_days'] = per_stock(
        lambda g: g['close'].gt(g['cost_95pct']).astype(float).rolling(5).sum())

    # 18. Golden chip resonance (compound event).
    df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &
                                  (df['volume_ratio'] > 1.5) &
                                  (df['winner_rate'] > 0.7))

    # 20. Chip / liquidity risk: cost band width over 10-day mean volume.
    df['liquidity_risk'] = (df['cost_95pct'] - df['cost_5pct']) * (
        1 / grouped['vol'].transform(lambda s: s.rolling(10).mean()))

    # weight_std20 is only an intermediate.
    df.drop(columns=['weight_std20'], inplace=True, errors='ignore')
    return df
def get_mv_factors(df):
    """Append market-cap-scaled liquidity factors.

    Args:
        df (pd.DataFrame): Frame with ``ts_code``, ``trade_date``,
            ``turnover_rate``, ``circ_mv``, ``volume_ratio`` and ``vol``.

    Returns:
        pd.DataFrame: A sorted copy of ``df`` with the factor columns added.
        No combined/weighted factor is produced; that step was disabled in
        the original code.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code', group_keys=False)
    circ_mv = df['circ_mv']

    # 1. Turnover relative to float market cap.
    df['mv_turnover_ratio'] = df['turnover_rate'] / circ_mv
    # 2. Volume relative to float market cap.
    df['mv_adjusted_volume'] = df['vol'] / circ_mv
    # 3. Turnover weighted by inverse market cap (same quantity as factor 1).
    df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / circ_mv)
    # 4. "Non-linear" volume factor (currently the same formula as factor 2).
    df['nonlinear_mv_volume'] = df['vol'] / circ_mv
    # 5. Volume ratio relative to market cap.
    df['mv_volume_ratio'] = df['volume_ratio'] / circ_mv
    # 6. Turnover x volume ratio relative to market cap.
    df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / circ_mv

    # 7. 20-day turnover volatility relative to market cap.
    df['turnover_std'] = grouped['turnover_rate'].transform(
        lambda s: s.rolling(window=20).std())
    # Plain row-wise division; no grouping (and no index reset) is needed.
    df['mv_volatility'] = df['turnover_std'] / circ_mv

    # 8. 20-day volume growth relative to market cap. GroupBy.pct_change
    # already returns the original row index, so it is assigned directly;
    # resetting that index would pair values with the wrong rows.
    df['volume_growth'] = grouped['vol'].pct_change(periods=20)
    df['mv_growth'] = df['volume_growth'] / circ_mv
    return df
import numpy as np
import talib
def get_rolling_factor(df):
    """Compute every factor that needs per-stock history (rolling windows).

    The frame is sorted by ``ts_code`` and ``trade_date``; all windowed
    statistics are evaluated inside one ``ts_code`` group and written back by
    index label, so they stay aligned whatever the input row order was.

    Args:
        df: DataFrame with ``ts_code``, ``trade_date``, ``open``, ``high``,
            ``low``, ``close``, ``pct_chg``, ``vol``, ``turnover_rate``,
            ``volume_ratio``, ``circ_mv``, ``up_limit``, ``down_limit``,
            ``weight_avg``, ``cost_5pct``, ``cost_15pct``, ``cost_85pct`` and
            ``cost_95pct``.

    Returns:
        Tuple ``(df, new_columns)``: the sorted copy with factors appended,
        and the list of column names that were not in the input.
    """
    old_columns = df.columns.tolist()
    df = df.sort_values(by=['ts_code', 'trade_date'])

    def by_stock(column):
        # Group lazily so that columns created further down are visible.
        return df.groupby('ts_code', group_keys=False)[column]

    def rolling_stat(column, window, stat):
        # transform() keeps the original row index, so assignment aligns.
        return by_stock(column).transform(
            lambda s: getattr(s.rolling(window=window), stat)()
        )

    def per_stock(func):
        # Multi-column per-stock computation; explicit concat avoids
        # DataFrameGroupBy.apply collapsing to a wide frame for one stock.
        pieces = [func(part) for _, part in df.groupby('ts_code', sort=False)]
        if not pieces:
            return pd.Series(index=df.index, dtype='float64')
        return pd.concat(pieces)

    def as_float(series):
        # TA-Lib only accepts float64 ("double") arrays.
        return series.to_numpy(dtype=float)

    def talib_single(column, func, period):
        return by_stock(column).transform(
            lambda s: pd.Series(func(as_float(s), timeperiod=period), index=s.index)
        )

    def atr(period):
        return per_stock(lambda g: pd.Series(
            talib.ATR(as_float(g['high']), as_float(g['low']), as_float(g['close']),
                      timeperiod=period),
            index=g.index,
        ))

    def close_transform(func):
        return by_stock('close').transform(func)

    def slope_factor(ema_column, divisor):
        return by_stock(ema_column).transform(
            lambda s: np.arctan((s / s.shift(1) - 1) * 100) * 57.3 / divisor)

    def date_rank(column, ascending):
        return df.groupby('trade_date', group_keys=False)[column].rank(
            ascending=ascending, pct=True)

    def streak_length(flags):
        # Number the rows of each run of equal flags from 1; zero where False.
        run_id = (flags != flags.shift()).cumsum()
        return flags * (flags.groupby(run_id).cumcount() + 1)

    # Next day's open relative to today's close (forward-looking). Shifted
    # per stock so the last row of a stock does not read the next stock.
    df['gap_next_open'] = (by_stock('open').shift(-1) - df['close']) / df['close']

    df['return_skew'] = rolling_stat('pct_chg', 5, 'skew')
    df['return_kurtosis'] = rolling_stat('pct_chg', 5, 'kurt')

    # Factor 1: short-term volume change rate (2-day mean vs 10-day mean).
    df['volume_change_rate'] = rolling_stat('vol', 2, 'mean') / rolling_stat('vol', 10, 'mean') - 1

    # Factor 2: volume breakout signal.
    # NOTE(review): the 5-day max includes today's volume, so this comparison
    # can never be True; kept as-is to preserve the existing feature.
    max_volume = rolling_stat('vol', 5, 'max')
    df['cat_volume_breakout'] = df['vol'] > max_volume

    # Factor 3: turnover deviation from its 3-day mean, in standard deviations.
    mean_turnover = rolling_stat('turnover_rate', 3, 'mean')
    std_turnover = rolling_stat('turnover_rate', 3, 'std')
    df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover

    # Factor 4: turnover spike signal.
    # NOTE(review): within a 3-point window that includes today, a value is at
    # most ~1.15 sample standard deviations from the mean, so this never fires.
    df['cat_turnover_spike'] = df['turnover_rate'] > mean_turnover + 2 * std_turnover

    # Factor 5: 3-day mean of the volume ratio.
    df['avg_volume_ratio'] = rolling_stat('volume_ratio', 3, 'mean')

    # Factor 6: volume-ratio breakout (same caveat as factor 2).
    max_volume_ratio = rolling_stat('volume_ratio', 5, 'max')
    df['cat_volume_ratio_breakout'] = df['volume_ratio'] > max_volume_ratio

    df['vol_spike'] = rolling_stat('vol', 20, 'mean')
    # Per stock, so the window never mixes two stocks' volumes.
    df['vol_std_5'] = by_stock('vol').transform(lambda s: s.pct_change().rolling(5).std())

    # ATR
    df['atr_14'] = atr(14)
    df['atr_6'] = atr(6)

    # OBV and its 6-day moving average.
    df['obv'] = per_stock(lambda g: pd.Series(
        talib.OBV(as_float(g['close']), as_float(g['vol'])), index=g.index))
    df['maobv_6'] = talib_single('obv', talib.SMA, 6)

    # RSI
    df['rsi_3'] = talib_single('close', talib.RSI, 3)
    df['rsi_6'] = talib_single('close', talib.RSI, 6)
    df['rsi_9'] = talib_single('close', talib.RSI, 9)

    # Trailing returns.
    df['return_5'] = close_transform(lambda s: s / s.shift(5) - 1)
    df['return_10'] = close_transform(lambda s: s / s.shift(10) - 1)
    df['return_20'] = close_transform(lambda s: s / s.shift(20) - 1)

    # Rolling standard deviation of daily returns.
    df['std_return_5'] = close_transform(lambda s: s.pct_change().rolling(window=5).std())
    df['std_return_15'] = close_transform(lambda s: s.pct_change().rolling(window=15).std())
    df['std_return_25'] = close_transform(lambda s: s.pct_change().rolling(window=25).std())
    df['std_return_90'] = close_transform(lambda s: s.pct_change().rolling(window=90).std())
    # Same 90-day volatility, measured 10 trading days earlier.
    df['std_return_90_2'] = close_transform(
        lambda s: s.shift(10).pct_change().rolling(window=90).std())

    # EMAs of the close.
    df['_ema_5'] = talib_single('close', talib.EMA, 5)
    df['_ema_13'] = talib_single('close', talib.EMA, 13)
    df['_ema_20'] = talib_single('close', talib.EMA, 20)
    df['_ema_60'] = talib_single('close', talib.EMA, 60)

    # EMA slope factors: arctan of the one-day relative change, scaled.
    df['act_factor1'] = slope_factor('_ema_5', 50)
    df['act_factor2'] = slope_factor('_ema_13', 40)
    df['act_factor3'] = slope_factor('_ema_20', 21)
    df['act_factor4'] = slope_factor('_ema_60', 10)

    # Percentile rank within each trade_date (largest value ranks first).
    df['rank_act_factor1'] = date_rank('act_factor1', ascending=False)
    df['rank_act_factor2'] = date_rank('act_factor2', ascending=False)
    df['rank_act_factor3'] = date_rank('act_factor3', ascending=False)

    df['log(circ_mv)'] = np.log(df['circ_mv'])

    # Alpha 22 variant: change in the high/vol covariance, weighted by the
    # rank of close volatility. Windows are evaluated per stock.
    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5
    df['cov'] = per_stock(lambda g: g['high'].rolling(window_high_volume).cov(g['vol']))
    df['delta_cov'] = by_stock('cov').transform(lambda s: s.diff(period_delta))
    close_std = close_transform(lambda s: s.rolling(window_close_stddev).std())
    # NOTE(review): this rank is over the whole frame, not per trade_date,
    # as in the original code - confirm that is intended.
    df['_rank_stddev'] = close_std.rank(pct=True)
    df['alpha_22_improved'] = -1 * df['delta_cov'] * df['_rank_stddev']

    # alpha_003: (close - open) / (high - low), 0 when the bar has no range.
    df['alpha_003'] = np.where(df['high'] != df['low'],
                               (df['close'] - df['open']) / (df['high'] - df['low']),
                               0)

    # alpha_007: 5-day close/vol correlation, ranked within each trade_date.
    df['alpha_007'] = per_stock(lambda g: g['close'].rolling(5).corr(g['vol']))
    df['alpha_007'] = date_rank('alpha_007', ascending=True)

    # alpha_013: 5-day close sum minus 20-day close sum, ranked per trade_date.
    df['alpha_013'] = close_transform(lambda s: s.rolling(5).sum() - s.rolling(20).sum())
    df['alpha_013'] = date_rank('alpha_013', ascending=True)

    # Limit-up / limit-down flags (boolean here) and their 10-day counts.
    df['cat_up_limit'] = (df['close'] == df['up_limit'])
    df['cat_down_limit'] = (df['close'] == df['down_limit'])
    df['up_limit_count_10d'] = by_stock('cat_up_limit').transform(
        lambda s: s.astype(float).rolling(window=10, min_periods=1).sum())
    df['down_limit_count_10d'] = by_stock('cat_down_limit').transform(
        lambda s: s.astype(float).rolling(window=10, min_periods=1).sum())

    # Current run of consecutive limit-up days.
    df['consecutive_up_limit'] = by_stock('cat_up_limit').transform(streak_length)

    # Chip-distribution factors.
    df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)
    df['weight_roc5'] = by_stock('weight_avg').transform(lambda s: s.pct_change(5))
    # 10-day correlation between price returns and average-cost returns.
    df['price_cost_divergence'] = per_stock(
        lambda g: g['close'].pct_change().rolling(10).corr(g['weight_avg'].pct_change()))
    df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])

    # 16. Chip stability index: 20-day std of the average cost over its mean.
    df['weight_std20'] = by_stock('weight_avg').transform(lambda s: s.rolling(20).std())
    df['cost_stability'] = df['weight_std20'] / by_stock('weight_avg').transform(
        lambda s: s.rolling(20).mean())

    # 17. Days in the last 5 on which close was above the 95% cost level.
    df['high_cost_break_days'] = per_stock(
        lambda g: g['close'].gt(g['cost_95pct']).astype(float).rolling(5).sum())

    # 20. Chip / liquidity risk: cost band width over 10-day mean volume.
    df['liquidity_risk'] = (df['cost_95pct'] - df['cost_5pct']) * (
        1 / by_stock('vol').transform(lambda s: s.rolling(10).mean()))

    # 7. 20-day turnover volatility relative to market cap.
    df['turnover_std'] = rolling_stat('turnover_rate', 20, 'std')
    df['mv_volatility'] = df['turnover_std'] / df['circ_mv']

    # 8. 20-day volume growth relative to market cap.
    df['volume_growth'] = by_stock('vol').pct_change(periods=20)
    df['mv_growth'] = df['volume_growth'] / df['circ_mv']

    # weight_std20 is only an intermediate.
    df.drop(columns=['weight_std20'], inplace=True, errors='ignore')
    new_columns = [col for col in df.columns.tolist() if col not in old_columns]
    return df, new_columns
def get_simple_factor(df):
    """Derive the row-wise factors that need no per-stock history.

    Expects the columns produced by ``get_rolling_factor`` (for example
    ``volume_change_rate``, ``obv``, ``act_factor1`` and ``atr_14``) plus the
    raw price, money-flow and chip-distribution columns read below. Columns
    whose name starts with ``_`` are dropped from the result.

    Returns:
        Tuple ``(df, new_columns)``: a sorted copy of the input with the
        factors appended, and the names of the columns that were added.
    """
    original_columns = df.columns.tolist()
    df = df.sort_values(by=['ts_code', 'trade_date'])

    # Volume / price combinations.
    turnover_weight = 0.5
    df['momentum_factor'] = df['volume_change_rate'] + turnover_weight * df['turnover_deviation']
    df['resonance_factor'] = df['volume_ratio'] * df['pct_chg']
    df['log_close'] = np.log(df['close'])
    df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']

    # Upper and lower candle shadows relative to the close.
    body = df[['close', 'open']]
    df['up'] = (df['high'] - body.max(axis=1)) / df['close']
    df['down'] = (body.min(axis=1) - df['low']) / df['close']
    df['obv-maobv_6'] = df['obv'] - df['maobv_6']

    # Volatility ratios and difference.
    df['std_return_5 / std_return_90'] = df['std_return_5'] / df['std_return_90']
    df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']
    df['std_return_90 - std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']

    # EMA-slope comparisons and combinations.
    df['cat_af1'] = df['act_factor1'] > 0
    df['cat_af2'] = df['act_factor2'] > df['act_factor1']
    df['cat_af3'] = df['act_factor3'] > df['act_factor2']
    df['cat_af4'] = df['act_factor4'] > df['act_factor3']
    df['act_factor5'] = df['act_factor1'] + df['act_factor2'] + df['act_factor3'] + df['act_factor4']
    df['act_factor6'] = (df['act_factor1'] - df['act_factor2']) / np.sqrt(
        df['act_factor1'] ** 2 + df['act_factor2'] ** 2)

    # Money-flow ratios, all relative to net flow volume.
    net_flow = df['net_mf_vol']
    df['active_buy_volume_large'] = df['buy_lg_vol'] / net_flow
    df['active_buy_volume_big'] = df['buy_elg_vol'] / net_flow
    df['active_buy_volume_small'] = df['buy_sm_vol'] / net_flow
    df['buy_lg_vol_minus_sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / net_flow
    df['buy_elg_vol_minus_sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / net_flow
    df['log(circ_mv)'] = np.log(df['circ_mv'])

    # Chip-distribution factors.
    hist_range = df['his_high'] - df['his_low']
    df['ctrl_strength'] = (df['cost_85pct'] - df['cost_15pct']) / hist_range
    df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / (df['cost_50pct'] - df['cost_5pct'])
    df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / (df['cost_50pct'] - df['cost_5pct'])
    df['lock_factor'] = df['turnover_rate'] * (
        1 - (df['cost_95pct'] - df['cost_5pct']) / hist_range)
    df['cat_vol_break'] = (df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2)
    df['cost_atr_adj'] = (df['cost_95pct'] - df['cost_5pct']) / df['atr_14']
    # 12. Small-cap chip concentration.
    df['smallcap_concentration'] = (1 / df['circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])
    df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &
                                  (df['volume_ratio'] > 1.5) &
                                  (df['winner_rate'] > 0.7))

    # Market-cap-scaled liquidity factors.
    df['mv_turnover_ratio'] = df['turnover_rate'] / df['circ_mv']
    df['mv_adjusted_volume'] = df['vol'] / df['circ_mv']
    df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / df['circ_mv'])
    df['nonlinear_mv_volume'] = df['vol'] / df['circ_mv']
    df['mv_volume_ratio'] = df['volume_ratio'] / df['circ_mv']
    df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / df['circ_mv']

    # Underscore-prefixed columns are intermediates and are not returned.
    helper_columns = [name for name in df.columns if name.startswith('_')]
    df = df.drop(columns=helper_columns, errors='ignore')

    added = [name for name in df.columns.tolist() if name not in original_columns]
    return df, added
def calculate_indicators(df):
    """Add return, RSI, MACD and sentiment indicators for one time series.

    The frame is sorted by ``trade_date`` and treated as a single series
    (no grouping). It must provide ``close``, ``pre_close``, ``vol`` and
    ``amount``.

    Returns:
        The sorted copy with ``daily_return``, ``RSI``, ``MACD``,
        ``Signal_line``, ``MACD_hist``, ``up_ratio``, ``up_ratio_20d``,
        ``volume_mean``, ``volume_change_rate``, ``volatility``,
        ``amount_mean`` and ``amount_change_rate`` added.
    """
    df = df.sort_values('trade_date')
    closes = df['close']

    # Daily percentage change versus the previous close.
    df['daily_return'] = (closes - df['pre_close']) / df['pre_close'] * 100

    # RSI from 14-day simple averages of gains and losses.
    price_move = closes.diff()
    gains = price_move.where(price_move > 0, 0)
    losses = -price_move.where(price_move < 0, 0)
    relative_strength = gains.rolling(window=14).mean() / losses.rolling(window=14).mean()
    df['RSI'] = 100 - (100 / (1 + relative_strength))

    # MACD: 12/26 EMA spread, 9-period signal line and histogram.
    fast_ema = closes.ewm(span=12, adjust=False).mean()
    slow_ema = closes.ewm(span=26, adjust=False).mean()
    df['MACD'] = fast_ema - slow_ema
    df['Signal_line'] = df['MACD'].ewm(span=9, adjust=False).mean()
    df['MACD_hist'] = df['MACD'] - df['Signal_line']

    # Sentiment 1: share of up days over the last 20 sessions.
    df['up_ratio'] = (df['daily_return'] > 0).astype('int64')
    df['up_ratio_20d'] = df['up_ratio'].rolling(window=20).mean()

    # Sentiment 2: volume versus its 20-day mean, in percent.
    df['volume_mean'] = df['vol'].rolling(window=20).mean()
    df['volume_change_rate'] = (df['vol'] - df['volume_mean']) / df['volume_mean'] * 100

    # Sentiment 3: 20-day standard deviation of daily returns.
    df['volatility'] = df['daily_return'].rolling(window=20).std()

    # Sentiment 4: turnover amount versus its 20-day mean, in percent.
    df['amount_mean'] = df['amount'].rolling(window=20).mean()
    df['amount_change_rate'] = (df['amount'] - df['amount_mean']) / df['amount_mean'] * 100
    return df
def generate_index_indicators(h5_filename):
    """Build a wide, one-row-per-date table of indicators for every index.

    Reads the ``index_data`` key of the HDF5 file, runs
    ``calculate_indicators`` on each ``ts_code`` separately and pivots the
    result so each column is named ``<ts_code>_<indicator>``.

    Args:
        h5_filename: Path of the HDF5 file; ``trade_date`` is parsed with
            the ``%Y%m%d`` format.

    Returns:
        DataFrame with a ``trade_date`` column plus one column per
        (index, indicator) pair.
    """
    raw = pd.read_hdf(h5_filename, key='index_data')
    raw['trade_date'] = pd.to_datetime(raw['trade_date'], format='%Y%m%d')
    raw = raw.sort_values('trade_date')

    # Indicators are computed on each index's own history.
    per_index = [
        calculate_indicators(raw[raw['ts_code'] == code].copy())
        for code in raw['ts_code'].unique()
    ]
    stacked = pd.concat(per_index, ignore_index=True)

    indicator_names = ['daily_return', 'RSI', 'MACD', 'Signal_line',
                       'MACD_hist', 'up_ratio_20d', 'volume_change_rate', 'volatility',
                       'amount_change_rate', 'amount_mean']
    # One row per date; the last value wins if a (date, index) pair repeats.
    wide = stacked.pivot_table(
        index='trade_date',
        columns='ts_code',
        values=indicator_names,
        aggfunc='last',
    )
    # Flatten the (indicator, ts_code) column pairs into "<ts_code>_<indicator>".
    wide.columns = [f"{code}_{indicator}" for indicator, code in wide.columns]
    return wide.reset_index()