Files
NewStock/main/factor/factor.py
2025-04-28 11:02:52 +08:00

1533 lines
72 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import numpy as np
import pandas as pd
import talib
def get_rolling_factor(df):
    """Compute per-stock rolling/time-series factors (money flow, chip
    distribution, up/downside volatility, volume, technical indicators).

    Expects columns: ts_code, trade_date, open/high/low/close, vol, pct_chg,
    turnover_rate, volume_ratio, circ_mv, up_limit/down_limit, money-flow
    columns (buy/sell sm/lg/elg vol) and chip-distribution columns
    (cost_5pct..cost_95pct, weight_avg, winner_rate).

    Returns:
        (df, new_columns): the DataFrame with factor columns appended and
        the list of column names added by this function.
    """
    old_columns = df.columns.tolist()[:]
    # Sort by stock and date (if not already sorted)
    df = df.sort_values(by=['ts_code', 'trade_date'])
    # NOTE: `grouped` keeps a reference to `df`; column selection is lazy, so
    # columns added to `df` below are still reachable through `grouped`.
    grouped = df.groupby('ts_code', group_keys=False)
    epsilon = 1e-8
    # 1. Net large + extra-large order buying volume (institutional-flow proxy)
    df['lg_elg_net_buy_vol'] = df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']
    # Intensity: net institutional flow normalized by total volume
    # (epsilon guards against a zero 'vol')
    df['flow_lg_elg_intensity'] = df['lg_elg_net_buy_vol'] / (df['vol'] + epsilon)
    # 2. Retail vs Institutional Divergence:
    #    gap between small-order net inflow and (large + extra-large) net inflow
    df['sm_net_buy_vol'] = df['buy_sm_vol'] - df['sell_sm_vol']
    df['flow_divergence_diff'] = df['sm_net_buy_vol'] - df['lg_elg_net_buy_vol']
    # The ratio form may be more stable
    df['flow_divergence_ratio'] = df['sm_net_buy_vol'] / (
        df['lg_elg_net_buy_vol'] + np.sign(df['lg_elg_net_buy_vol']) * epsilon + epsilon)  # guards against 0/0
    # 3. Flow Structure Change (relative strength of large flow):
    #    change in the share of (large + extra-large) buys in total buy volume
    df['total_buy_vol'] = df['buy_sm_vol'] + df['buy_lg_vol'] + df['buy_elg_vol']
    df['lg_elg_buy_prop'] = (df['buy_lg_vol'] + df['buy_elg_vol']) / (df['total_buy_vol'] + epsilon)
    df['flow_struct_buy_change'] = grouped['lg_elg_buy_prop'].diff(1)  # 1-day change
    # 4. Flow Acceleration: second difference of net institutional flow
    df['lg_elg_net_buy_vol_change'] = grouped['lg_elg_net_buy_vol'].diff(1)
    df['flow_lg_elg_accel'] = grouped['lg_elg_net_buy_vol_change'].diff(1)
    # # 5. Extreme Flow Event (categorical): flag whether the flow intensity is
    # # at a historical extreme (above the 90th / below the 10th percentile of the past N days)
    # rolling_window = 20  # adjustable window
    # # Step 1: Calculate the rolling quantiles separately
    # rolling_high = grouped['flow_lg_elg_intensity'].rolling(rolling_window, min_periods=1).quantile(0.9)  # min_periods=1 keeps output before the window fills
    # rolling_low = grouped['flow_lg_elg_intensity'].rolling(rolling_window, min_periods=1).quantile(0.1)
    # # Step 2: Assign the results to the DataFrame
    # # Make sure df and rolling_high/low share the same index;
    # # this is normally safe if df's index has not been modified meanwhile
    # df['flow_lg_elg_intensity_rolling_high'] = rolling_high
    # df['flow_lg_elg_intensity_rolling_low'] = rolling_low
    # # Step 3: Continue with the logic using the new columns
    # conditions_flow = [
    #     df['flow_lg_elg_intensity'] > df['flow_lg_elg_intensity_rolling_high'],
    #     df['flow_lg_elg_intensity'] < df['flow_lg_elg_intensity_rolling_low']
    # ]
    # choices_flow = [1, -1]  # 1: extreme inflow, -1: extreme outflow
    # df['cat_extreme_flow'] = np.select(conditions_flow, choices_flow, default=0)
    # --- Chip-distribution factors ---
    # 6. Chip Concentration: tightness of the cost distribution, i.e. the gap
    #    between the 95% and 5% cost levels, normalized by the current price
    df['chip_concentration_range'] = (df['cost_95pct'] - df['cost_5pct']) / (df['close'] + epsilon)
    # 7. Chip Distribution Skewness proxy: compare the median cost (cost_50pct)
    #    with the weighted average cost (weight_avg);
    #    weight_avg > cost_50pct hints at more chips in the high-cost zone (right skew)
    df['chip_skewness'] = (df['weight_avg'] - df['cost_50pct']) / (df['cost_50pct'] + epsilon)
    # 8. Floating Chips proxy: relate the winner rate to how far the price sits
    #    above the short-term (15%) cost line. A high winner_rate with
    #    cost_15pct close to the price suggests most profitable chips have low
    #    cost and float easily.  Simplified here as:
    #    winner rate * (close - cost_15pct) / close
    price_dist_cost15 = (df['close'] - df['cost_15pct']) / (df['close'] + epsilon)
    df['floating_chip_proxy'] = df['winner_rate'] * np.maximum(0, price_dist_cost15)  # only when price is above the 15% cost line
    # 9. Cost Support Strength Change: rate of change of the low cost
    #    percentile (is the support level moving up or down?)
    df['cost_support_15pct_change'] = grouped['cost_15pct'].pct_change(1) * 100  # percent change
    # 10. Winner Rate Zone & Price Position (categorical): combine the winner
    #     rate with the price position relative to the cost levels, e.g.
    #     price above the 85% cost line & winner rate > 0.8 -> distribution-risk zone
    #     price below the 15% cost line & winner rate < 0.2 -> accumulation-potential zone
    conditions_winner = [
        (df['close'] > df['cost_85pct']) & (df['winner_rate'] > 0.8),  # high position & high winner rate
        (df['close'] < df['cost_15pct']) & (df['winner_rate'] < 0.2),  # low position & low winner rate
        (df['close'] > df['cost_50pct']) & (df['winner_rate'] > 0.5),  # upper-middle & mostly in profit
        (df['close'] < df['cost_50pct']) & (df['winner_rate'] < 0.5),  # lower-middle & mostly at a loss
    ]
    choices_winner = [1, 2, 3, 4]  # 1: high-risk, 2: low-potential, 3: mid-high profit, 4: mid-low loss
    df['cat_winner_price_zone'] = np.select(conditions_winner, choices_winner, default=0)  # 0: other
    # --- Combined factors ---
    # 11. Flow-Chip Consistency: institutional net buying while the price is
    #     inside the lower chip-dense band (between cost_15pct and cost_50pct)
    price_near_low_support = (df['close'] > df['cost_15pct']) & (df['close'] < df['cost_50pct'])
    df['flow_chip_consistency'] = df['lg_elg_net_buy_vol'] * price_near_low_support.astype(int)
    # Could be further standardized or turned into a categorical
    # 12. Profit-Taking Pressure vs Absorption: with a high winner rate
    #     (> 0.7), is institutional money leaving (taking profit) or entering
    #     (absorbing the high-level turnover)?
    high_winner_rate_flag = (df['winner_rate'] > 0.7).astype(int)
    df['profit_taking_vs_absorb'] = df['lg_elg_net_buy_vol'] * high_winner_rate_flag
    # Positive: institutions still buying under a high winner rate (absorption);
    # negative: institutions selling (profit taking)
    # Clean up temp columns and possible NaNs (optional, handle as needed)
    # NOTE(review): the drop below is commented out, so these intermediate
    # columns remain in the output and are reported as new factor columns.
    cols_to_drop = ['lg_elg_net_buy_vol', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop',
                    'lg_elg_net_buy_vol_change', 'flow_lg_elg_intensity_rolling_high',
                    'flow_lg_elg_intensity_rolling_low']
    # df = df.drop(columns=cols_to_drop)
    # --- Upside / downside volatility over a rolling window ---
    window = 20
    df['_is_positive'] = (df['pct_chg'] > 0).astype(int)
    df['_is_negative'] = (df['pct_chg'] < 0).astype(int)
    # NOTE(review): duplicates '_is_positive'; kept as the exported flag.
    df['cat_is_positive'] = (df['pct_chg'] > 0).astype(int)
    # Separate positive and negative returns (for their own mean / squared
    # mean); the raw magnitudes are kept rather than clipped
    df['_pos_returns'] = df['pct_chg'].where(df['pct_chg'] > 0, 0)  # non-positive -> 0 so sums work
    df['_neg_returns'] = df['pct_chg'].where(df['pct_chg'] < 0, 0)  # non-negative -> 0 so sums work
    # Squared returns (for E[X^2])
    df['_pos_returns_sq'] = np.square(df['_pos_returns'])
    df['_neg_returns_sq'] = np.square(df['_neg_returns'])  # squaring makes negatives positive
    # 4. Rolling statistics (built-in rolling aggregations are fast)
    # Positive-return days
    rolling_pos_count = grouped['_is_positive'].rolling(window, min_periods=max(1, window // 2)).sum()
    rolling_pos_sum = grouped['_pos_returns'].rolling(window, min_periods=max(1, window // 2)).sum()
    rolling_pos_sum_sq = grouped['_pos_returns_sq'].rolling(window, min_periods=max(1, window // 2)).sum()
    # Negative-return days
    rolling_neg_count = grouped['_is_negative'].rolling(window, min_periods=max(1, window // 2)).sum()
    rolling_neg_sum = grouped['_neg_returns'].rolling(window, min_periods=max(1, window // 2)).sum()
    rolling_neg_sum_sq = grouped['_neg_returns_sq'].rolling(window, min_periods=max(1, window // 2)).sum()
    # 5. Variance and std via Var[X] = E[X^2] - E[X]^2
    pos_mean_sq = rolling_pos_sum_sq / rolling_pos_count
    pos_mean = rolling_pos_sum / rolling_pos_count
    pos_var = pos_mean_sq - np.square(pos_mean)
    # Require at least 2 observations; clip negative fp noise before sqrt
    pos_var = pos_var.where(rolling_pos_count >= 2, np.nan).clip(lower=0)
    upside_vol = np.sqrt(pos_var)
    neg_mean_sq = rolling_neg_sum_sq / rolling_neg_count
    neg_mean = rolling_neg_sum / rolling_neg_count  # note: neg_mean is negative
    neg_var = neg_mean_sq - np.square(neg_mean)
    neg_var = neg_var.where(rolling_neg_count >= 2, np.nan).clip(lower=0)
    downside_vol = np.sqrt(neg_var)
    # Rolling on a groupby yields a MultiIndex; drop the ts_code level to merge back
    df['upside_vol'] = upside_vol.reset_index(level=0, drop=True)
    df['downside_vol'] = downside_vol.reset_index(level=0, drop=True)
    df['vol_ratio'] = df['upside_vol'] / df['downside_vol']
    df['vol_ratio'] = df['vol_ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)  # or fillna(np.nan)
    df['return_skew'] = grouped['pct_chg'].rolling(window=5).skew().reset_index(0, drop=True)
    df['return_kurtosis'] = grouped['pct_chg'].rolling(window=5).kurt().reset_index(0, drop=True)
    # Factor 1: short-term volume change rate (2-day vs 10-day mean volume)
    df['volume_change_rate'] = (
        grouped['vol'].rolling(window=2).mean() /
        grouped['vol'].rolling(window=10).mean() - 1
    ).reset_index(level=0, drop=True)  # align index back to df
    # Factor 2: volume breakout signal
    # NOTE(review): the 5-day rolling max includes the current day, so
    # vol > max_volume can never be True once the window is full — confirm intent.
    max_volume = grouped['vol'].rolling(window=5).max().reset_index(level=0, drop=True)  # align index
    df['cat_volume_breakout'] = (df['vol'] > max_volume)
    # Factor 3: turnover deviation from its 3-day moving average (z-score)
    mean_turnover = grouped['turnover_rate'].rolling(window=3).mean().reset_index(level=0, drop=True)
    std_turnover = grouped['turnover_rate'].rolling(window=3).std().reset_index(level=0, drop=True)
    df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover
    # Factor 4: turnover spike signal (more than 2 std above the mean)
    df['cat_turnover_spike'] = (df['turnover_rate'] > mean_turnover + 2 * std_turnover)
    # Factor 5: mean volume ratio
    df['avg_volume_ratio'] = grouped['volume_ratio'].rolling(window=3).mean().reset_index(level=0, drop=True)
    # Factor 6: volume-ratio breakout signal
    # NOTE(review): same self-inclusive-window caveat as cat_volume_breakout.
    max_volume_ratio = grouped['volume_ratio'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_ratio_breakout'] = (df['volume_ratio'] > max_volume_ratio)
    # NOTE(review): despite the name, this is the 20-day mean volume baseline;
    # get_simple_factor later derives cat_vol_spike from it.
    df['vol_spike'] = grouped.apply(
        lambda x: pd.Series(x['vol'].rolling(20).mean(), index=x.index)
    )
    # NOTE(review): pct_change is grouped but the subsequent rolling std runs
    # over the flat series, so windows can span adjacent stocks — confirm.
    df['vol_std_5'] = grouped['vol'].pct_change().rolling(window=5).std()
    # ATR (Average True Range) per stock via talib
    df['atr_14'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=14),
                            index=x.index)
    )
    df['atr_6'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=6),
                            index=x.index)
    )
    # OBV (On-Balance Volume) and its moving average
    df['obv'] = grouped.apply(
        lambda x: pd.Series(talib.OBV(x['close'].values, x['vol'].values), index=x.index)
    )
    print(df.columns)  # NOTE(review): debug leftover
    df['maobv_6'] = grouped.apply(
        lambda x: pd.Series(talib.SMA(x['obv'].values, timeperiod=6), index=x.index)
    )
    df['rsi_3'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=3), index=x.index)
    )
    # df['rsi_6'] = grouped.apply(
    #     lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=6), index=x.index)
    # )
    # df['rsi_9'] = grouped.apply(
    #     lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=9), index=x.index)
    # )
    # N-day simple returns
    df['return_5'] = grouped['close'].apply(lambda x: x / x.shift(5) - 1)
    # df['return_10'] = grouped['close'].apply(lambda x: x / x.shift(10) - 1)
    df['return_20'] = grouped['close'].apply(lambda x: x / x.shift(20) - 1)
    # df['avg_close_5'] = grouped['close'].apply(lambda x: x.rolling(window=5).mean() / x)
    # Return standard deviation over several horizons
    df['std_return_5'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=5).std())
    # df['std_return_15'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=15).std())
    # df['std_return_25'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=25).std())
    df['std_return_90'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=90).std())
    # Same 90-day std computed on a 10-day-lagged price series
    df['std_return_90_2'] = grouped['close'].apply(lambda x: x.shift(10).pct_change().rolling(window=90).std())
    # EMA indicators (helper columns, dropped by get_simple_factor)
    df['_ema_5'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=5), index=x.index)
    )
    df['_ema_13'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=13), index=x.index)
    )
    df['_ema_20'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=20), index=x.index)
    )
    df['_ema_60'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=60), index=x.index)
    )
    # act_factor1..4: EMA slope angle in degrees (arctan * 57.3), each scaled
    # by a horizon-specific divisor
    df['act_factor1'] = grouped['_ema_5'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 50
    )
    df['act_factor2'] = grouped['_ema_13'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 40
    )
    df['act_factor3'] = grouped['_ema_20'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 21
    )
    df['act_factor4'] = grouped['_ema_60'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 10
    )
    # Cross-sectional percentile ranks per trade_date
    df['rank_act_factor1'] = df.groupby('trade_date', group_keys=False)['act_factor1'].rank(ascending=False, pct=True)
    df['rank_act_factor2'] = df.groupby('trade_date', group_keys=False)['act_factor2'].rank(ascending=False, pct=True)
    df['rank_act_factor3'] = df.groupby('trade_date', group_keys=False)['act_factor3'].rank(ascending=False, pct=True)
    df['log_circ_mv'] = np.log(df['circ_mv'])
    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5
    # Rolling covariance of high price and volume, per stock
    def calculate_rolling_cov(group):
        return group['high'].rolling(window_high_volume).cov(group['vol'])
    df['cov'] = grouped.apply(calculate_rolling_cov)
    # Difference of that covariance, per stock
    def calculate_delta_cov(group):
        return group['cov'].diff(period_delta)
    df['delta_cov'] = grouped.apply(calculate_delta_cov)
    # Rolling close std, per stock
    def calculate_stddev_close(group):
        return group['close'].rolling(window_close_stddev).std()
    df['_stddev_close'] = grouped.apply(calculate_stddev_close)
    df['_rank_stddev'] = df.groupby('trade_date')['_stddev_close'].rank(pct=True)
    # Alpha#22-style factor: -delta(cov) * cross-sectional rank of price std
    df['alpha_22_improved'] = -1 * df['delta_cov'] * df['_rank_stddev']
    # Position of the close within the day's range (0 on doji days)
    df['alpha_003'] = np.where(df['high'] != df['low'],
                               (df['close'] - df['open']) / (df['high'] - df['low']),
                               0)
    df['alpha_007'] = grouped.apply(lambda x: x['close'].rolling(5).corr(x['vol']))
    df['alpha_007'] = df.groupby('trade_date', group_keys=False)['alpha_007'].rank(ascending=True, pct=True)
    df['alpha_013'] = grouped['close'].transform(lambda x: x.rolling(5).sum() - x.rolling(20).sum())
    df['alpha_013'] = df.groupby('trade_date', group_keys=False)['alpha_013'].rank(ascending=True, pct=True)
    # NOTE(review): exact float equality against the limit price — confirm the
    # data source guarantees matching precision (rounding may be needed).
    df['cat_up_limit'] = (df['close'] == df['up_limit'])  # at the up limit (1) or not (0)
    df['cat_down_limit'] = (df['close'] == df['down_limit'])  # at the down limit (1) or not (0)
    # Number of limit days within the last 10 days
    df['up_limit_count_10d'] = grouped['cat_up_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0, drop=True)
    df['down_limit_count_10d'] = grouped['cat_down_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0, drop=True)
    # 3. Current run length of consecutive limit days
    def calculate_consecutive_limits(series):
        """Return run lengths of consecutive up/down limit days.

        NOTE(review): both returned values are computed from the same input
        with identical expressions, so they are duplicates; only [0] is used.
        """
        consecutive_up = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)
        consecutive_down = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)
        return consecutive_up, consecutive_down
    # Consecutive up-limit days
    df['consecutive_up_limit'] = grouped['cat_up_limit'].apply(
        lambda x: calculate_consecutive_limits(x)[0]
    )
    # Breakout above the 85% cost line confirmed by a high volume ratio
    df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)
    # 5-day rate of change of the weighted average cost
    df['weight_roc5'] = grouped['weight_avg'].apply(lambda x: x.pct_change(5))
    # 10-day correlation between price returns and average-cost returns
    def rolling_corr(group):
        roc_close = group['close'].pct_change()
        roc_weight = group['weight_avg'].pct_change()
        return roc_close.rolling(10).corr(roc_weight)
    df['price_cost_divergence'] = grouped.apply(rolling_corr)
    # Chip band width scaled by inverse log market cap
    df['smallcap_concentration'] = (1 / df['log_circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])
    # 16. Chip stability index (20-day volatility of the average cost)
    df['weight_std20'] = grouped['weight_avg'].apply(lambda x: x.rolling(20).std())
    df['cost_stability'] = df['weight_std20'] / grouped['weight_avg'].transform(lambda x: x.rolling(20).mean())
    # 17. Days (out of the last 5) the close stayed above the 95% cost line
    df['high_cost_break_days'] = grouped.apply(lambda g: g['close'].gt(g['cost_95pct']).rolling(5).sum())
    # 20. Chip-liquidity risk: cost range over 10-day average volume
    df['liquidity_risk'] = (df['cost_95pct'] - df['cost_5pct']) * (
        1 / grouped['vol'].transform(lambda x: x.rolling(10).mean()))
    # 7. Market-cap volatility factor (uses grouped)
    df['turnover_std'] = grouped['turnover_rate'].transform(lambda x: x.rolling(window=20).std())
    df['mv_volatility'] = grouped.apply(lambda x: x['turnover_std'] / x['log_circ_mv'])
    # 8. Market-cap growth factor
    df['volume_growth'] = grouped['vol'].pct_change(periods=20)
    df['mv_growth'] = df['volume_growth'] / df['log_circ_mv']
    # AR indicator (3-day buying-power ratio)
    df["ar"] = grouped.apply(
        lambda x: (x["high"].div(x["open"]).rolling(3).sum()) / (x["open"].div(x["low"]).rolling(3).sum()) * 100)
    # BR indicator (3-day strength relative to the previous close)
    df["pre_close"] = grouped["close"].shift(1)
    df["br_up"] = (df["high"] - df["pre_close"]).clip(lower=0)
    df["br_down"] = (df["pre_close"] - df["low"]).clip(lower=0)
    df["br"] = grouped.apply(lambda x: (x["br_up"].rolling(3).sum()) / (x["br_down"].rolling(3).sum()) * 100)
    # ARBR spread
    df['arbr'] = df['ar'] - df['br']
    df.drop(columns=["pre_close", "br_up", "br_down", 'ar', 'br'], inplace=True)
    df.drop(columns=['weight_std20'], inplace=True, errors='ignore')
    df.drop(
        columns=['_is_positive', '_is_negative', '_pos_returns', '_neg_returns', '_pos_returns_sq', '_neg_returns_sq'],
        inplace=True, errors='ignore')
    # Everything not present on entry is reported as a new factor column
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
    return df, new_columns
def get_simple_factor(df):
old_columns = df.columns.tolist()[:]
df = df.sort_values(by=['ts_code', 'trade_date'])
alpha = 0.5
df['momentum_factor'] = df['volume_change_rate'] + alpha * df['turnover_deviation']
df['resonance_factor'] = df['volume_ratio'] * df['pct_chg']
df['log_close'] = np.log(df['close'])
df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']
df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']
df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']
df['obv_maobv_6'] = df['obv'] - df['maobv_6']
# 计算比值指标
df['std_return_5_over_std_return_90'] = df['std_return_5'] / df['std_return_90']
# df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']
# 计算标准差差值
df['std_return_90_minus_std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']
# df['cat_af1'] = df['act_factor1'] > 0
df['cat_af2'] = df['act_factor2'] > df['act_factor1']
df['cat_af3'] = df['act_factor3'] > df['act_factor2']
df['cat_af4'] = df['act_factor4'] > df['act_factor3']
# 计算 act_factor5 和 act_factor6
df['act_factor5'] = df['act_factor1'] + df['act_factor2'] + df['act_factor3'] + df['act_factor4']
df['act_factor6'] = (df['act_factor1'] - df['act_factor2']) / np.sqrt(
df['act_factor1'] ** 2 + df['act_factor2'] ** 2)
df['active_buy_volume_large'] = df['buy_lg_vol'] / df['net_mf_vol']
df['active_buy_volume_big'] = df['buy_elg_vol'] / df['net_mf_vol']
df['active_buy_volume_small'] = df['buy_sm_vol'] / df['net_mf_vol']
df['buy_lg_vol_minus_sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / df['net_mf_vol']
df['buy_elg_vol_minus_sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / df['net_mf_vol']
df['log_circ_mv'] = np.log(df['circ_mv'])
df['ctrl_strength'] = (df['cost_85pct'] - df['cost_15pct']) / (df['his_high'] - df['his_low'])
df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / (df['cost_50pct'] - df['cost_5pct'])
df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / (df['cost_50pct'] - df['cost_5pct'])
df['lock_factor'] = df['turnover_rate'] * (
1 - (df['cost_95pct'] - df['cost_5pct']) / (df['his_high'] - df['his_low']))
df['cat_vol_break'] = (df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2)
df['cost_atr_adj'] = (df['cost_95pct'] - df['cost_5pct']) / df['atr_14']
# 12. 小盘股筹码集中度
df['smallcap_concentration'] = (1 / df['log_circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])
df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &
(df['volume_ratio'] > 1.5) &
(df['winner_rate'] > 0.7))
df['mv_turnover_ratio'] = df['turnover_rate'] / df['log_circ_mv']
df['mv_adjusted_volume'] = df['vol'] / df['log_circ_mv']
df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / df['log_circ_mv'])
df['nonlinear_mv_volume'] = df['vol'] / df['log_circ_mv']
df['mv_volume_ratio'] = df['volume_ratio'] / df['log_circ_mv']
df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / df['log_circ_mv']
drop_columns = [col for col in df.columns if col.startswith('_')]
df.drop(columns=drop_columns, inplace=True, errors='ignore')
new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
return df, new_columns
import pandas as pd
import numpy as np
from scipy.stats import linregress # For factor 4 (if implementing slope directly)
# from hurst import compute_Hc # For factor 18, needs pip install hurst
# import statsmodels.api as sm # For factor 16, needs pip install statsmodels
# --- Constants ---
epsilon = 1e-10  # Additive guard used in denominators to prevent division by zero
# --- Helper Functions ---
def _safe_divide(a, b, default_val=0):
"""Safe division, returns default_val for division by zero or NaN/inf results."""
with np.errstate(divide='ignore', invalid='ignore'):
result = a / b
# Replace NaN, Inf, -Inf resulting from division or invalid ops
result[~np.isfinite(result)] = default_val
return result
# --- Factor Calculation Functions (In-Place Modification) ---
# Category 1: Large Player Intent & Behavior
def lg_flow_mom_corr(df: pd.DataFrame, N: int = 20, M: int = 60, factor_name: str = None):
"""
Calculates Factor 1: Large Flow & Price Momentum Concordance (In-place).
WARNING: Modifies df in-place.
"""
if factor_name is None:
factor_name = f'lg_flow_mom_corr_{N}_{M}'
print(f"Calculating {factor_name}...")
_temp_cols = ['_net_lg_flow_val', '_rolling_net_lg_flow', '_price_mom']
try:
df['_net_lg_flow_val'] = (df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']) * df['close']
df['_rolling_net_lg_flow'] = df.groupby('ts_code')['_net_lg_flow_val'].rolling(N, min_periods=max(1, N // 2)).sum().reset_index(level=0, drop=True)
df['_price_mom'] = df.groupby('ts_code')['close'].pct_change(N)
# Calculate correlation on the temporary Series to handle alignment
factor_series = df['_rolling_net_lg_flow'].rolling(M, min_periods=max(1, M // 2)).corr(df['_price_mom'])
df[factor_name] = factor_series
except Exception as e:
print(f"Error calculating {factor_name}: {e}")
df[factor_name] = np.nan # Assign NaN on error
finally:
# Cleanup intermediate columns
cols_to_drop = [col for col in _temp_cols if col in df.columns]
if cols_to_drop:
df.drop(columns=cols_to_drop, inplace=True)
print(f"Finished {factor_name}.")
def lg_buy_consolidation(df: pd.DataFrame, N: int = 20, vol_quantile: float = 0.2, factor_name: str = None):
    """
    Calculates Factor 2: Large Buying during Consolidation (In-place).

    Average net large-order inflow ratio over the last N days, reported only
    for stocks whose N-day price volatility lies below the cross-sectional
    ``vol_quantile`` threshold on that date (NaN elsewhere).

    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'lg_buy_consolidation_{N}'
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_rolling_std', '_net_lg_flow_ratio', '_rolling_net_lg_flow_ratio_mean', '_std_threshold']
    min_obs = max(1, N // 2)
    try:
        # N-day price volatility per stock
        df['_rolling_std'] = df.groupby('ts_code')['close'].rolling(N, min_periods=min_obs).std().reset_index(level=0, drop=True)
        # Net large-order flow as a share of total volume
        net_lg = df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']
        df['_net_lg_flow_ratio'] = _safe_divide(net_lg, df['vol'])
        df['_rolling_net_lg_flow_ratio_mean'] = df.groupby('ts_code')['_net_lg_flow_ratio'].rolling(N, min_periods=min_obs).mean().reset_index(level=0, drop=True)
        # Per-date volatility cutoff; only "quiet" stocks keep a value
        df['_std_threshold'] = df.groupby('trade_date')['_rolling_std'].transform(lambda x: x.quantile(vol_quantile))
        quiet = df['_rolling_std'] < df['_std_threshold']
        df[factor_name] = df['_rolling_net_lg_flow_ratio_mean'].where(quiet)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
def lg_flow_accel(df: pd.DataFrame, factor_name: str = 'lg_flow_accel'):
    """
    Calculates Factor 3: Large Flow Acceleration (In-place).

    Second difference of the net large + extra-large order volume: the first
    difference is taken per stock, the second over the resulting series.
    NOTE(review): the second diff runs over the flat series, so the first
    valid row of each subsequent stock mixes groups — replicated as-is.

    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_net_lg_flow_vol']
    try:
        df['_net_lg_flow_vol'] = (df['buy_lg_vol'] + df['buy_elg_vol']
                                  - df['sell_lg_vol'] - df['sell_elg_vol'])
        per_stock = df.groupby('ts_code')['_net_lg_flow_vol']
        df[factor_name] = per_stock.diff(1).diff(1)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        stale = [c for c in _temp_cols if c in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
    print(f"Finished {factor_name}.")
def intraday_lg_flow_corr(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Calculates Factor 4: (Approx) Intraday Trend & Large Flow Correlation (In-place).

    NOTE: Direct rolling correlation between two rolling series is complex/slow in pandas.
    This provides a placeholder or requires significant optimization/pre-calculation.
    WARNING: Modifies df in-place. Placeholder implementation returns NaN.
    """
    if factor_name is None:
        factor_name = f'intraday_lg_flow_corr_{N}'
    print(f"Calculating {factor_name} (Placeholder - complex implementation)...")
    # Placeholder column: a real implementation would need a per-stock rolling
    # correlation of two derived intraday series.
    df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
# Category 2: Cost Basis & PnL Status
def profit_pressure(df: pd.DataFrame, factor_name: str = 'profit_pressure'):
    """
    Calculates Factor 5: Profit Pressure Index (In-place).

    winner_rate scaled by the mean paper-profit margin of the 85th/95th
    percentile cost holders; larger values suggest heavier latent selling
    pressure from profitable positions.

    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_profit_margin_85', '_profit_margin_95']
    try:
        # Margins of the two high-cost percentiles vs today's close
        df['_profit_margin_85'] = _safe_divide(df['close'], df['cost_85pct']) - 1
        df['_profit_margin_95'] = _safe_divide(df['close'], df['cost_95pct']) - 1
        df[factor_name] = df['winner_rate'] * 0.5 * (df['_profit_margin_85'] + df['_profit_margin_95'])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
def underwater_resistance(df: pd.DataFrame, factor_name: str = 'underwater_resistance'):
    """
    Calculates Factor 6: Resistance from Underwater Positions (In-place).

    Share of losing holders multiplied by how far the price sits below the
    15% cost line; both need to be large for meaningful overhead resistance.

    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_underwater_ratio', '_dist_to_cost_15']
    try:
        df['_underwater_ratio'] = 1.0 - df['winner_rate']
        # Zero whenever the price is already at or above the 15% cost line
        df['_dist_to_cost_15'] = np.maximum(0, df['cost_15pct'] - df['close']) / (df['close'] + epsilon)
        df[factor_name] = df['_underwater_ratio'] * df['_dist_to_cost_15']
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
def cost_conc_std(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Calculates Factor 7: Cost Concentration Change (Std Dev) (In-place).

    Rolling N-day standard deviation, per stock, of the 85%-15% cost band
    width normalized by the weighted average cost.

    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'cost_conc_std_{N}'
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_cost_range_norm']
    try:
        band = df['cost_85pct'] - df['cost_15pct']
        df['_cost_range_norm'] = _safe_divide(band, (df['weight_avg'] + epsilon))
        # The rolling std must be taken before the temp column is dropped
        per_stock = df.groupby('ts_code')['_cost_range_norm']
        df[factor_name] = per_stock.rolling(N, min_periods=max(1, N // 2)).std().reset_index(level=0, drop=True)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
def profit_decay(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Calculates Factor 8: Profit Expectation Decay (In-place).

    N-day price return divided by the N-day change in winner_rate: how much
    return it took to move the profit ratio over the window.

    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'profit_decay_{N}'
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_ret_N', '_winner_rate_change_N']
    try:
        lagged_close = df.groupby('ts_code')['close'].shift(N)
        df['_ret_N'] = _safe_divide(df['close'], lagged_close) - 1
        df['_winner_rate_change_N'] = df.groupby('ts_code')['winner_rate'].diff(N)
        df[factor_name] = _safe_divide(df['_ret_N'], df['_winner_rate_change_N'])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
# Category 3: Volatility Source & Market State
def vol_amp_loss(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Calculates Factor 9: Volatility Amplification when Underwater (In-place).

    N-day return volatility times the relative distance the price sits below
    the average holder cost (zero when holders are in profit on average).

    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'vol_amp_loss_{N}'
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_vol_N', '_loss_degree']
    try:
        per_stock = df.groupby('ts_code')['pct_chg']
        df['_vol_N'] = per_stock.rolling(N, min_periods=max(1, N // 2)).std().reset_index(level=0, drop=True)
        df['_loss_degree'] = np.maximum(0, df['weight_avg'] - df['close']) / (df['close'] + epsilon)
        df[factor_name] = df['_vol_N'] * df['_loss_degree']
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
def vol_drop_profit_cnt(df: pd.DataFrame, N: int = 20, M: int = 5, profit_thresh: float = 0.1, drop_thresh: float = -0.03, vol_multiple: float = 2.0, factor_name: str = None):
    """
    Calculates Factor 10: High Volume Drop when Profitable (Count over M days) (In-place).

    Counts, over the last M days, the days on which a stock was clearly in
    profit (close above weight_avg by ``profit_thresh``), fell by more than
    |drop_thresh|, and traded on abnormally high volume (above the N-day
    mean + ``vol_multiple`` standard deviations).

    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'vol_drop_profit_cnt_{M}'
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_is_profitable', '_is_dropping', '_rolling_mean_vol', '_rolling_std_vol', '_is_high_vol', '_event']
    try:
        gain_floor = df['weight_avg'] * (1 + profit_thresh)
        df['_is_profitable'] = df['close'] > gain_floor
        df['_is_dropping'] = df['pct_chg'] < drop_thresh
        vol_by_stock = df.groupby('ts_code')['vol']
        df['_rolling_mean_vol'] = vol_by_stock.rolling(N, min_periods=1).mean().reset_index(level=0, drop=True)
        df['_rolling_std_vol'] = vol_by_stock.rolling(N, min_periods=2).std().reset_index(level=0, drop=True).fillna(0)
        df['_is_high_vol'] = df['vol'] > (df['_rolling_mean_vol'] + vol_multiple * df['_rolling_std_vol'])
        # A qualifying day needs all three conditions at once
        df['_event'] = (df['_is_profitable'] & df['_is_dropping'] & df['_is_high_vol']).astype(int)
        df[factor_name] = df.groupby('ts_code')['_event'].rolling(M, min_periods=1).sum().reset_index(level=0, drop=True)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
def lg_flow_vol_interact(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Calculates Factor 11: Large Flow Driven Volatility (Interaction Term) (In-place).

    N-day return volatility multiplied by the N-day mean absolute share of
    large-order money in total traded value — large values indicate
    volatility coinciding with heavy institutional participation.

    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'lg_flow_vol_interact_{N}'
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_vol_N', '_net_lg_flow_val', '_total_val', '_abs_net_lg_flow_ratio', '_abs_net_lg_flow_ratio_N']
    min_obs = max(1, N // 2)
    try:
        df['_vol_N'] = df.groupby('ts_code')['pct_chg'].rolling(N, min_periods=min_obs).std().reset_index(level=0, drop=True)
        net_flow = df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']
        df['_net_lg_flow_val'] = net_flow * df['close']
        df['_total_val'] = df['vol'] * df['close']
        df['_abs_net_lg_flow_ratio'] = abs(df['_net_lg_flow_val']) / (df['_total_val'] + epsilon)
        df['_abs_net_lg_flow_ratio_N'] = df.groupby('ts_code')['_abs_net_lg_flow_ratio'].rolling(N, min_periods=min_obs).mean().reset_index(level=0, drop=True)
        df[factor_name] = df['_vol_N'] * df['_abs_net_lg_flow_ratio_N']
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
def cost_break_confirm_cnt(df: pd.DataFrame, M: int = 5, factor_name: str = None):
    """
    Calculates Factor 12: Cost Breakout Confirmation Count (In-place).

    Net count over the last M days of flow-confirmed cost-band breakouts:
    +1 when the close crosses above yesterday's 85% cost line on net large
    buying, -1 when it drops below yesterday's 15% cost line on net large
    selling.

    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'cost_break_confirm_cnt_{M}'
    print(f"Calculating {factor_name}...")
    _temp_cols = ['_prev_cost_85', '_prev_cost_15', '_break_up', '_break_down', '_net_lg_flow_vol', '_confirm_up', '_confirm_down', '_net_confirm']
    try:
        by_stock = df.groupby('ts_code')
        df['_prev_cost_85'] = by_stock['cost_85pct'].shift(1)
        df['_prev_cost_15'] = by_stock['cost_15pct'].shift(1)
        df['_break_up'] = df['close'] > df['_prev_cost_85']
        df['_break_down'] = df['close'] < df['_prev_cost_15']
        df['_net_lg_flow_vol'] = df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']
        # Breakouts only count when the flow direction agrees
        df['_confirm_up'] = (df['_break_up'] & (df['_net_lg_flow_vol'] > 0)).astype(int)
        df['_confirm_down'] = (df['_break_down'] & (df['_net_lg_flow_vol'] < 0)).astype(int)
        df['_net_confirm'] = df['_confirm_up'] - df['_confirm_down']
        df[factor_name] = df.groupby('ts_code')['_net_confirm'].rolling(M, min_periods=1).sum().reset_index(level=0, drop=True)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
# Category 4: Technical Indicators & Market Behavior
def atr_norm_channel_pos(df: pd.DataFrame, N: int = 14, factor_name: str = None):
    """Factor 13: distance of close above the N-day low, expressed in ATR(N) units.
    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'atr_norm_channel_pos_{N}'
    print(f"Calculating {factor_name}...")
    scratch = ['_prev_close', '_h_l', '_h_pc', '_l_pc', '_tr', '_atr_N', '_roll_low_N']
    try:
        df['_prev_close'] = df.groupby('ts_code')['close'].shift(1)
        # True range: widest of today's range and the two gap legs vs. prior close.
        hl = df['high'] - df['low']
        h_pc = (df['high'] - df['_prev_close']).abs()
        l_pc = (df['low'] - df['_prev_close']).abs()
        df['_tr'] = pd.concat([hl, h_pc, l_pc], axis=1).max(axis=1)
        min_p = max(1, N // 2)
        df['_atr_N'] = df.groupby('ts_code')['_tr'].rolling(N, min_periods=min_p).mean().reset_index(level=0, drop=True)
        df['_roll_low_N'] = df.groupby('ts_code')['low'].rolling(N, min_periods=min_p).min().reset_index(level=0, drop=True)
        df[factor_name] = _safe_divide(df['close'] - df['_roll_low_N'], df['_atr_N'])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def turnover_diff_skew(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 14: rolling skewness of day-over-day turnover-rate changes, per stock.
    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'turnover_diff_skew_{N}'
    print(f"Calculating {factor_name}...")
    scratch = ['_turnover_diff']
    try:
        # Turnover rate is assumed to already be in percentage points, so a
        # plain first difference is meaningful.
        df['_turnover_diff'] = df.groupby('ts_code')['turnover_rate'].diff(1)
        min_p = max(3, N // 2)
        rolled = df.groupby('ts_code')['_turnover_diff'].rolling(N, min_periods=min_p).skew()
        df[factor_name] = rolled.reset_index(level=0, drop=True)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def lg_sm_flow_diverge(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 15: N-day mean gap between large-order and small-order net flow.

    Each flow is first normalized by total volume, so the factor captures a
    sustained divergence between institutional-sized and retail-sized orders.
    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'lg_sm_flow_diverge_{N}'
    print(f"Calculating {factor_name}...")
    scratch = ['_lg_flow_ratio', '_sm_flow_ratio', '_lg_flow_ratio_N', '_sm_flow_ratio_N']
    try:
        net_big = (df['buy_lg_vol'] + df['buy_elg_vol']) - (df['sell_lg_vol'] + df['sell_elg_vol'])
        net_small = df['buy_sm_vol'] - df['sell_sm_vol']
        df['_lg_flow_ratio'] = _safe_divide(net_big, df['vol'])
        df['_sm_flow_ratio'] = _safe_divide(net_small, df['vol'])
        min_p = max(1, N // 2)
        for raw, smooth in (('_lg_flow_ratio', '_lg_flow_ratio_N'), ('_sm_flow_ratio', '_sm_flow_ratio_N')):
            df[smooth] = df.groupby('ts_code')[raw].rolling(N, min_periods=min_p).mean().reset_index(level=0, drop=True)
        df[factor_name] = df['_lg_flow_ratio_N'] - df['_sm_flow_ratio_N']
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cap_neutral_cost_metric(df: pd.DataFrame, factor_name: str = 'cap_neutral_cost_metric'):
    """Factor 16: market-cap neutralized cost metric (placeholder).

    The real version needs a cross-sectional regression (statsmodels); until it
    is implemented the column is filled with NaN so the schema stays stable.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name} (Placeholder - requires statsmodels)...")
    df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
def pullback_strong(df: pd.DataFrame, N: int = 20, M: int = 20, gain_thresh: float = 0.2, factor_name: str = None):
    """Factor 17: pullback depth from the N-day high, scaled by the M-day gain.

    NOTE: ``gain_thresh`` is accepted for interface stability but is not used
    by the current formula.
    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'pullback_strong_{N}_{M}'
    print(f"Calculating {factor_name}...")
    scratch = ['_high_N', '_pullback_depth', '_recent_gain_M']
    try:
        min_p = max(1, N // 2)
        df['_high_N'] = df.groupby('ts_code')['high'].rolling(N, min_periods=min_p).max().reset_index(level=0, drop=True)
        # Fractional distance below the rolling high.
        df['_pullback_depth'] = _safe_divide(df['_high_N'] - df['close'], df['_high_N'])
        # Simple M-day return as a "strength" denominator.
        df['_recent_gain_M'] = _safe_divide(df['close'], df.groupby('ts_code')['close'].shift(M)) - 1
        df[factor_name] = _safe_divide(df['_pullback_depth'], df['_recent_gain_M'])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def hurst_exponent_flow(df: pd.DataFrame, N: int = 60, flow_col: str = 'net_mf_vol', factor_name: str = None):
    """Factor 18: Hurst exponent of a money-flow series (placeholder).

    A real implementation needs the third-party 'hurst' package plus a slow
    rolling apply; for now the factor column is always filled with NaN.
    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'hurst_{flow_col}_{N}'
    print(f"Calculating {factor_name} (Placeholder - requires hurst library)...")
    try:
        from hurst import compute_Hc  # noqa: F401 - availability probe only for now
        # Real logic would roll compute_Hc over df[flow_col] per ts_code,
        # e.g. groupby('ts_code')[flow_col].rolling(N).apply(...), which is slow.
        df[factor_name] = np.nan  # Placeholder
    except ImportError:
        print("Error: 'hurst' library not installed. Cannot calculate factor.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
def vol_wgt_hist_pos(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 19: position inside the historical price range, weighted by
    today's volume relative to its N-day mean.
    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'vol_wgt_hist_pos_{N}'
    print(f"Calculating {factor_name}...")
    scratch = ['_hist_pos', '_rolling_mean_vol', '_vol_rel_strength']
    try:
        # Where close sits inside [his_low, his_high], clipped to [0, 1].
        span_pos = _safe_divide(df['close'] - df['his_low'], df['his_high'] - df['his_low'])
        df['_hist_pos'] = span_pos.clip(0, 1)
        min_p = max(1, N // 2)
        df['_rolling_mean_vol'] = df.groupby('ts_code')['vol'].rolling(N, min_periods=min_p).mean().reset_index(level=0, drop=True)
        df['_vol_rel_strength'] = _safe_divide(df['vol'], df['_rolling_mean_vol'])
        df[factor_name] = df['_hist_pos'] * df['_vol_rel_strength']
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def vol_adj_roc(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 20: N-day rate of change divided by N-day return volatility.
    WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f'vol_adj_roc_{N}'
    print(f"Calculating {factor_name}...")
    scratch = ['_roc_N', '_vol_N']
    try:
        lagged_close = df.groupby('ts_code')['close'].shift(N)
        df['_roc_N'] = _safe_divide(df['close'], lagged_close) - 1
        min_p = max(2, N // 2)
        rolled_std = df.groupby('ts_code')['pct_chg'].rolling(N, min_periods=min_p).std().reset_index(level=0, drop=True)
        df['_vol_N'] = rolled_std.fillna(0)
        df[factor_name] = _safe_divide(df['_roc_N'], df['_vol_N'])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def calculate_complex_factor(df: pd.DataFrame, factor_name: str = "complex_factor_deap_1"):
    """
    Expression: sub(protected_div_torch(A, B), C)
    where A, B, C and their internal components are built from several
    pre-computed factor columns that must already exist on ``df``.

    Args:
        df (pd.DataFrame): frame containing every base factor column referenced below.
        factor_name (str): name of the new factor column written onto ``df``.

    WARNING: this function modifies ``df`` in-place.
    If any required column is missing, an error is printed and the factor
    column is filled with NaN.
    """
    print(f"开始计算因子: {factor_name} (原地修改)...")
    # Only these intermediates are materialised as DataFrame columns; all other
    # sub-terms below are plain local Series and need no cleanup.
    _temp_cols_list = ['_temp_D', '_temp_A', '_temp_F', '_temp_H', '_temp_B', '_temp_C']
    try:
        # --- Component D ---
        # D = sub(mul(pullback_strong_20_20, div(log_close, industry_return_5)),
        #         div(add(vol_adj_roc_20, vol_drop_profit_cnt_5), sub(nonlinear_mv_volume, alpha_007)))
        _temp_d_term1_div = _safe_divide(df['log_close'], df['industry_return_5'])
        _temp_d_term1 = df['pullback_strong_20_20'] * _temp_d_term1_div
        _temp_d_term2_sub = df['nonlinear_mv_volume'] - df['alpha_007']
        _temp_d_term2_add = df['vol_adj_roc_20'] + df['vol_drop_profit_cnt_5']
        _temp_d_term2 = _safe_divide(_temp_d_term2_add, _temp_d_term2_sub)
        df['_temp_D'] = _temp_d_term1 - _temp_d_term2
        # --- Component A ---
        # A = add(add(mul(D, lg_buy_consolidation_20), lg_buy_consolidation_20), pullback_strong_20_20)
        _temp_a_term1 = df['_temp_D'] * df['lg_buy_consolidation_20']
        _temp_a_term2 = _temp_a_term1 + df['lg_buy_consolidation_20']
        df['_temp_A'] = _temp_a_term2 + df['pullback_strong_20_20']
        # --- Component F ---
        # F = mul(add(net_mf_vol, std_return_5), sub(arbr, industry_act_factor5))
        _temp_f_term1 = df['net_mf_vol'] + df['std_return_5']
        _temp_f_term2 = df['arbr'] - df['industry_act_factor5']
        df['_temp_F'] = _temp_f_term1 * _temp_f_term2
        # --- Component H ---
        # H = add(add(industry_act_factor1, low_cost_dev), mul(mv_weighted_turnover, act_factor4))
        _temp_h_term1 = df['industry_act_factor1'] + df['low_cost_dev']
        _temp_h_term2 = df['mv_weighted_turnover'] * df['act_factor4']
        df['_temp_H'] = _temp_h_term1 + _temp_h_term2
        # --- Component B ---
        # B = div(add(add(F, vol), H), lg_elg_buy_prop)
        _temp_b_term1 = df['_temp_F'] + df['vol']
        _temp_b_term2 = _temp_b_term1 + df['_temp_H']
        df['_temp_B'] = _safe_divide(_temp_b_term2, df['lg_elg_buy_prop'])
        # --- Component C ---
        # C = div(div(intraday_lg_flow_corr_20, lg_elg_buy_prop), lg_elg_buy_prop)
        # NOTE: intraday_lg_flow_corr_20 may be absent or NaN, hence df.get().
        _temp_c_term1 = _safe_divide(df.get('intraday_lg_flow_corr_20', np.nan), df['lg_elg_buy_prop'])
        df['_temp_C'] = _safe_divide(_temp_c_term1, df['lg_elg_buy_prop'])
        # --- Final expression: final = sub(div(A, B), C) ---
        _temp_final_term1 = _safe_divide(df['_temp_A'], df['_temp_B'])
        final_factor_series = _temp_final_term1 - df['_temp_C']
        # Assign the result onto df (in-place modification).
        df[factor_name] = final_factor_series
        print(f"因子 {factor_name} 计算成功。")
    except KeyError as e:
        # A required input column is missing.
        print(f"错误: 计算 {factor_name} 时缺少必需的列: {e}")
        print("请确保输入的 DataFrame 包含所有表达式中引用的因子列。")
        # BUGFIX: this message previously lacked the f-prefix and printed the
        # literal text "{factor_name}".
        print(f"将为因子 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan  # fill NaN on failure
    except Exception as e:
        # Any other unexpected computation error.
        print(f"错误: 计算 {factor_name} 时发生意外错误: {e}")
        print(f"将为因子 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan  # fill NaN on failure
    finally:
        # --- Drop whichever intermediate columns actually got created ---
        cols_to_drop = [col for col in _temp_cols_list if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
        print(f"因子 {factor_name} 计算流程结束。")
    # No return value: df is modified in-place.
import pandas as pd
import numpy as np
# from scipy.stats import rankdata # rankdata is not needed if using pandas rank
# import statsmodels.api as sm # Needed for factor 19
# --- Constants ---
# Tolerance below which a denominator is treated as zero (see _safe_divide
# and the ratio denominators further down this module).
epsilon = 1e-10  # Prevent division by zero
# --- Helper Functions ---
def _safe_divide(numerator, denominator, default_val=0):
    """
    Element-wise division that never raises on zero, NaN, or inf.

    Args:
        numerator (pd.Series): dividend values.
        denominator (pd.Series): divisor values.
        default_val (float): value used wherever the divisor is ~0 or the
            quotient comes out NaN/inf.

    Returns:
        pd.Series: quotient, re-indexed to the numerator's index when the
        numerator is a Series.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        # Coerce both operands to numeric, mapping unparseable values to NaN.
        num = pd.to_numeric(numerator, errors='coerce')
        den = pd.to_numeric(denominator, errors='coerce')
        # Divide only where the divisor is meaningfully non-zero.
        quotient = np.where(np.abs(den) > epsilon, num / den, default_val)
        quotient = pd.to_numeric(quotient, errors='coerce')
        # Replace residual NaN/inf produced by coercion or the division itself.
        quotient = np.nan_to_num(quotient, nan=default_val, posinf=default_val, neginf=default_val)
    if isinstance(numerator, pd.Series):
        return pd.Series(quotient, index=numerator.index)
    # Fallback for non-Series numerators (less likely in this module).
    return pd.Series(quotient)
# --- Cross-Sectional Factor Calculation Functions (In-Place Modification) ---
# Category 1: Cross-Sectional Flow & Behavior Strength
def cs_rank_net_lg_flow_val(df: pd.DataFrame, factor_name: str = 'cs_rank_net_lg_flow_val'):
    """
    Factor 1: daily cross-sectional percentile rank of net large-order value
    ((lg + elg buys - lg - elg sells) * close).
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_net_lg_flow_val']
    try:
        net_big_vol = (df['buy_lg_vol'] + df['buy_elg_vol']) - (df['sell_lg_vol'] + df['sell_elg_vol'])
        df['_net_lg_flow_val'] = net_big_vol * df['close']
        df[factor_name] = df.groupby('trade_date')['_net_lg_flow_val'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_flow_divergence(df: pd.DataFrame, factor_name: str = 'cs_rank_flow_divergence'):
    """
    Factor 2: daily percentile rank of the divergence between large-order and
    small-order net flow (each normalized by total volume).
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_lg_ratio', '_sm_ratio', '_divergence']
    try:
        net_big = (df['buy_lg_vol'] + df['buy_elg_vol']) - (df['sell_lg_vol'] + df['sell_elg_vol'])
        df['_lg_ratio'] = _safe_divide(net_big, df['vol'])
        df['_sm_ratio'] = _safe_divide(df['buy_sm_vol'] - df['sell_sm_vol'], df['vol'])
        df['_divergence'] = df['_lg_ratio'] - df['_sm_ratio']
        df[factor_name] = df.groupby('trade_date')['_divergence'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_industry_adj_lg_flow(df: pd.DataFrame, factor_name: str = 'cs_rank_ind_adj_lg_flow'):
    """
    Factor 3: daily percentile rank of net large-order value relative to the
    stock's industry mean. Requires a 'cat_l2_code' industry column.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_net_lg_flow_vol', '_industry_avg_flow', '_deviation']
    if 'cat_l2_code' not in df.columns:
        print(f"Error calculating {factor_name}: Missing 'cat_l2_code' column. Assigning NaN.")
        df[factor_name] = np.nan
        return
    try:
        net_big_vol = (df['buy_lg_vol'] + df['buy_elg_vol']) - (df['sell_lg_vol'] + df['sell_elg_vol'])
        df['_net_lg_flow_vol'] = net_big_vol * df['close']  # Or use vol
        df['_industry_avg_flow'] = df.groupby(['trade_date', 'cat_l2_code'])['_net_lg_flow_vol'].transform('mean')
        df['_deviation'] = df['_net_lg_flow_vol'] - df['_industry_avg_flow']
        df[factor_name] = df.groupby('trade_date')['_deviation'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_elg_buy_ratio(df: pd.DataFrame, factor_name: str = 'cs_rank_elg_buy_ratio'):
    """
    Factor 4: daily percentile rank of extra-large buy volume as a share of
    total volume.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_elg_buy_ratio']
    try:
        df['_elg_buy_ratio'] = _safe_divide(df['buy_elg_vol'], df['vol'])
        df[factor_name] = df.groupby('trade_date')['_elg_buy_ratio'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
# Category 2: Cross-Sectional Cost Basis & PnL Status
def cs_rank_rel_profit_margin(df: pd.DataFrame, factor_name: str = 'cs_rank_rel_profit_margin'):
    """
    Factor 5: daily percentile rank of (close - weighted-average cost) / close.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_profit_margin']
    try:
        df['_profit_margin'] = _safe_divide(df['close'] - df['weight_avg'], df['close'])
        df[factor_name] = df.groupby('trade_date')['_profit_margin'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_cost_breadth(df: pd.DataFrame, factor_name: str = 'cs_rank_cost_breadth'):
    """
    Factor 6: daily percentile rank of the cost-distribution width
    ((cost_85pct - cost_15pct) / weighted-average cost).
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_cost_breadth']
    try:
        df['_cost_breadth'] = _safe_divide(df['cost_85pct'] - df['cost_15pct'], df['weight_avg'])
        df[factor_name] = df.groupby('trade_date')['_cost_breadth'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_dist_to_upper_cost(df: pd.DataFrame, factor_name: str = 'cs_rank_dist_to_upper_cost'):
    """
    Factor 7: daily percentile rank of close relative to the 95% cost level.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_dist_to_95']
    try:
        df['_dist_to_95'] = _safe_divide(df['close'], df['cost_95pct'])
        df[factor_name] = df.groupby('trade_date')['_dist_to_95'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_winner_rate(df: pd.DataFrame, factor_name: str = 'cs_rank_winner_rate'):
    """
    Factor 8: daily percentile rank of the winner (in-profit holders) rate.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    try:
        ranked = df.groupby('trade_date')['winner_rate'].rank(pct=True)
        df[factor_name] = ranked
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        print(f"Finished {factor_name}.")
# Category 3: Cross-Sectional Price Action & Volatility
def cs_rank_intraday_range(df: pd.DataFrame, factor_name: str = 'cs_rank_intraday_range'):
    """
    Factor 9: daily percentile rank of the intraday amplitude (high - low) / close.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_norm_range']
    try:
        df['_norm_range'] = _safe_divide(df['high'] - df['low'], df['close'])
        df[factor_name] = df.groupby('trade_date')['_norm_range'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_close_pos_in_range(df: pd.DataFrame, factor_name: str = 'cs_rank_close_pos_in_range'):
    """
    Factor 10: daily percentile rank of where close sits inside the day's
    high-low range (0.5 when high == low).
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_close_pos']
    try:
        # default_val=0.5 covers the degenerate high == low case.
        df['_close_pos'] = _safe_divide(df['close'] - df['low'], df['high'] - df['low'], default_val=0.5)
        df[factor_name] = df.groupby('trade_date')['_close_pos'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_opening_gap(df: pd.DataFrame, factor_name: str = 'cs_rank_opening_gap'):
    """
    Factor 11: daily percentile rank of the opening gap (open / pre_close - 1).
    Requires a 'pre_close' column.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_gap']
    if 'pre_close' not in df.columns:
        print(f"Error calculating {factor_name}: Missing 'pre_close' column. Assigning NaN.")
        df[factor_name] = np.nan
        return
    try:
        df['_gap'] = _safe_divide(df['open'], df['pre_close']) - 1
        df[factor_name] = df.groupby('trade_date')['_gap'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e} (likely 'open'). Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_pos_in_hist_range(df: pd.DataFrame, factor_name: str = 'cs_rank_pos_in_hist_range'):
    """
    Factor 12: daily percentile rank of close's position inside the
    [his_low, his_high] historical range, clipped to [0, 1].
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_hist_pos']
    try:
        span_pos = _safe_divide(df['close'] - df['his_low'], df['his_high'] - df['his_low'])
        df['_hist_pos'] = span_pos.clip(0, 1)  # keep the position inside [0, 1]
        df[factor_name] = df.groupby('trade_date')['_hist_pos'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
# Category 4: Cross-Sectional Interaction & Composite Indicators
def cs_rank_vol_x_profit_margin(df: pd.DataFrame, factor_name: str = 'cs_rank_vol_x_profit_margin'):
    """
    Factor 13: daily percentile rank of |pct_chg| multiplied by the relative
    profit margin ((close - weight_avg) / close).
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_daily_vol', '_profit_margin', '_interaction']
    try:
        df['_daily_vol'] = df['pct_chg'].abs()
        df['_profit_margin'] = _safe_divide(df['close'] - df['weight_avg'], df['close'])
        df['_interaction'] = df['_daily_vol'] * df['_profit_margin']
        df[factor_name] = df.groupby('trade_date')['_interaction'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_lg_flow_price_concordance(df: pd.DataFrame, factor_name: str = 'cs_rank_lg_flow_price_concordance'):
    """
    Factor 14: daily percentile rank of net large-order volume times pct_chg
    (positive when flow direction and price move agree).
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_net_lg_flow_vol', '_concordance']
    try:
        net_big_vol = (df['buy_lg_vol'] + df['buy_elg_vol']) - (df['sell_lg_vol'] + df['sell_elg_vol'])
        df['_net_lg_flow_vol'] = net_big_vol
        df['_concordance'] = df['_net_lg_flow_vol'] * df['pct_chg']
        df[factor_name] = df.groupby('trade_date')['_concordance'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_turnover_per_winner(df: pd.DataFrame, factor_name: str = 'cs_rank_turnover_per_winner'):
    """
    Factor 15: daily percentile rank of turnover rate per unit of winner rate.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_turnover_per_winner']
    try:
        df['_turnover_per_winner'] = _safe_divide(df['turnover_rate'], df['winner_rate'])
        df[factor_name] = df.groupby('trade_date')['_turnover_per_winner'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_ind_cap_neutral_pe(df: pd.DataFrame, factor_name: str = 'cs_rank_ind_cap_neutral_pe'):
    """
    Factor 16: industry/market-cap neutralized PE rank (placeholder).

    The real version needs a cross-sectional regression (statsmodels); until it
    exists the column is filled with NaN so the schema stays stable.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name} (Placeholder - requires statsmodels)...")
    df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
def cs_rank_volume_ratio(df: pd.DataFrame, factor_name: str = 'cs_rank_volume_ratio'):
    """
    Factor 17: daily percentile rank of the pre-existing 'volume_ratio' (量比) column.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    try:
        # Relies on an already-computed 'volume_ratio' column.
        ranked = df.groupby('trade_date')['volume_ratio'].rank(pct=True)
        df[factor_name] = ranked
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        print(f"Finished {factor_name}.")
def cs_rank_elg_buy_sell_sm_ratio(df: pd.DataFrame, factor_name: str = 'cs_rank_elg_buy_sell_sm_ratio'):
    """
    Factor 18: daily percentile rank of extra-large buy volume over small-order
    sell volume.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_ratio']
    try:
        df['_ratio'] = _safe_divide(df['buy_elg_vol'], df['sell_sm_vol'])
        df[factor_name] = df.groupby('trade_date')['_ratio'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_cost_dist_vol_ratio(df: pd.DataFrame, factor_name: str = 'cs_rank_cost_dist_vol_ratio'):
    """
    Factor 19: daily percentile rank of |close - weight_avg| / close times the
    volume ratio. Requires a 'volume_ratio' column.
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_dist', '_interaction']
    if 'volume_ratio' not in df.columns:
        print(f"Error calculating {factor_name}: Missing 'volume_ratio' column. Assigning NaN.")
        df[factor_name] = np.nan
        return
    try:
        # epsilon keeps the denominator strictly non-zero.
        df['_dist'] = (df['close'] - df['weight_avg']).abs() / (df['close'] + epsilon)
        df['_interaction'] = df['_dist'] * df['volume_ratio']
        df[factor_name] = df.groupby('trade_date')['_interaction'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_size(df: pd.DataFrame, factor_name: str = 'cs_rank_size'):
    """
    Factor 20: daily percentile rank of log(1 + circulating market value).
    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    scratch = ['_log_circ_mv']
    try:
        # log1p keeps the transform defined even for zero / tiny market values.
        df['_log_circ_mv'] = np.log1p(df['circ_mv'])
        df[factor_name] = df.groupby('trade_date')['_log_circ_mv'].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.")
        df[factor_name] = np.nan
    finally:
        leftover = [c for c in scratch if c in df.columns]
        if leftover:
            df.drop(columns=leftover, inplace=True)
        print(f"Finished {factor_name}.")