import numpy as np
import pandas as pd
import talib


def get_rolling_factor(df):
    """Compute per-stock rolling money-flow, chip-distribution and technical factors.

    Expects a long-format DataFrame with at least 'ts_code' and 'trade_date'
    plus the Tushare-style flow/chip columns referenced below. The frame is
    sorted by (ts_code, trade_date) and new factor columns are added.

    Returns:
        (df, new_columns): the augmented DataFrame and the list of column
        names added by this function.
    """
    old_columns = df.columns.tolist()[:]
    # Sort by stock and date (if not already sorted)
    df = df.sort_values(by=['ts_code', 'trade_date'])
    # NOTE(review): 'grouped' holds a reference to the (sorted) frame; columns
    # added to df later in this function are visible through it because the
    # frame is mutated in place — confirm this assumption holds on upgrade of
    # the pandas version in use.
    grouped = df.groupby('ts_code', group_keys=False)
    epsilon = 1e-8
    # 1. Net large + extra-large order buying volume ("main force" net flow)
    df['lg_elg_net_buy_vol'] = df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']
    # Intensity of large-order flow relative to total volume
    df['flow_lg_elg_intensity'] = df['lg_elg_net_buy_vol'] / (df['vol'] + epsilon)
    # 2. Retail vs Institutional Divergence:
    # difference/ratio between small-order net inflow and (large + extra-large) net inflow
    df['sm_net_buy_vol'] = df['buy_sm_vol'] - df['sell_sm_vol']
    df['flow_divergence_diff'] = df['sm_net_buy_vol'] - df['lg_elg_net_buy_vol']
    # The ratio form may be more stable
    df['flow_divergence_ratio'] = df['sm_net_buy_vol'] / (
            df['lg_elg_net_buy_vol'] + np.sign(df['lg_elg_net_buy_vol']) * epsilon + epsilon)  # guard against 0/0
    # 3. Flow Structure Change — relative strength of large-order flow:
    # change in the share of (large + extra-large) buy volume in total buy volume
    df['total_buy_vol'] = df['buy_sm_vol'] + df['buy_lg_vol'] + df['buy_elg_vol']
    df['lg_elg_buy_prop'] = (df['buy_lg_vol'] + df['buy_elg_vol']) / (df['total_buy_vol'] + epsilon)
    df['flow_struct_buy_change'] = grouped['lg_elg_buy_prop'].diff(1)  # 1-day change
    # 4. Flow Acceleration: rate of change (second difference) of net large flow
    df['lg_elg_net_buy_vol_change'] = grouped['lg_elg_net_buy_vol'].diff(1)
    df['flow_lg_elg_accel'] = grouped['lg_elg_net_buy_vol_change'].diff(1)
    # # 5. Extreme Flow Event (Categorical)
    # # Whether large-flow intensity is at a historical extreme
    # # (e.g. above the 90th or below the 10th percentile of the past N days)
    # rolling_window = 20  # adjustable window
    # # Step 1: Calculate the rolling quantiles separately
    # rolling_high = grouped['flow_lg_elg_intensity'].rolling(rolling_window, min_periods=1).quantile(0.9)  # min_periods=1 keeps output before the window fills
    # rolling_low = grouped['flow_lg_elg_intensity'].rolling(rolling_window, min_periods=1).quantile(0.1)
    # # Step 2: Assign the results to the DataFrame
    # # Make sure df and rolling_high/low share the same index;
    # # this is usually safe if df's index was not modified in between
    # df['flow_lg_elg_intensity_rolling_high'] = rolling_high
    # df['flow_lg_elg_intensity_rolling_low'] = rolling_low
    # # Step 3: Continue with the logic using the new columns
    # conditions_flow = [
    #     df['flow_lg_elg_intensity'] > df['flow_lg_elg_intensity_rolling_high'],
    #     df['flow_lg_elg_intensity'] < df['flow_lg_elg_intensity_rolling_low']
    # ]
    # choices_flow = [1, -1]  # 1: extreme inflow, -1: extreme outflow
    # df['cat_extreme_flow'] = np.select(conditions_flow, choices_flow, default=0)

    # --- Chip (cost) distribution factors ---
    # 6. Chip Concentration: tightness of the cost distribution, e.g. spread
    # between the 95% and 5% cost prices, normalised by the current close
    df['chip_concentration_range'] = (df['cost_95pct'] - df['cost_5pct']) / (df['close'] + epsilon)
    # 7. Chip Distribution Skewness proxy: compare the median cost (cost_50pct)
    # with the weighted average cost; weight_avg > cost_50pct suggests more
    # chips sit at high cost (right skew)
    df['chip_skewness'] = (df['weight_avg'] - df['cost_50pct']) / (df['cost_50pct'] + epsilon)
    # 8. Floating Chips proxy: winner_rate weighted by how far the close sits
    # above the 15% cost line (only counted when price is above that line)
    price_dist_cost15 = (df['close'] - df['cost_15pct']) / (df['close'] + epsilon)
    df['floating_chip_proxy'] = df['winner_rate'] * np.maximum(0, price_dist_cost15)
    # 9. Cost Support Strength Change: rate of change of the low cost quantile
    # (is the support level moving up or down?)
    df['cost_support_15pct_change'] = grouped['cost_15pct'].pct_change(1) * 100  # percent change
    # 10. Winner Rate Zone & Price Position (categorical):
    # e.g. price above 85% cost line & winner_rate > 0.8 -> distribution-risk zone;
    # price below 15% cost line & winner_rate < 0.2 -> accumulation-potential zone
    conditions_winner = [
        (df['close'] > df['cost_85pct']) & (df['winner_rate'] > 0.8),  # high price & high winners
        (df['close'] < df['cost_15pct']) & (df['winner_rate'] < 0.2),  # low price & few winners
        (df['close'] > df['cost_50pct']) & (df['winner_rate'] > 0.5),  # mid-high & majority winning
        (df['close'] < df['cost_50pct']) & (df['winner_rate'] < 0.5),  # mid-low & majority losing
    ]
    choices_winner = [1, 2, 3, 4]  # 1: high-risk, 2: low-potential, 3: mid-upper profit, 4: mid-lower loss
    df['cat_winner_price_zone'] = np.select(conditions_winner, choices_winner, default=0)  # 0: other
    # --- Combined factors ---
    # 11. Flow-Chip Consistency: large net buying occurring while price sits
    # near the lower chip-dense band (between cost_15pct and cost_50pct)
    price_near_low_support = (df['close'] > df['cost_15pct']) & (df['close'] < df['cost_50pct'])
    df['flow_chip_consistency'] = df['lg_elg_net_buy_vol'] * price_near_low_support.astype(int)
    # Could be further standardised or turned into a categorical
    # 12. Profit-Taking Pressure vs Absorption: with a high winner rate
    # (> 0.7), is the main force net selling (taking profit) or net buying
    # (absorbing at high levels)? Positive = absorption, negative = distribution.
    high_winner_rate_flag = (df['winner_rate'] > 0.7).astype(int)
    df['profit_taking_vs_absorb'] = df['lg_elg_net_buy_vol'] * high_winner_rate_flag
    # Clean up temporary columns / NaNs (optional; currently disabled)
    cols_to_drop = ['lg_elg_net_buy_vol', 'sm_net_buy_vol', 'total_buy_vol', 'lg_elg_buy_prop',
                    'lg_elg_net_buy_vol_change', 'flow_lg_elg_intensity_rolling_high',
                    'flow_lg_elg_intensity_rolling_low']
    # df = df.drop(columns=cols_to_drop)
    window = 20
    df['_is_positive'] = (df['pct_chg'] > 0).astype(int)
    df['_is_negative'] = (df['pct_chg'] < 0).astype(int)
    df['cat_is_positive'] = (df['pct_chg'] > 0).astype(int)
    # Split positive and negative returns (for separate means and squared means);
    # keep original magnitudes rather than clipping to 0 everywhere
    df['_pos_returns'] = df['pct_chg'].where(df['pct_chg'] > 0, 0)  # non-positive -> 0, so sums work
    df['_neg_returns'] = df['pct_chg'].where(df['pct_chg'] < 0, 0)  # non-negative -> 0, so sums work
    # Squared returns (for E[X^2])
    df['_pos_returns_sq'] = np.square(df['_pos_returns'])
    df['_neg_returns_sq'] = np.square(df['_neg_returns'])  # squaring makes negatives positive
    # 4. Rolling statistics (built-in rolling ops, reasonably fast)
    # Positive-return-day statistics
    rolling_pos_count = grouped['_is_positive'].rolling(window, min_periods=max(1, window // 2)).sum()
    rolling_pos_sum = grouped['_pos_returns'].rolling(window, min_periods=max(1, window // 2)).sum()
    rolling_pos_sum_sq = grouped['_pos_returns_sq'].rolling(window, min_periods=max(1, window // 2)).sum()
    # Negative-return-day statistics
    rolling_neg_count = grouped['_is_negative'].rolling(window, min_periods=max(1, window // 2)).sum()
    rolling_neg_sum = grouped['_neg_returns'].rolling(window, min_periods=max(1, window // 2)).sum()
    rolling_neg_sum_sq = grouped['_neg_returns_sq'].rolling(window, min_periods=max(1, window // 2)).sum()
    # 5. Variance and standard deviation via Var = E[X^2] - (E[X])^2
    pos_mean_sq = rolling_pos_sum_sq / rolling_pos_count
    pos_mean = rolling_pos_sum / rolling_pos_count
    pos_var = pos_mean_sq - np.square(pos_mean)
    pos_var = pos_var.where(rolling_pos_count >= 2, np.nan).clip(lower=0)  # need >=2 obs; clamp float noise
    upside_vol = np.sqrt(pos_var)
    neg_mean_sq = rolling_neg_sum_sq / rolling_neg_count
    neg_mean = rolling_neg_sum / rolling_neg_count  # note: neg_mean is negative
    neg_var = neg_mean_sq - np.square(neg_mean)
    neg_var = neg_var.where(rolling_neg_count >= 2, np.nan).clip(lower=0)
    downside_vol = np.sqrt(neg_var)
    # GroupBy.rolling results carry a (ts_code, index) MultiIndex; drop the
    # ts_code level so assignment aligns with df's index
    df['upside_vol'] = upside_vol.reset_index(level=0, drop=True)
    df['downside_vol'] = downside_vol.reset_index(level=0, drop=True)
    df['vol_ratio'] = df['upside_vol'] / df['downside_vol']
    df['vol_ratio'] = df['vol_ratio'].replace([np.inf, -np.inf], np.nan).fillna(0)  # or fillna(np.nan)
    df['return_skew'] = grouped['pct_chg'].rolling(window=5).skew().reset_index(0, drop=True)
    df['return_kurtosis'] = grouped['pct_chg'].rolling(window=5).kurt().reset_index(0, drop=True)
    # Factor 1: short-term volume change rate
    df['volume_change_rate'] = (
            grouped['vol'].rolling(window=2).mean() / grouped['vol'].rolling(window=10).mean() - 1
    ).reset_index(level=0, drop=True)  # align index
    # Factor 2: volume breakout signal
    max_volume = grouped['vol'].rolling(window=5).max().reset_index(level=0, drop=True)  # align index
    df['cat_volume_breakout'] = (df['vol'] > max_volume)
    # Factor 3: turnover deviation from its moving average
    mean_turnover = grouped['turnover_rate'].rolling(window=3).mean().reset_index(level=0, drop=True)
    std_turnover = grouped['turnover_rate'].rolling(window=3).std().reset_index(level=0, drop=True)
    df['turnover_deviation'] = (df['turnover_rate'] - mean_turnover) / std_turnover
    # Factor 4: turnover spike signal
    df['cat_turnover_spike'] = (df['turnover_rate'] > mean_turnover + 2 * std_turnover)
    # Factor 5: mean volume ratio
    df['avg_volume_ratio'] = grouped['volume_ratio'].rolling(window=3).mean().reset_index(level=0, drop=True)
    # Factor 6: volume-ratio breakout signal
    max_volume_ratio = grouped['volume_ratio'].rolling(window=5).max().reset_index(level=0, drop=True)
    df['cat_volume_ratio_breakout'] = (df['volume_ratio'] > max_volume_ratio)
    # 20-day mean volume per stock (group_keys=False keeps the original index)
    df['vol_spike'] = grouped.apply(
        lambda x: pd.Series(x['vol'].rolling(20).mean(), index=x.index)
    )
    # NOTE(review): pct_change here is per group, but the subsequent .rolling
    # runs over the whole concatenated Series, so windows can straddle two
    # stocks at group boundaries — verify this is intended.
    df['vol_std_5'] = grouped['vol'].pct_change().rolling(window=5).std()
    # ATR (Average True Range) via talib, computed per stock
    df['atr_14'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=14),
                            index=x.index)
    )
    df['atr_6'] = grouped.apply(
        lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=6),
                            index=x.index)
    )
    # OBV (On-Balance Volume) and its moving average
    df['obv'] = grouped.apply(
        lambda x: pd.Series(talib.OBV(x['close'].values, x['vol'].values), index=x.index)
    )
    print(df.columns)
    df['maobv_6'] = grouped.apply(
        lambda x: pd.Series(talib.SMA(x['obv'].values, timeperiod=6), index=x.index)
    )
    df['rsi_3'] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=3), index=x.index)
    )
    # df['rsi_6'] = grouped.apply(
    #     lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=6), index=x.index)
    # )
    # df['rsi_9'] = grouped.apply(
    #     lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=9), index=x.index)
    # )
    # N-day returns: return_5 / return_10 / return_20
    df['return_5'] = grouped['close'].apply(lambda x: x / x.shift(5) - 1)
    df['return_10'] = grouped['close'].apply(lambda x: x / x.shift(10) - 1)
    df['return_20'] = grouped['close'].apply(lambda x: x / x.shift(20) - 1)
    # df['avg_close_5'] = grouped['close'].apply(lambda x: x.rolling(window=5).mean() / x)
    # Rolling standard deviation of daily returns
    df['std_return_5'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=5).std())
    # df['std_return_15'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=15).std())
    # df['std_return_25'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=25).std())
    df['std_return_90'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=90).std())
    df['std_return_90_2'] = grouped['close'].apply(lambda x: x.shift(10).pct_change().rolling(window=90).std())
    # EMA indicators (per stock)
    df['_ema_5'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=5), index=x.index)
    )
    df['_ema_13'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=13), index=x.index)
    )
    df['_ema_20'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=20), index=x.index)
    )
    df['_ema_60'] = grouped['close'].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=60), index=x.index)
    )
    # act_factor1..4: EMA slope angle (arctan of 1-day EMA change in percent,
    # converted to degrees via 57.3) scaled per EMA period
    df['act_factor1'] = grouped['_ema_5'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 50
    )
    df['act_factor2'] = grouped['_ema_13'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 40
    )
    df['act_factor3'] = grouped['_ema_20'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 21
    )
    df['act_factor4'] = grouped['_ema_60'].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 10
    )
    # Cross-sectional percentile ranks per trade_date
    df['rank_act_factor1'] = df.groupby('trade_date', group_keys=False)['act_factor1'].rank(ascending=False, pct=True)
    df['rank_act_factor2'] = df.groupby('trade_date', group_keys=False)['act_factor2'].rank(ascending=False, pct=True)
    df['rank_act_factor3'] = df.groupby('trade_date', group_keys=False)['act_factor3'].rank(ascending=False, pct=True)
    df['log_circ_mv'] = np.log(df['circ_mv'])
    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5

    # Rolling covariance of high vs volume per stock
    def calculate_rolling_cov(group):
        return group['high'].rolling(window_high_volume).cov(group['vol'])

    df['cov'] = grouped.apply(calculate_rolling_cov)

    # Difference of the covariance per stock
    # (relies on 'grouped' seeing the 'cov' column just added — see NOTE above)
    def calculate_delta_cov(group):
        return group['cov'].diff(period_delta)

    df['delta_cov'] = grouped.apply(calculate_delta_cov)

    # Rolling standard deviation of close per stock
    def calculate_stddev_close(group):
        return group['close'].rolling(window_close_stddev).std()

    df['_stddev_close'] = grouped.apply(calculate_stddev_close)
    df['_rank_stddev'] = df.groupby('trade_date')['_stddev_close'].rank(pct=True)
    df['alpha_22_improved'] = -1 * df['delta_cov'] * df['_rank_stddev']
    # Candle body position within the day's range (0 when high == low)
    df['alpha_003'] = np.where(df['high'] != df['low'],
                               (df['close'] - df['open']) / (df['high'] - df['low']), 0)
    df['alpha_007'] = grouped.apply(lambda x: x['close'].rolling(5).corr(x['vol']))
    df['alpha_007'] = df.groupby('trade_date', group_keys=False)['alpha_007'].rank(ascending=True, pct=True)
    df['alpha_013'] = grouped['close'].transform(lambda x: x.rolling(5).sum() - x.rolling(20).sum())
    df['alpha_013'] = df.groupby('trade_date', group_keys=False)['alpha_013'].rank(ascending=True, pct=True)
    df['cat_up_limit'] = (df['close'] == df['up_limit'])  # limit-up flag
    df['cat_down_limit'] = (df['close'] == df['down_limit'])  # limit-down flag
    df['up_limit_count_10d'] = grouped['cat_up_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0, drop=True)
    df['down_limit_count_10d'] = grouped['cat_down_limit'].rolling(window=10, min_periods=1).sum().reset_index(level=0, drop=True)

    # 3. Most recent consecutive limit-up/limit-down day counts
    def calculate_consecutive_limits(series):
        """Count consecutive limit days using the run-id (shift/cumsum) trick.

        NOTE(review): consecutive_up and consecutive_down are computed from
        the identical expression — the 'down' variant looks like it was meant
        to invert the series; verify before relying on the second element.
        """
        consecutive_up = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)
        consecutive_down = series * (series.groupby((series != series.shift()).cumsum()).cumcount() + 1)
        return consecutive_up, consecutive_down

    # Consecutive limit-up day count
    df['consecutive_up_limit'] = grouped['cat_up_limit'].apply(
        lambda x: calculate_consecutive_limits(x)[0]
    )
    df['vol_break'] = np.where((df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2), 1, 0)
    df['weight_roc5'] = grouped['weight_avg'].apply(lambda x: x.pct_change(5))

    # 10-day rolling correlation between close returns and weighted-average-cost returns
    def rolling_corr(group):
        roc_close = group['close'].pct_change()
        roc_weight = group['weight_avg'].pct_change()
        return roc_close.rolling(10).corr(roc_weight)

    df['price_cost_divergence'] = grouped.apply(rolling_corr)
    df['smallcap_concentration'] = (1 / df['log_circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])
    # 16. Chip stability index (20-day volatility of weighted average cost)
    df['weight_std20'] = grouped['weight_avg'].apply(lambda x: x.rolling(20).std())
    df['cost_stability'] = df['weight_std20'] / grouped['weight_avg'].transform(lambda x: x.rolling(20).mean())
    # 17. Cost-range breakout marker: days (of last 5) closing above the 95% cost line
    df['high_cost_break_days'] = grouped.apply(lambda g: g['close'].gt(g['cost_95pct']).rolling(5).sum())
    # 20. Chip / liquidity risk
    df['liquidity_risk'] = (df['cost_95pct'] - df['cost_5pct']) * (
            1 / grouped['vol'].transform(lambda x: x.rolling(10).mean()))
    # 7. Market-cap volatility factor (uses grouped)
    df['turnover_std'] = grouped['turnover_rate'].transform(lambda x: x.rolling(window=20).std())
    df['mv_volatility'] = grouped.apply(lambda x: x['turnover_std'] / x['log_circ_mv'])
    # 8. Market-cap growth factor
    df['volume_growth'] = grouped['vol'].pct_change(periods=20)
    df['mv_growth'] = df['volume_growth'] / df['log_circ_mv']
    # AR indicator (buying/selling strength from open-relative ranges)
    df["ar"] = grouped.apply(
        lambda x: (x["high"].div(x["open"]).rolling(3).sum()) / (x["open"].div(x["low"]).rolling(3).sum()) * 100)
    # BR indicator (strength relative to previous close)
    df["pre_close"] = grouped["close"].shift(1)
    df["br_up"] = (df["high"] - df["pre_close"]).clip(lower=0)
    df["br_down"] = (df["pre_close"] - df["low"]).clip(lower=0)
    df["br"] = grouped.apply(lambda x: (x["br_up"].rolling(3).sum()) / (x["br_down"].rolling(3).sum()) * 100)
    # ARBR
    df['arbr'] = df['ar'] - df['br']
    df.drop(columns=["pre_close", "br_up", "br_down", 'ar', 'br'], inplace=True)
    df.drop(columns=['weight_std20'], inplace=True, errors='ignore')
    df.drop(
        columns=['_is_positive', '_is_negative', '_pos_returns', '_neg_returns', '_pos_returns_sq',
                 '_neg_returns_sq'],
        inplace=True, errors='ignore')
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
    return df, new_columns


def get_simple_factor(df):
    """Compute cheap, row-wise combination factors from columns produced by
    get_rolling_factor (e.g. vol_spike, obv, act_factor1..4, std_return_*).

    Returns:
        (df, new_columns): the augmented DataFrame and the list of column
        names added by this function.
    """
    old_columns = df.columns.tolist()[:]
    df = df.sort_values(by=['ts_code', 'trade_date'])
    alpha = 0.5
    df['momentum_factor'] = df['volume_change_rate'] + alpha * df['turnover_deviation']
    df['resonance_factor'] = df['volume_ratio'] * df['pct_chg']
    df['log_close'] = np.log(df['close'])
    df['cat_vol_spike'] = df['vol'] > 2 * df['vol_spike']
    # Upper/lower candle shadows relative to close
    df['up'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['close']
    df['down'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['close']
    df['obv_maobv_6'] = df['obv'] - df['maobv_6']
    # Ratio indicators
    df['std_return_5_over_std_return_90'] = df['std_return_5'] / df['std_return_90']
    # df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']
    # Standard-deviation difference
    df['std_return_90_minus_std_return_90_2'] = df['std_return_90'] - df['std_return_90_2']
    # df['cat_af1'] = df['act_factor1'] > 0
    df['cat_af2'] = df['act_factor2'] > df['act_factor1']
    df['cat_af3'] = df['act_factor3'] > df['act_factor2']
    df['cat_af4'] = df['act_factor4'] > df['act_factor3']
    # act_factor5 / act_factor6: sum and normalised difference of the angles
    df['act_factor5'] = df['act_factor1'] + df['act_factor2'] + df['act_factor3'] + df['act_factor4']
    df['act_factor6'] = (df['act_factor1'] - df['act_factor2']) / np.sqrt(
        df['act_factor1'] ** 2 + df['act_factor2'] ** 2)
    # Order-size flow shares relative to total net money flow
    df['active_buy_volume_large'] = df['buy_lg_vol'] / df['net_mf_vol']
    df['active_buy_volume_big'] = df['buy_elg_vol'] / df['net_mf_vol']
    df['active_buy_volume_small'] = df['buy_sm_vol'] / df['net_mf_vol']
    df['buy_lg_vol_minus_sell_lg_vol'] = (df['buy_lg_vol'] - df['sell_lg_vol']) / df['net_mf_vol']
    df['buy_elg_vol_minus_sell_elg_vol'] = (df['buy_elg_vol'] - df['sell_elg_vol']) / df['net_mf_vol']
    df['log_circ_mv'] = np.log(df['circ_mv'])
    # Chip-structure factors relative to historical high/low range
    df['ctrl_strength'] = (df['cost_85pct'] - df['cost_15pct']) / (df['his_high'] - df['his_low'])
    df['low_cost_dev'] = (df['close'] - df['cost_5pct']) / (df['cost_50pct'] - df['cost_5pct'])
    df['asymmetry'] = (df['cost_95pct'] - df['cost_50pct']) / (df['cost_50pct'] - df['cost_5pct'])
    df['lock_factor'] = df['turnover_rate'] * (
            1 - (df['cost_95pct'] - df['cost_5pct']) / (df['his_high'] - df['his_low']))
    df['cat_vol_break'] = (df['close'] > df['cost_85pct']) & (df['volume_ratio'] > 2)
    df['cost_atr_adj'] = (df['cost_95pct'] - df['cost_5pct']) / df['atr_14']
    # 12. Small-cap chip concentration
    df['smallcap_concentration'] = (1 / df['log_circ_mv']) * (df['cost_85pct'] - df['cost_15pct'])
    df['cat_golden_resonance'] = ((df['close'] > df['weight_avg']) &
                                  (df['volume_ratio'] > 1.5) &
                                  (df['winner_rate'] > 0.7))
    # Market-cap-normalised activity factors
    df['mv_turnover_ratio'] = df['turnover_rate'] / df['log_circ_mv']
    df['mv_adjusted_volume'] = df['vol'] / df['log_circ_mv']
    df['mv_weighted_turnover'] = df['turnover_rate'] * (1 / df['log_circ_mv'])
    df['nonlinear_mv_volume'] = df['vol'] / df['log_circ_mv']
    df['mv_volume_ratio'] = df['volume_ratio'] / df['log_circ_mv']
    df['mv_momentum'] = df['turnover_rate'] * df['volume_ratio'] / df['log_circ_mv']
    # Drop private helper columns (underscore-prefixed)
    drop_columns = [col for col in df.columns if col.startswith('_')]
    df.drop(columns=drop_columns, inplace=True, errors='ignore')
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
    return df, new_columns


# NOTE(review): duplicate imports — numpy/pandas are already imported at the
# top of the file; harmless but redundant.
import pandas as pd
import numpy as np
from scipy.stats import linregress  # For factor 4 (if implementing slope directly)

# from hurst import compute_Hc  # For factor 18, needs pip install hurst
# import statsmodels.api as sm  # For factor 16, needs pip install statsmodels

# --- Constants ---
epsilon = 1e-10  # Prevent division by zero


# --- Helper Functions ---
def _safe_divide(a, b, default_val=0):
    """Safe division, returns default_val for division by zero or NaN/inf results.

    NOTE(review): the in-place mask assignment assumes a/b yields an indexable
    array-like (ndarray/Series); plain scalars would raise — confirm callers
    only pass array-likes.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        result = a / b
        # Replace NaN, Inf, -Inf resulting from division or invalid ops
        result[~np.isfinite(result)] = default_val
    return result


# --- Factor Calculation Functions (In-Place Modification) ---

# Category 1: Large Player Intent & Behavior
def lg_flow_mom_corr(df: pd.DataFrame, N: int = 20, M: int = 60, factor_name: str = None):
    """
    Calculates Factor 1: Large Flow & Price Momentum Concordance (In-place).
    WARNING: Modifies df in-place.
""" if factor_name is None: factor_name = f'lg_flow_mom_corr_{N}_{M}' print(f"Calculating {factor_name}...") _temp_cols = ['_net_lg_flow_val', '_rolling_net_lg_flow', '_price_mom'] try: df['_net_lg_flow_val'] = (df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']) * df['close'] df['_rolling_net_lg_flow'] = df.groupby('ts_code')['_net_lg_flow_val'].rolling(N, min_periods=max(1, N // 2)).sum().reset_index(level=0, drop=True) df['_price_mom'] = df.groupby('ts_code')['close'].pct_change(N) # Calculate correlation on the temporary Series to handle alignment factor_series = df['_rolling_net_lg_flow'].rolling(M, min_periods=max(1, M // 2)).corr(df['_price_mom']) df[factor_name] = factor_series except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan # Assign NaN on error finally: # Cleanup intermediate columns cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def lg_buy_consolidation(df: pd.DataFrame, N: int = 20, vol_quantile: float = 0.2, factor_name: str = None): """ Calculates Factor 2: Large Buying during Consolidation (In-place). WARNING: Modifies df in-place. 
""" if factor_name is None: factor_name = f'lg_buy_consolidation_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_rolling_std', '_net_lg_flow_ratio', '_rolling_net_lg_flow_ratio_mean', '_std_threshold'] try: df['_rolling_std'] = df.groupby('ts_code')['close'].rolling(N, min_periods=max(1, N // 2)).std().reset_index(level=0, drop=True) df['_net_lg_flow_ratio'] = _safe_divide( (df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']), df['vol'] ) df['_rolling_net_lg_flow_ratio_mean'] = df.groupby('ts_code')['_net_lg_flow_ratio'].rolling(N, min_periods=max(1, N // 2)).mean().reset_index(level=0, drop=True) df['_std_threshold'] = df.groupby('trade_date')['_rolling_std'].transform(lambda x: x.quantile(vol_quantile)) df[factor_name] = df['_rolling_net_lg_flow_ratio_mean'].where(df['_rolling_std'] < df['_std_threshold']) except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def lg_flow_accel(df: pd.DataFrame, factor_name: str = 'lg_flow_accel'): """ Calculates Factor 3: Large Flow Acceleration (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_net_lg_flow_vol'] try: df['_net_lg_flow_vol'] = df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol'] df[factor_name] = df.groupby('ts_code')['_net_lg_flow_vol'].diff(1).diff(1) except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def intraday_lg_flow_corr(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 4: (Approx) Intraday Trend & Large Flow Correlation (In-place). 
NOTE: Direct rolling correlation between two rolling series is complex/slow in pandas. This provides a placeholder or requires significant optimization/pre-calculation. WARNING: Modifies df in-place. Placeholder implementation returns NaN. """ if factor_name is None: factor_name = f'intraday_lg_flow_corr_{N}' print(f"Calculating {factor_name} (Placeholder - complex implementation)...") df[factor_name] = np.nan # Placeholder, see previous thought process for detailed logic needed print(f"Finished {factor_name} (Placeholder).") # Category 2: Cost Basis & PnL Status def profit_pressure(df: pd.DataFrame, factor_name: str = 'profit_pressure'): """ Calculates Factor 5: Profit Pressure Index (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_profit_margin_85', '_profit_margin_95'] try: df['_profit_margin_85'] = _safe_divide(df['close'], df['cost_85pct']) - 1 df['_profit_margin_95'] = _safe_divide(df['close'], df['cost_95pct']) - 1 df[factor_name] = df['winner_rate'] * 0.5 * (df['_profit_margin_85'] + df['_profit_margin_95']) except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def underwater_resistance(df: pd.DataFrame, factor_name: str = 'underwater_resistance'): """ Calculates Factor 6: Resistance from Underwater Positions (In-place). WARNING: Modifies df in-place. 
""" print(f"Calculating {factor_name}...") _temp_cols = ['_underwater_ratio', '_dist_to_cost_15'] try: df['_underwater_ratio'] = 1.0 - df['winner_rate'] df['_dist_to_cost_15'] = np.maximum(0, df['cost_15pct'] - df['close']) / (df['close'] + epsilon) df[factor_name] = df['_underwater_ratio'] * df['_dist_to_cost_15'] except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cost_conc_std(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 7: Cost Concentration Change (Std Dev) (In-place). WARNING: Modifies df in-place. """ if factor_name is None: factor_name = f'cost_conc_std_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_cost_range_norm'] try: df['_cost_range_norm'] = _safe_divide( (df['cost_85pct'] - df['cost_15pct']), (df['weight_avg'] + epsilon) ) # Need to calculate rolling std on the temp col before dropping it factor_series = df.groupby('ts_code')['_cost_range_norm'].rolling(N, min_periods=max(1, N//2)).std().reset_index(level=0, drop=True) df[factor_name] = factor_series except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def profit_decay(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 8: Profit Expectation Decay (In-place). WARNING: Modifies df in-place. 
""" if factor_name is None: factor_name = f'profit_decay_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_ret_N', '_winner_rate_change_N'] try: df['_ret_N'] = _safe_divide(df['close'], df.groupby('ts_code')['close'].shift(N)) - 1 df['_winner_rate_change_N'] = df.groupby('ts_code')['winner_rate'].diff(N) df[factor_name] = _safe_divide(df['_ret_N'], df['_winner_rate_change_N']) except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") # Category 3: Volatility Source & Market State def vol_amp_loss(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 9: Volatility Amplification when Underwater (In-place). WARNING: Modifies df in-place. """ if factor_name is None: factor_name = f'vol_amp_loss_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_vol_N', '_loss_degree'] try: df['_vol_N'] = df.groupby('ts_code')['pct_chg'].rolling(N, min_periods=max(1, N // 2)).std().reset_index(level=0, drop=True) df['_loss_degree'] = np.maximum(0, df['weight_avg'] - df['close']) / (df['close'] + epsilon) df[factor_name] = df['_vol_N'] * df['_loss_degree'] except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def vol_drop_profit_cnt(df: pd.DataFrame, N: int = 20, M: int = 5, profit_thresh: float = 0.1, drop_thresh: float = -0.03, vol_multiple: float = 2.0, factor_name: str = None): """ Calculates Factor 10: High Volume Drop when Profitable (Count over M days) (In-place). WARNING: Modifies df in-place. 
""" if factor_name is None: factor_name = f'vol_drop_profit_cnt_{M}' print(f"Calculating {factor_name}...") _temp_cols = ['_is_profitable', '_is_dropping', '_rolling_mean_vol', '_rolling_std_vol', '_is_high_vol', '_event'] try: df['_is_profitable'] = df['close'] > df['weight_avg'] * (1 + profit_thresh) df['_is_dropping'] = df['pct_chg'] < drop_thresh df['_rolling_mean_vol'] = df.groupby('ts_code')['vol'].rolling(N, min_periods=1).mean().reset_index(level=0, drop=True) df['_rolling_std_vol'] = df.groupby('ts_code')['vol'].rolling(N, min_periods=2).std().reset_index(level=0, drop=True).fillna(0) df['_is_high_vol'] = df['vol'] > (df['_rolling_mean_vol'] + vol_multiple * df['_rolling_std_vol']) df['_event'] = (df['_is_profitable'] & df['_is_dropping'] & df['_is_high_vol']).astype(int) factor_series = df.groupby('ts_code')['_event'].rolling(M, min_periods=1).sum().reset_index(level=0, drop=True) df[factor_name] = factor_series except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def lg_flow_vol_interact(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 11: Large Flow Driven Volatility (Interaction Term) (In-place). WARNING: Modifies df in-place. 
""" if factor_name is None: factor_name = f'lg_flow_vol_interact_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_vol_N', '_net_lg_flow_val', '_total_val', '_abs_net_lg_flow_ratio', '_abs_net_lg_flow_ratio_N'] try: df['_vol_N'] = df.groupby('ts_code')['pct_chg'].rolling(N, min_periods=max(1, N // 2)).std().reset_index(level=0, drop=True) df['_net_lg_flow_val'] = (df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']) * df['close'] df['_total_val'] = df['vol'] * df['close'] df['_abs_net_lg_flow_ratio'] = abs(df['_net_lg_flow_val']) / (df['_total_val'] + epsilon) df['_abs_net_lg_flow_ratio_N'] = df.groupby('ts_code')['_abs_net_lg_flow_ratio'].rolling(N, min_periods=max(1, N // 2)).mean().reset_index(level=0, drop=True) df[factor_name] = df['_vol_N'] * df['_abs_net_lg_flow_ratio_N'] except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cost_break_confirm_cnt(df: pd.DataFrame, M: int = 5, factor_name: str = None): """ Calculates Factor 12: Cost Breakout Confirmation Count (In-place). WARNING: Modifies df in-place. 
""" if factor_name is None: factor_name = f'cost_break_confirm_cnt_{M}' print(f"Calculating {factor_name}...") _temp_cols = ['_prev_cost_85', '_prev_cost_15', '_break_up', '_break_down', '_net_lg_flow_vol', '_confirm_up', '_confirm_down', '_net_confirm'] try: df['_prev_cost_85'] = df.groupby('ts_code')['cost_85pct'].shift(1) df['_prev_cost_15'] = df.groupby('ts_code')['cost_15pct'].shift(1) df['_break_up'] = df['close'] > df['_prev_cost_85'] df['_break_down'] = df['close'] < df['_prev_cost_15'] df['_net_lg_flow_vol'] = df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol'] df['_confirm_up'] = (df['_break_up'] & (df['_net_lg_flow_vol'] > 0)).astype(int) df['_confirm_down'] = (df['_break_down'] & (df['_net_lg_flow_vol'] < 0)).astype(int) df['_net_confirm'] = df['_confirm_up'] - df['_confirm_down'] factor_series = df.groupby('ts_code')['_net_confirm'].rolling(M, min_periods=1).sum().reset_index(level=0, drop=True) df[factor_name] = factor_series except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") # Category 4: Technical Indicators & Market Behavior def atr_norm_channel_pos(df: pd.DataFrame, N: int = 14, factor_name: str = None): """ Calculates Factor 13: ATR Normalized Channel Position (In-place). WARNING: Modifies df in-place. 
""" if factor_name is None: factor_name = f'atr_norm_channel_pos_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_prev_close', '_h_l', '_h_pc', '_l_pc', '_tr', '_atr_N', '_roll_low_N'] try: df['_prev_close'] = df.groupby('ts_code')['close'].shift(1) df['_h_l'] = df['high'] - df['low'] df['_h_pc'] = abs(df['high'] - df['_prev_close']) df['_l_pc'] = abs(df['low'] - df['_prev_close']) df['_tr'] = df[['_h_l', '_h_pc', '_l_pc']].max(axis=1) df['_atr_N'] = df.groupby('ts_code')['_tr'].rolling(N, min_periods=max(1, N//2)).mean().reset_index(level=0, drop=True) df['_roll_low_N'] = df.groupby('ts_code')['low'].rolling(N, min_periods=max(1, N//2)).min().reset_index(level=0, drop=True) df[factor_name] = _safe_divide((df['close'] - df['_roll_low_N']), df['_atr_N']) except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def turnover_diff_skew(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 14: Skewness of Turnover Rate Change (In-place). WARNING: Modifies df in-place. 
""" if factor_name is None: factor_name = f'turnover_diff_skew_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_turnover_diff'] try: # Assuming turnover_rate is in percentage points, diff is fine df['_turnover_diff'] = df.groupby('ts_code')['turnover_rate'].diff(1) factor_series = df.groupby('ts_code')['_turnover_diff'].rolling(N, min_periods=max(3, N//2)).skew().reset_index(level=0, drop=True) df[factor_name] = factor_series except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def lg_sm_flow_diverge(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 15: Divergence between Large and Small Flow (In-place). WARNING: Modifies df in-place. """ if factor_name is None: factor_name = f'lg_sm_flow_diverge_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_lg_flow_ratio', '_sm_flow_ratio', '_lg_flow_ratio_N', '_sm_flow_ratio_N'] try: df['_lg_flow_ratio'] = _safe_divide( (df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']), df['vol'] ) df['_sm_flow_ratio'] = _safe_divide( (df['buy_sm_vol'] - df['sell_sm_vol']), df['vol'] ) df['_lg_flow_ratio_N'] = df.groupby('ts_code')['_lg_flow_ratio'].rolling(N, min_periods=max(1, N // 2)).mean().reset_index(level=0, drop=True) df['_sm_flow_ratio_N'] = df.groupby('ts_code')['_sm_flow_ratio'].rolling(N, min_periods=max(1, N // 2)).mean().reset_index(level=0, drop=True) df[factor_name] = df['_lg_flow_ratio_N'] - df['_sm_flow_ratio_N'] except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cap_neutral_cost_metric(df: pd.DataFrame, factor_name: str = 
'cap_neutral_cost_metric'): """ Calculates Factor 16: Market Cap Neutralized Cost Metric (Placeholder). Requires statsmodels and complex implementation. WARNING: Modifies df in-place. Placeholder implementation returns NaN. """ print(f"Calculating {factor_name} (Placeholder - requires statsmodels)...") df[factor_name] = np.nan print(f"Finished {factor_name} (Placeholder).") def pullback_strong(df: pd.DataFrame, N: int = 20, M: int = 20, gain_thresh: float = 0.2, factor_name: str = None): """ Calculates Factor 17: Pullback Depth from Recent High for Strong Stocks (In-place). WARNING: Modifies df in-place. """ if factor_name is None: factor_name = f'pullback_strong_{N}_{M}' print(f"Calculating {factor_name}...") _temp_cols = ['_high_N', '_pullback_depth', '_recent_gain_M'] try: df['_high_N'] = df.groupby('ts_code')['high'].rolling(N, min_periods=max(1, N // 2)).max().reset_index(level=0, drop=True) df['_pullback_depth'] = _safe_divide((df['_high_N'] - df['close']), df['_high_N']) df['_recent_gain_M'] = _safe_divide(df['close'], df.groupby('ts_code')['close'].shift(M)) - 1 df[factor_name] = _safe_divide(df['_pullback_depth'], df['_recent_gain_M']) except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def hurst_exponent_flow(df: pd.DataFrame, N: int = 60, flow_col: str = 'net_mf_vol', factor_name: str = None): """ Calculates Factor 18: Hurst Exponent of Money Flow (Placeholder). Requires 'hurst' library and potentially slow rolling apply. WARNING: Modifies df in-place. Placeholder implementation returns NaN. 
""" if factor_name is None: factor_name = f'hurst_{flow_col}_{N}' print(f"Calculating {factor_name} (Placeholder - requires hurst library)...") try: from hurst import compute_Hc # Logic would go here, likely using rolling().apply() which is slow # factor_series = df.groupby('ts_code')[flow_col]....apply(hurst_calc_func...) df[factor_name] = np.nan # Placeholder except ImportError: print("Error: 'hurst' library not installed. Cannot calculate factor.") df[factor_name] = np.nan except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan print(f"Finished {factor_name} (Placeholder).") def vol_wgt_hist_pos(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 19: Volume Weighting at Historical Price Level (In-place). WARNING: Modifies df in-place. """ if factor_name is None: factor_name = f'vol_wgt_hist_pos_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_hist_pos', '_rolling_mean_vol', '_vol_rel_strength'] try: df['_hist_pos'] = _safe_divide((df['close'] - df['his_low']), (df['his_high'] - df['his_low'])).clip(0, 1) df['_rolling_mean_vol'] = df.groupby('ts_code')['vol'].rolling(N, min_periods=max(1, N // 2)).mean().reset_index(level=0, drop=True) df['_vol_rel_strength'] = _safe_divide(df['vol'], df['_rolling_mean_vol']) df[factor_name] = df['_hist_pos'] * df['_vol_rel_strength'] except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def vol_adj_roc(df: pd.DataFrame, N: int = 20, factor_name: str = None): """ Calculates Factor 20: Volatility-Adjusted ROC (In-place). WARNING: Modifies df in-place. 
""" if factor_name is None: factor_name = f'vol_adj_roc_{N}' print(f"Calculating {factor_name}...") _temp_cols = ['_roc_N', '_vol_N'] try: df['_roc_N'] = _safe_divide(df['close'], df.groupby('ts_code')['close'].shift(N)) - 1 df['_vol_N'] = df.groupby('ts_code')['pct_chg'].rolling(N, min_periods=max(2, N // 2)).std().reset_index(level=0, drop=True).fillna(0) df[factor_name] = _safe_divide(df['_roc_N'], df['_vol_N']) except Exception as e: print(f"Error calculating {factor_name}: {e}") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def calculate_complex_factor(df: pd.DataFrame, factor_name: str = "complex_factor_deap_1"): """ 表达式: sub(protected_div_torch(A, B), C) 其中 A, B, C 及内部组件依赖于多个预计算因子列。 Args: df (pd.DataFrame): 包含所有必需基础因子列的 DataFrame。 factor_name (str): 要在 df 中创建的新因子列的名称。 WARNING: 此函数会原地修改输入的 DataFrame 'df'。 如果在计算过程中缺少任何必需的列,将打印错误并填充 NaN。 """ print(f"开始计算因子: {factor_name} (原地修改)...") _temp_cols_list = [] # 用于记录中间计算列的名称 try: # --- 分解计算表达式的各个部分 --- # 计算组件 D # D = sub(mul(pullback_strong_20_20, div(log_close, industry_return_5)), div(add(vol_adj_roc_20, vol_drop_profit_cnt_5), sub(nonlinear_mv_volume, alpha_007))) _temp_d_term1_div = _safe_divide(df['log_close'], df['industry_return_5']) _temp_d_term1 = df['pullback_strong_20_20'] * _temp_d_term1_div _temp_d_term2_sub = df['nonlinear_mv_volume'] - df['alpha_007'] _temp_d_term2_add = df['vol_adj_roc_20'] + df['vol_drop_profit_cnt_5'] _temp_d_term2 = _safe_divide(_temp_d_term2_add, _temp_d_term2_sub) df['_temp_D'] = _temp_d_term1 - _temp_d_term2 _temp_cols_list.extend(['_temp_D', '_temp_d_term1_div', '_temp_d_term1', '_temp_d_term2_sub', '_temp_d_term2_add', '_temp_d_term2']) # 计算组件 A # A = add(add(mul(D, lg_buy_consolidation_20), lg_buy_consolidation_20), pullback_strong_20_20) _temp_a_term1 = df['_temp_D'] * df['lg_buy_consolidation_20'] _temp_a_term2 = _temp_a_term1 + 
df['lg_buy_consolidation_20'] df['_temp_A'] = _temp_a_term2 + df['pullback_strong_20_20'] _temp_cols_list.extend(['_temp_A', '_temp_a_term1', '_temp_a_term2']) # 计算组件 F # F = mul(add(net_mf_vol, std_return_5), sub(arbr, industry_act_factor5)) _temp_f_term1 = df['net_mf_vol'] + df['std_return_5'] _temp_f_term2 = df['arbr'] - df['industry_act_factor5'] df['_temp_F'] = _temp_f_term1 * _temp_f_term2 _temp_cols_list.extend(['_temp_F', '_temp_f_term1', '_temp_f_term2']) # 计算组件 H # H = add(add(industry_act_factor1, low_cost_dev), mul(mv_weighted_turnover, act_factor4)) _temp_h_term1 = df['industry_act_factor1'] + df['low_cost_dev'] _temp_h_term2 = df['mv_weighted_turnover'] * df['act_factor4'] df['_temp_H'] = _temp_h_term1 + _temp_h_term2 _temp_cols_list.extend(['_temp_H', '_temp_h_term1', '_temp_h_term2']) # 计算组件 B # B = div(add(add(F, vol), H), lg_elg_buy_prop) _temp_b_term1 = df['_temp_F'] + df['vol'] _temp_b_term2 = _temp_b_term1 + df['_temp_H'] df['_temp_B'] = _safe_divide(_temp_b_term2, df['lg_elg_buy_prop']) _temp_cols_list.extend(['_temp_B', '_temp_b_term1', '_temp_b_term2']) # 计算组件 C # C = div(div(intraday_lg_flow_corr_20, lg_elg_buy_prop), lg_elg_buy_prop) # 注意: intraday_lg_flow_corr_20 可能本身就是 NaN 或需要特殊处理 _temp_c_term1 = _safe_divide(df.get('intraday_lg_flow_corr_20', np.nan), df['lg_elg_buy_prop']) # 使用 .get 处理可能不存在的列 df['_temp_C'] = _safe_divide(_temp_c_term1, df['lg_elg_buy_prop']) _temp_cols_list.extend(['_temp_C', '_temp_c_term1']) # --- 计算最终表达式 --- # final = sub(div(A, B), C) _temp_final_term1 = _safe_divide(df['_temp_A'], df['_temp_B']) final_factor_series = _temp_final_term1 - df['_temp_C'] # --- 将最终结果赋值给 df 的新列 (原地修改) --- df[factor_name] = final_factor_series print(f"因子 {factor_name} 计算成功。") except KeyError as e: # 捕获因为缺少列而产生的错误 print(f"错误: 计算 {factor_name} 时缺少必需的列: {e}") print("请确保输入的 DataFrame 包含所有表达式中引用的因子列。") print("将为因子 {factor_name} 填充 NaN。") df[factor_name] = np.nan # 出错时填充 NaN except Exception as e: # 捕获其他可能的计算错误 print(f"错误: 计算 {factor_name} 
时发生意外错误: {e}") print(f"将为因子 {factor_name} 填充 NaN。") df[factor_name] = np.nan # 出错时填充 NaN finally: # --- 清理所有中间计算列 --- cols_to_drop = [col for col in _temp_cols_list if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) # print(f"已清理 {len(cols_to_drop)} 个临时列 for {factor_name}.") print(f"因子 {factor_name} 计算流程结束。") # 函数不返回任何值,因为 df 是原地修改的 import pandas as pd import numpy as np # from scipy.stats import rankdata # rankdata is not needed if using pandas rank # import statsmodels.api as sm # Needed for factor 19 # --- Constants --- epsilon = 1e-10 # Prevent division by zero # --- Helper Functions --- def _safe_divide(numerator, denominator, default_val=0): """ 安全的除法函数,处理分母为零或接近零,以及NaN/Inf的情况。 Args: numerator (pd.Series): 分子. denominator (pd.Series): 分母. default_val (float): 当分母为零或结果无效时返回的默认值. Returns: pd.Series: 除法结果. """ with np.errstate(divide='ignore', invalid='ignore'): # Convert inputs to numeric, coercing errors to NaN before division num = pd.to_numeric(numerator, errors='coerce') den = pd.to_numeric(denominator, errors='coerce') # Perform division where denominator is not close to zero and inputs are valid numbers result = np.where(np.abs(den) > epsilon, num / den, default_val) # Ensure result is float, handle potential NaNs from coercion or division result = pd.to_numeric(result, errors='coerce') # Fill remaining NaNs if necessary result = np.nan_to_num(result, nan=default_val, posinf=default_val, neginf=default_val) # Ensure result index matches numerator's index if numerator is a Series if isinstance(numerator, pd.Series): return pd.Series(result, index=numerator.index) else: return pd.Series(result) # Fallback if numerator is not a Series (less likely) # --- Cross-Sectional Factor Calculation Functions (In-Place Modification) --- # Category 1: Cross-Sectional Flow & Behavior Strength def cs_rank_net_lg_flow_val(df: pd.DataFrame, factor_name: str = 'cs_rank_net_lg_flow_val'): """ Factor 1: 大单净额截面排序 (In-place). 
WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_net_lg_flow_val'] try: df['_net_lg_flow_val'] = (df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']) * df['close'] df[factor_name] = df.groupby('trade_date')['_net_lg_flow_val'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_flow_divergence(df: pd.DataFrame, factor_name: str = 'cs_rank_flow_divergence'): """ Factor 2: 大小单流向背离度截面排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_lg_ratio', '_sm_ratio', '_divergence'] try: df['_lg_ratio'] = _safe_divide( (df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']), df['vol'] ) df['_sm_ratio'] = _safe_divide( (df['buy_sm_vol'] - df['sell_sm_vol']), df['vol'] ) df['_divergence'] = df['_lg_ratio'] - df['_sm_ratio'] df[factor_name] = df.groupby('trade_date')['_divergence'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_industry_adj_lg_flow(df: pd.DataFrame, factor_name: str = 'cs_rank_ind_adj_lg_flow'): """ Factor 3: 行业内大单流强度排序 (In-place). Requires 'cat_l2_code'. WARNING: Modifies df in-place. 
""" print(f"Calculating {factor_name}...") _temp_cols = ['_net_lg_flow_vol', '_industry_avg_flow', '_deviation'] if 'cat_l2_code' not in df.columns: print(f"Error calculating {factor_name}: Missing 'cat_l2_code' column. Assigning NaN.") df[factor_name] = np.nan return try: df['_net_lg_flow_vol'] = (df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol']) * df['close'] # Or use vol df['_industry_avg_flow'] = df.groupby(['trade_date', 'cat_l2_code'])['_net_lg_flow_vol'].transform('mean') df['_deviation'] = df['_net_lg_flow_vol'] - df['_industry_avg_flow'] df[factor_name] = df.groupby('trade_date')['_deviation'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_elg_buy_ratio(df: pd.DataFrame, factor_name: str = 'cs_rank_elg_buy_ratio'): """ Factor 4: 超大单买入占比排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_elg_buy_ratio'] try: df['_elg_buy_ratio'] = _safe_divide(df['buy_elg_vol'], df['vol']) df[factor_name] = df.groupby('trade_date')['_elg_buy_ratio'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. 
Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") # Category 2: Cross-Sectional Cost Basis & PnL Status def cs_rank_rel_profit_margin(df: pd.DataFrame, factor_name: str = 'cs_rank_rel_profit_margin'): """ Factor 5: 相对盈利幅度排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_profit_margin'] try: df['_profit_margin'] = _safe_divide((df['close'] - df['weight_avg']), df['close']) df[factor_name] = df.groupby('trade_date')['_profit_margin'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_cost_breadth(df: pd.DataFrame, factor_name: str = 'cs_rank_cost_breadth'): """ Factor 6: 成本分布宽度截面排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_cost_breadth'] try: df['_cost_breadth'] = _safe_divide((df['cost_85pct'] - df['cost_15pct']), df['weight_avg']) df[factor_name] = df.groupby('trade_date')['_cost_breadth'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. 
Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_dist_to_upper_cost(df: pd.DataFrame, factor_name: str = 'cs_rank_dist_to_upper_cost'): """ Factor 7: 股价相对高成本位距离排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_dist_to_95'] try: df['_dist_to_95'] = _safe_divide(df['close'], df['cost_95pct']) df[factor_name] = df.groupby('trade_date')['_dist_to_95'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_winner_rate(df: pd.DataFrame, factor_name: str = 'cs_rank_winner_rate'): """ Factor 8: 获利盘比例截面排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") try: df[factor_name] = df.groupby('trade_date')['winner_rate'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: print(f"Finished {factor_name}.") # Category 3: Cross-Sectional Price Action & Volatility def cs_rank_intraday_range(df: pd.DataFrame, factor_name: str = 'cs_rank_intraday_range'): """ Factor 9: 日内相对振幅排序 (In-place). WARNING: Modifies df in-place. 
""" print(f"Calculating {factor_name}...") _temp_cols = ['_norm_range'] try: df['_norm_range'] = _safe_divide((df['high'] - df['low']), df['close']) df[factor_name] = df.groupby('trade_date')['_norm_range'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_close_pos_in_range(df: pd.DataFrame, factor_name: str = 'cs_rank_close_pos_in_range'): """ Factor 10: 收盘价在日内位置排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_close_pos'] try: df['_close_pos'] = _safe_divide((df['close'] - df['low']), (df['high'] - df['low']), default_val=0.5) # Assign 0.5 if high==low df[factor_name] = df.groupby('trade_date')['_close_pos'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_opening_gap(df: pd.DataFrame, factor_name: str = 'cs_rank_opening_gap'): """ Factor 11: 开盘相对跳空幅度排序 (In-place). Needs pre_close. WARNING: Modifies df in-place. Assumes 'pre_close' exists. """ print(f"Calculating {factor_name}...") _temp_cols = ['_gap'] if 'pre_close' not in df.columns: print(f"Error calculating {factor_name}: Missing 'pre_close' column. 
Assigning NaN.") df[factor_name] = np.nan return try: df['_gap'] = _safe_divide(df['open'], df['pre_close']) - 1 df[factor_name] = df.groupby('trade_date')['_gap'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e} (likely 'open'). Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_pos_in_hist_range(df: pd.DataFrame, factor_name: str = 'cs_rank_pos_in_hist_range'): """ Factor 12: 相对历史波动位置排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_hist_pos'] try: df['_hist_pos'] = _safe_divide((df['close'] - df['his_low']), (df['his_high'] - df['his_low'])).clip(0, 1) # Clip to 0-1 range df[factor_name] = df.groupby('trade_date')['_hist_pos'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") # Category 4: Cross-Sectional Interaction & Composite Indicators def cs_rank_vol_x_profit_margin(df: pd.DataFrame, factor_name: str = 'cs_rank_vol_x_profit_margin'): """ Factor 13: 波动率与盈亏状态交互排序 (In-place). WARNING: Modifies df in-place. 
""" print(f"Calculating {factor_name}...") _temp_cols = ['_daily_vol', '_profit_margin', '_interaction'] try: df['_daily_vol'] = abs(df['pct_chg']) df['_profit_margin'] = _safe_divide((df['close'] - df['weight_avg']), df['close']) df['_interaction'] = df['_daily_vol'] * df['_profit_margin'] df[factor_name] = df.groupby('trade_date')['_interaction'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_lg_flow_price_concordance(df: pd.DataFrame, factor_name: str = 'cs_rank_lg_flow_price_concordance'): """ Factor 14: 大单流向与价格变动一致性排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_net_lg_flow_vol', '_concordance'] try: df['_net_lg_flow_vol'] = df['buy_lg_vol'] + df['buy_elg_vol'] - df['sell_lg_vol'] - df['sell_elg_vol'] df['_concordance'] = df['_net_lg_flow_vol'] * df['pct_chg'] df[factor_name] = df.groupby('trade_date')['_concordance'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_turnover_per_winner(df: pd.DataFrame, factor_name: str = 'cs_rank_turnover_per_winner'): """ Factor 15: 高换手获利盘占比排序 (In-place). WARNING: Modifies df in-place. 
""" print(f"Calculating {factor_name}...") _temp_cols = ['_turnover_per_winner'] try: df['_turnover_per_winner'] = _safe_divide(df['turnover_rate'], df['winner_rate']) df[factor_name] = df.groupby('trade_date')['_turnover_per_winner'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_ind_cap_neutral_pe(df: pd.DataFrame, factor_name: str = 'cs_rank_ind_cap_neutral_pe'): """ Factor 16: 行业市值中性化PE排序 (Placeholder). Requires statsmodels and complex cross-sectional regression implementation. WARNING: Modifies df in-place. Placeholder implementation returns NaN. """ print(f"Calculating {factor_name} (Placeholder - requires statsmodels)...") df[factor_name] = np.nan print(f"Finished {factor_name} (Placeholder).") def cs_rank_volume_ratio(df: pd.DataFrame, factor_name: str = 'cs_rank_volume_ratio'): """ Factor 17: 成交量相对强度排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") try: # Assumes 'volume_ratio' (量比) column already exists df[factor_name] = df.groupby('trade_date')['volume_ratio'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: print(f"Finished {factor_name}.") def cs_rank_elg_buy_sell_sm_ratio(df: pd.DataFrame, factor_name: str = 'cs_rank_elg_buy_sell_sm_ratio'): """ Factor 18: 超大单买入与小单卖出比排序 (In-place). WARNING: Modifies df in-place. 
""" print(f"Calculating {factor_name}...") _temp_cols = ['_ratio'] try: df['_ratio'] = _safe_divide(df['buy_elg_vol'], df['sell_sm_vol']) df[factor_name] = df.groupby('trade_date')['_ratio'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_cost_dist_vol_ratio(df: pd.DataFrame, factor_name: str = 'cs_rank_cost_dist_vol_ratio'): """ Factor 19: 价格偏离成本程度与成交量放大交互排序 (In-place). WARNING: Modifies df in-place. """ print(f"Calculating {factor_name}...") _temp_cols = ['_dist', '_interaction'] if 'volume_ratio' not in df.columns: print(f"Error calculating {factor_name}: Missing 'volume_ratio' column. Assigning NaN.") df[factor_name] = np.nan return try: df['_dist'] = abs(df['close'] - df['weight_avg']) / (df['close'] + epsilon) df['_interaction'] = df['_dist'] * df['volume_ratio'] df[factor_name] = df.groupby('trade_date')['_interaction'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.") def cs_rank_size(df: pd.DataFrame, factor_name: str = 'cs_rank_size'): """ Factor 20: 市值因子暴露度排序 (Log of circ_mv) (In-place). WARNING: Modifies df in-place. 
""" print(f"Calculating {factor_name}...") _temp_cols = ['_log_circ_mv'] try: # Use log1p for stability if circ_mv can be zero or very small df['_log_circ_mv'] = np.log1p(df['circ_mv']) df[factor_name] = df.groupby('trade_date')['_log_circ_mv'].rank(pct=True) except KeyError as e: print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.") df[factor_name] = np.nan except Exception as e: print(f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN.") df[factor_name] = np.nan finally: cols_to_drop = [col for col in _temp_cols if col in df.columns] if cols_to_drop: df.drop(columns=cols_to_drop, inplace=True) print(f"Finished {factor_name}.")