Files
NewStock/main/factor/factor.txt
2025-11-29 00:23:12 +08:00

3576 lines
143 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import numpy as np
import pandas as pd
import talib
def get_rolling_factor(df):
    """Compute per-stock rolling / time-series factors and append them as columns.

    Parameters
    ----------
    df : pd.DataFrame
        Daily records keyed by ("ts_code", "trade_date") with OHLCV columns,
        money-flow columns (buy_sm_vol / buy_lg_vol / buy_elg_vol and the
        matching sell_* columns, net_mf_vol), chip-distribution columns
        (cost_5pct .. cost_95pct, weight_avg, winner_rate), turnover_rate,
        volume_ratio, pct_chg and circ_mv.

    Returns
    -------
    (df, new_columns)
        The sorted frame with factor columns added, and the list of column
        names that were not present on entry.

    NOTE(review): ``grouped`` is created once, but is later used on columns
    that are added to ``df`` afterwards (e.g. "lg_elg_net_buy_vol", "cov",
    "_ema_5"). This relies on the pandas GroupBy object seeing in-place
    column additions on the same frame — confirm when upgrading pandas.
    """
    old_columns = df.columns.tolist()[:]
    # Sort by stock and date (in case the input is not sorted yet).
    df = df.sort_values(by=["ts_code", "trade_date"])
    grouped = df.groupby("ts_code", group_keys=False)
    epsilon = 1e-8
    # 1. Net large + extra-large buy volume and its intensity vs total volume.
    df["lg_elg_net_buy_vol"] = (
        df["buy_lg_vol"] + df["buy_elg_vol"] - df["sell_lg_vol"] - df["sell_elg_vol"]
    )
    # Large-player net buying relative to total traded volume.
    df["flow_lg_elg_intensity"] = df["lg_elg_net_buy_vol"] / (df["vol"] + epsilon)
    # 2. Retail vs institutional divergence:
    # difference / ratio of small-order net inflow vs (large + extra-large) net inflow.
    df["sm_net_buy_vol"] = df["buy_sm_vol"] - df["sell_sm_vol"]
    df["flow_divergence_diff"] = df["sm_net_buy_vol"] - df["lg_elg_net_buy_vol"]
    # The ratio form may be more stable.
    df["flow_divergence_ratio"] = df["sm_net_buy_vol"] / (
        df["lg_elg_net_buy_vol"] + np.sign(df["lg_elg_net_buy_vol"]) * epsilon + epsilon
    )  # the extra epsilon terms avoid a 0/0 division
    # 3. Flow structure change: proportion of (large + extra-large) buys in
    # total buy volume, and its 1-day change.
    df["total_buy_vol"] = df["buy_sm_vol"] + df["buy_lg_vol"] + df["buy_elg_vol"]
    df["lg_elg_buy_prop"] = (df["buy_lg_vol"] + df["buy_elg_vol"]) / (
        df["total_buy_vol"] + epsilon
    )
    df["flow_struct_buy_change"] = grouped["lg_elg_buy_prop"].diff(1)  # 1-day change
    # 4. Flow acceleration: second difference of the net large-player flow.
    df["lg_elg_net_buy_vol_change"] = grouped["lg_elg_net_buy_vol"].diff(1)
    df["flow_lg_elg_accel"] = grouped["lg_elg_net_buy_vol_change"].diff(1)
    # # 5. Extreme flow event (categorical): flag when flow_lg_elg_intensity is
    # # above its rolling 90th percentile (1) or below its 10th percentile (-1).
    # rolling_window = 20  # tunable window
    # rolling_high = grouped['flow_lg_elg_intensity'].rolling(rolling_window, min_periods=1).quantile(0.9)
    # rolling_low = grouped['flow_lg_elg_intensity'].rolling(rolling_window, min_periods=1).quantile(0.1)
    # df['flow_lg_elg_intensity_rolling_high'] = rolling_high
    # df['flow_lg_elg_intensity_rolling_low'] = rolling_low
    # conditions_flow = [
    #     df['flow_lg_elg_intensity'] > df['flow_lg_elg_intensity_rolling_high'],
    #     df['flow_lg_elg_intensity'] < df['flow_lg_elg_intensity_rolling_low']
    # ]
    # choices_flow = [1, -1]  # 1: extreme inflow, -1: extreme outflow
    # df['cat_extreme_flow'] = np.select(conditions_flow, choices_flow, default=0)
    # --- Chip-distribution factors ---
    # 6. Chip concentration: 95%-5% cost spread normalized by the close price.
    df["chip_concentration_range"] = (df["cost_95pct"] - df["cost_5pct"]) / (
        df["close"] + epsilon
    )
    # 7. Chip-distribution skewness proxy: weight_avg above cost_50pct suggests
    # more chips sit in the high-cost region (right skew).
    df["chip_skewness"] = (df["weight_avg"] - df["cost_50pct"]) / (
        df["cost_50pct"] + epsilon
    )
    # 8. Floating-chip proxy: winner_rate scaled by how far the close sits
    # above the 15% cost line (clipped at zero below it).
    price_dist_cost15 = (df["close"] - df["cost_15pct"]) / (df["close"] + epsilon)
    df["floating_chip_proxy"] = df["winner_rate"] * np.maximum(
        0, price_dist_cost15
    )  # only counts when price is above the 15% cost line
    # 9. Change of the low-quantile cost support (is support moving up or down?).
    df["cost_support_15pct_change"] = (
        grouped["cost_15pct"].pct_change(1) * 100
    )  # percent change
    # 10. Categorical winner-rate / price-position zone: combine the winner
    # rate with where the close sits relative to the cost quantiles.
    conditions_winner = [
        (df["close"] > df["cost_85pct"]) & (df["winner_rate"] > 0.8),  # high position & high winner rate
        (df["close"] < df["cost_15pct"]) & (df["winner_rate"] < 0.2),  # low position & low winner rate
        (df["close"] > df["cost_50pct"])
        & (df["winner_rate"] > 0.5),  # upper-middle & majority profitable
        (df["close"] < df["cost_50pct"])
        & (df["winner_rate"] < 0.5),  # lower-middle & majority underwater
    ]
    choices_winner = [1, 2, 3, 4]  # 1: high-risk, 2: low-potential, 3: upper-profit, 4: lower-loss
    df["cat_winner_price_zone"] = np.select(
        conditions_winner, choices_winner, default=0
    )  # 0: other
    # --- Combined factors ---
    # 11. Flow-chip consistency: net large buying while the price sits inside
    # the lower cost band (between cost_15pct and cost_50pct).
    price_near_low_support = (df["close"] > df["cost_15pct"]) & (
        df["close"] < df["cost_50pct"]
    )
    df["flow_chip_consistency"] = df[
        "lg_elg_net_buy_vol"
    ] * price_near_low_support.astype(int)
    # could be standardized further or turned into a categorical
    # 12. Profit-taking pressure vs absorption: under a high winner rate
    # (> 0.7), is the large-player flow selling (taking profit) or buying?
    high_winner_rate_flag = (df["winner_rate"] > 0.7).astype(int)
    df["profit_taking_vs_absorb"] = df["lg_elg_net_buy_vol"] * high_winner_rate_flag
    # positive: large players still buying (absorbing); negative: selling out
    # Temp-column cleanup list (the drop below is intentionally disabled, so
    # these intermediate columns remain in the output).
    cols_to_drop = [
        "lg_elg_net_buy_vol",
        "sm_net_buy_vol",
        "total_buy_vol",
        "lg_elg_buy_prop",
        "lg_elg_net_buy_vol_change",
        "flow_lg_elg_intensity_rolling_high",
        "flow_lg_elg_intensity_rolling_low",
    ]
    # df = df.drop(columns=cols_to_drop)
    window = 20
    df["_is_positive"] = (df["pct_chg"] > 0).astype(int)
    df["_is_negative"] = (df["pct_chg"] < 0).astype(int)
    df["cat_is_positive"] = (df["pct_chg"] > 0).astype(int)
    # Split positive / negative returns (for per-sign mean and squared mean).
    # The raw returns are kept; they are not clipped to 0 for the factor itself.
    df["_pos_returns"] = df["pct_chg"].where(
        df["pct_chg"] > 0, 0
    )  # non-positive set to 0 so sums work
    df["_neg_returns"] = df["pct_chg"].where(
        df["pct_chg"] < 0, 0
    )  # non-negative set to 0 so sums work
    # Squared returns (for E[X^2]).
    df["_pos_returns_sq"] = np.square(df["_pos_returns"])
    df["_neg_returns_sq"] = np.square(df["_neg_returns"])  # squaring makes negatives positive
    # 4. Rolling statistics via built-ins (fast).
    # Positive-return-day statistics.
    rolling_pos_count = (
        grouped["_is_positive"].rolling(window, min_periods=max(1, window // 2)).sum()
    )
    rolling_pos_sum = (
        grouped["_pos_returns"].rolling(window, min_periods=max(1, window // 2)).sum()
    )
    rolling_pos_sum_sq = (
        grouped["_pos_returns_sq"]
        .rolling(window, min_periods=max(1, window // 2))
        .sum()
    )
    # Negative-return-day statistics.
    rolling_neg_count = (
        grouped["_is_negative"].rolling(window, min_periods=max(1, window // 2)).sum()
    )
    rolling_neg_sum = (
        grouped["_neg_returns"].rolling(window, min_periods=max(1, window // 2)).sum()
    )
    rolling_neg_sum_sq = (
        grouped["_neg_returns_sq"]
        .rolling(window, min_periods=max(1, window // 2))
        .sum()
    )
    # 5. Variance and std via E[X^2] - E[X]^2, per sign.
    pos_mean_sq = rolling_pos_sum_sq / rolling_pos_count
    pos_mean = rolling_pos_sum / rolling_pos_count
    pos_var = pos_mean_sq - np.square(pos_mean)
    pos_var = pos_var.where(rolling_pos_count >= 2, np.nan).clip(lower=0)
    upside_vol = np.sqrt(pos_var)
    neg_mean_sq = rolling_neg_sum_sq / rolling_neg_count
    neg_mean = rolling_neg_sum / rolling_neg_count  # note: neg_mean is negative
    neg_var = neg_mean_sq - np.square(neg_mean)
    neg_var = neg_var.where(rolling_neg_count >= 2, np.nan).clip(lower=0)
    downside_vol = np.sqrt(neg_var)
    # Rolling after a groupby yields a MultiIndex; drop the ts_code level to align.
    df["upside_vol"] = upside_vol.reset_index(level=0, drop=True)
    df["downside_vol"] = downside_vol.reset_index(level=0, drop=True)
    df["vol_ratio"] = df["upside_vol"] / df["downside_vol"]
    df["vol_ratio"] = (
        df["vol_ratio"].replace([np.inf, -np.inf], np.nan).fillna(0)
    )  # or fillna(np.nan)
    df["return_skew"] = (
        grouped["pct_chg"].rolling(window=5).skew().reset_index(0, drop=True)
    )
    df["return_kurtosis"] = (
        grouped["pct_chg"].rolling(window=5).kurt().reset_index(0, drop=True)
    )
    # Factor 1: short-term volume change rate (2-day mean vs 10-day mean).
    df["volume_change_rate"] = (
        grouped["vol"].rolling(window=2).mean()
        / grouped["vol"].rolling(window=10).mean()
        - 1
    ).reset_index(
        level=0, drop=True
    )  # align index
    # Factor 2: volume breakout signal.
    # NOTE(review): the rolling max includes the current bar, so vol can never
    # exceed it and this flag is always False — likely wants a .shift(1).
    max_volume = (
        grouped["vol"].rolling(window=5).max().reset_index(level=0, drop=True)
    )  # align index
    df["cat_volume_breakout"] = df["vol"] > max_volume
    # Factor 3: turnover deviation from its 3-day moving average.
    # NOTE(review): std_turnover can be 0, which makes this division produce inf.
    mean_turnover = (
        grouped["turnover_rate"]
        .rolling(window=3)
        .mean()
        .reset_index(level=0, drop=True)
    )
    std_turnover = (
        grouped["turnover_rate"].rolling(window=3).std().reset_index(level=0, drop=True)
    )
    df["turnover_deviation"] = (df["turnover_rate"] - mean_turnover) / std_turnover
    # Factor 4: turnover spike signal (above mean + 2 std).
    df["cat_turnover_spike"] = df["turnover_rate"] > mean_turnover + 2 * std_turnover
    # Factor 5: 3-day average volume ratio.
    df["avg_volume_ratio"] = (
        grouped["volume_ratio"].rolling(window=3).mean().reset_index(level=0, drop=True)
    )
    # Factor 6: volume-ratio breakout signal.
    # NOTE(review): same always-False issue as cat_volume_breakout (window
    # includes the current bar).
    max_volume_ratio = (
        grouped["volume_ratio"].rolling(window=5).max().reset_index(level=0, drop=True)
    )
    df["cat_volume_ratio_breakout"] = df["volume_ratio"] > max_volume_ratio
    # Despite the name, this stores the 20-day mean volume; the spike flag
    # itself is derived later (see get_simple_factor's cat_vol_spike).
    df["vol_spike"] = grouped.apply(
        lambda x: pd.Series(x["vol"].rolling(20).mean(), index=x.index)
    )
    # NOTE(review): pct_change is per-group, but the rolling std is applied to
    # the concatenated series, so the first windows of each stock can include
    # the previous stock's values — confirm this is intended.
    df["vol_std_5"] = grouped["vol"].pct_change().rolling(window=5).std()
    # ATR (talib, per stock).
    df["atr_14"] = grouped.apply(
        lambda x: pd.Series(
            talib.ATR(
                x["high"].values, x["low"].values, x["close"].values, timeperiod=14
            ),
            index=x.index,
        )
    )
    df["atr_6"] = grouped.apply(
        lambda x: pd.Series(
            talib.ATR(
                x["high"].values, x["low"].values, x["close"].values, timeperiod=6
            ),
            index=x.index,
        )
    )
    # OBV and its 6-day moving average (talib, per stock).
    df["obv"] = grouped.apply(
        lambda x: pd.Series(
            talib.OBV(x["close"].values, x["vol"].values), index=x.index
        )
    )
    print(df.columns)
    df["maobv_6"] = grouped.apply(
        lambda x: pd.Series(talib.SMA(x["obv"].values, timeperiod=6), index=x.index)
    )
    df["rsi_3"] = grouped.apply(
        lambda x: pd.Series(talib.RSI(x["close"].values, timeperiod=3), index=x.index)
    )
    # df['rsi_6'] = grouped.apply(
    #     lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=6), index=x.index)
    # )
    # df['rsi_9'] = grouped.apply(
    #     lambda x: pd.Series(talib.RSI(x['close'].values, timeperiod=9), index=x.index)
    # )
    # 5-day and 20-day returns per stock.
    df["return_5"] = grouped["close"].apply(lambda x: x / x.shift(5) - 1)
    # df['return_10'] = grouped['close'].apply(lambda x: x / x.shift(10) - 1)
    df["return_20"] = grouped["close"].apply(lambda x: x / x.shift(20) - 1)
    # df['avg_close_5'] = grouped['close'].apply(lambda x: x.rolling(window=5).mean() / x)
    # Return-volatility (std) indicators.
    df["std_return_5"] = grouped["close"].apply(
        lambda x: x.pct_change().rolling(window=5).std()
    )
    # df['std_return_15'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=15).std())
    # df['std_return_25'] = grouped['close'].apply(lambda x: x.pct_change().rolling(window=25).std())
    df["std_return_90"] = grouped["close"].apply(
        lambda x: x.pct_change().rolling(window=90).std()
    )
    # Same 90-day vol but computed on the 10-day-lagged price path.
    df["std_return_90_2"] = grouped["close"].apply(
        lambda x: x.shift(10).pct_change().rolling(window=90).std()
    )
    # EMA indicators (temp columns, dropped by get_simple_factor's "_"-prefix sweep).
    df["_ema_5"] = grouped["close"].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=5), index=x.index)
    )
    df["_ema_13"] = grouped["close"].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=13), index=x.index)
    )
    df["_ema_20"] = grouped["close"].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=20), index=x.index)
    )
    df["_ema_60"] = grouped["close"].apply(
        lambda x: pd.Series(talib.EMA(x.values, timeperiod=60), index=x.index)
    )
    # act_factor1..4: arctangent of each EMA's daily slope (in percent),
    # converted to degrees (* 57.3) and scaled per horizon.
    df["act_factor1"] = grouped["_ema_5"].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 50
    )
    df["act_factor2"] = grouped["_ema_13"].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 40
    )
    df["act_factor3"] = grouped["_ema_20"].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 21
    )
    df["act_factor4"] = grouped["_ema_60"].apply(
        lambda x: np.arctan((x / x.shift(1) - 1) * 100) * 57.3 / 10
    )
    # Cross-sectional (per trade_date) percentile ranks of the slope factors.
    df["rank_act_factor1"] = df.groupby("trade_date", group_keys=False)[
        "act_factor1"
    ].rank(ascending=False, pct=True)
    df["rank_act_factor2"] = df.groupby("trade_date", group_keys=False)[
        "act_factor2"
    ].rank(ascending=False, pct=True)
    df["rank_act_factor3"] = df.groupby("trade_date", group_keys=False)[
        "act_factor3"
    ].rank(ascending=False, pct=True)
    df["log_circ_mv"] = np.log(df["circ_mv"])
    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5
    # Rolling covariance of high vs volume, per stock.
    def calculate_rolling_cov(group):
        return group["high"].rolling(window_high_volume).cov(group["vol"])
    df["cov"] = grouped.apply(calculate_rolling_cov)
    # Difference of that covariance over period_delta days, per stock.
    def calculate_delta_cov(group):
        return group["cov"].diff(period_delta)
    df["delta_cov"] = grouped.apply(calculate_delta_cov)
    # Rolling close std, per stock.
    def calculate_stddev_close(group):
        return group["close"].rolling(window_close_stddev).std()
    df["_stddev_close"] = grouped.apply(calculate_stddev_close)
    df["_rank_stddev"] = df.groupby("trade_date")["_stddev_close"].rank(pct=True)
    df["alpha_22_improved"] = -1 * df["delta_cov"] * df["_rank_stddev"]
    # Candle body position within the day's range (0 when high == low).
    df["alpha_003"] = np.where(
        df["high"] != df["low"],
        (df["close"] - df["open"]) / (df["high"] - df["low"]),
        0,
    )
    # 5-day close/volume correlation, then cross-sectional rank.
    df["alpha_007"] = grouped.apply(lambda x: x["close"].rolling(5).corr(x["vol"]))
    df["alpha_007"] = df.groupby("trade_date", group_keys=False)["alpha_007"].rank(
        ascending=True, pct=True
    )
    # 5-day minus 20-day close sum, then cross-sectional rank.
    df["alpha_013"] = grouped["close"].transform(
        lambda x: x.rolling(5).sum() - x.rolling(20).sum()
    )
    df["alpha_013"] = df.groupby("trade_date", group_keys=False)["alpha_013"].rank(
        ascending=True, pct=True
    )
    # Breakout above the 85% cost line on elevated volume ratio.
    df["vol_break"] = np.where(
        (df["close"] > df["cost_85pct"]) & (df["volume_ratio"] > 2), 1, 0
    )
    # 5-day rate of change of the average holding cost.
    df["weight_roc5"] = grouped["weight_avg"].apply(lambda x: x.pct_change(5))
    # 10-day correlation of price returns vs average-cost returns, per stock.
    def rolling_corr(group):
        roc_close = group["close"].pct_change()
        roc_weight = group["weight_avg"].pct_change()
        return roc_close.rolling(10).corr(roc_weight)
    df["price_cost_divergence"] = grouped.apply(rolling_corr)
    # Chip concentration scaled inversely by (log) float market cap.
    df["smallcap_concentration"] = (1 / df["log_circ_mv"]) * (
        df["cost_85pct"] - df["cost_15pct"]
    )
    # 16. Chip stability index: 20-day volatility of the average cost,
    # normalized by its 20-day mean.
    df["weight_std20"] = grouped["weight_avg"].apply(lambda x: x.rolling(20).std())
    df["cost_stability"] = df["weight_std20"] / grouped["weight_avg"].transform(
        lambda x: x.rolling(20).mean()
    )
    # 17. Number of closes above the 95% cost line within the last 5 days.
    df["high_cost_break_days"] = grouped.apply(
        lambda g: g["close"].gt(g["cost_95pct"]).rolling(5).sum()
    )
    # 20. Chip-liquidity risk: cost spread divided by 10-day average volume.
    df["liquidity_risk"] = (df["cost_95pct"] - df["cost_5pct"]) * (
        1 / grouped["vol"].transform(lambda x: x.rolling(10).mean())
    )
    # 7. Turnover volatility scaled by (log) float market cap.
    df["turnover_std"] = grouped["turnover_rate"].transform(
        lambda x: x.rolling(window=20).std()
    )
    df["mv_volatility"] = grouped.apply(lambda x: x["turnover_std"] / x["log_circ_mv"])
    # 8. Size-growth factor: 20-day volume growth scaled by size.
    df["volume_growth"] = grouped["vol"].pct_change(periods=20)
    df["mv_growth"] = df["volume_growth"] / df["log_circ_mv"]
    df.drop(columns=["weight_std20"], inplace=True, errors="ignore")
    df.drop(
        columns=[
            "_is_positive",
            "_is_negative",
            "_pos_returns",
            "_neg_returns",
            "_pos_returns_sq",
            "_neg_returns_sq",
        ],
        inplace=True,
        errors="ignore",
    )
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
    return df, new_columns
def get_simple_factor(df):
    """Compute row-wise (non-rolling) composite factors and append them as columns.

    Builds on columns created by get_rolling_factor (volume_change_rate,
    turnover_deviation, vol_spike, obv, maobv_6, std_return_*, act_factor1..4,
    atr_14, ...) plus raw flow / chip columns, then drops every column whose
    name starts with "_". Returns (df, new_columns).

    NOTE(review): several ratios divide by net_mf_vol / atr_14 / cost spreads
    with no epsilon guard and can yield +/-inf — confirm downstream handling.
    """
    old_columns = df.columns.tolist()[:]
    df = df.sort_values(by=["ts_code", "trade_date"])
    alpha = 0.5  # weight of turnover deviation in the momentum composite
    df["momentum_factor"] = df["volume_change_rate"] + alpha * df["turnover_deviation"]
    df["resonance_factor"] = df["volume_ratio"] * df["pct_chg"]
    df["log_close"] = np.log(df["close"])
    # Spike flag: today's volume above twice its 20-day mean (the vol_spike column).
    df["cat_vol_spike"] = df["vol"] > 2 * df["vol_spike"]
    # Upper / lower candle shadow lengths relative to close.
    df["up"] = (df["high"] - df[["close", "open"]].max(axis=1)) / df["close"]
    df["down"] = (df[["close", "open"]].min(axis=1) - df["low"]) / df["close"]
    df["obv_maobv_6"] = df["obv"] - df["maobv_6"]
    # Ratio of short-horizon to long-horizon return volatility.
    df["std_return_5_over_std_return_90"] = df["std_return_5"] / df["std_return_90"]
    # df['std_return_5 / std_return_25'] = df['std_return_5'] / df['std_return_25']
    # Difference between current and 10-day-lagged 90-day volatility.
    df["std_return_90_minus_std_return_90_2"] = (
        df["std_return_90"] - df["std_return_90_2"]
    )
    # df['cat_af1'] = df['act_factor1'] > 0
    # EMA-slope ordering flags (longer-horizon slope above the shorter one).
    df["cat_af2"] = df["act_factor2"] > df["act_factor1"]
    df["cat_af3"] = df["act_factor3"] > df["act_factor2"]
    df["cat_af4"] = df["act_factor4"] > df["act_factor3"]
    # act_factor5 / act_factor6: sum and normalized difference of the slope factors.
    df["act_factor5"] = (
        df["act_factor1"] + df["act_factor2"] + df["act_factor3"] + df["act_factor4"]
    )
    df["act_factor6"] = (df["act_factor1"] - df["act_factor2"]) / np.sqrt(
        df["act_factor1"] ** 2 + df["act_factor2"] ** 2
    )
    # Flow components scaled by net money flow (+/-inf when net_mf_vol == 0).
    df["active_buy_volume_large"] = df["buy_lg_vol"] / df["net_mf_vol"]
    df["active_buy_volume_big"] = df["buy_elg_vol"] / df["net_mf_vol"]
    df["active_buy_volume_small"] = df["buy_sm_vol"] / df["net_mf_vol"]
    df["buy_lg_vol_minus_sell_lg_vol"] = (df["buy_lg_vol"] - df["sell_lg_vol"]) / df[
        "net_mf_vol"
    ]
    df["buy_elg_vol_minus_sell_elg_vol"] = (
        df["buy_elg_vol"] - df["sell_elg_vol"]
    ) / df["net_mf_vol"]
    df["log_circ_mv"] = np.log(df["circ_mv"])
    # Chip-structure ratios based on cost quantiles and the historic price range.
    df["ctrl_strength"] = (df["cost_85pct"] - df["cost_15pct"]) / (
        df["his_high"] - df["his_low"]
    )
    df["low_cost_dev"] = (df["close"] - df["cost_5pct"]) / (
        df["cost_50pct"] - df["cost_5pct"]
    )
    df["asymmetry"] = (df["cost_95pct"] - df["cost_50pct"]) / (
        df["cost_50pct"] - df["cost_5pct"]
    )
    df["lock_factor"] = df["turnover_rate"] * (
        1 - (df["cost_95pct"] - df["cost_5pct"]) / (df["his_high"] - df["his_low"])
    )
    df["cat_vol_break"] = (df["close"] > df["cost_85pct"]) & (df["volume_ratio"] > 2)
    df["cost_atr_adj"] = (df["cost_95pct"] - df["cost_5pct"]) / df["atr_14"]
    # 12. Small-cap chip concentration.
    df["smallcap_concentration"] = (1 / df["log_circ_mv"]) * (
        df["cost_85pct"] - df["cost_15pct"]
    )
    # "Golden resonance": price above average cost, high volume ratio, high winner rate.
    df["cat_golden_resonance"] = (
        (df["close"] > df["weight_avg"])
        & (df["volume_ratio"] > 1.5)
        & (df["winner_rate"] > 0.7)
    )
    # Size-scaled liquidity / momentum variants.
    df["mv_turnover_ratio"] = df["turnover_rate"] / df["log_circ_mv"]
    df["mv_adjusted_volume"] = df["vol"] / df["log_circ_mv"]
    df["mv_weighted_turnover"] = df["turnover_rate"] * (1 / df["log_circ_mv"])
    # NOTE(review): nonlinear_mv_volume duplicates mv_adjusted_volume exactly.
    df["nonlinear_mv_volume"] = df["vol"] / df["log_circ_mv"]
    df["mv_volume_ratio"] = df["volume_ratio"] / df["log_circ_mv"]
    df["mv_momentum"] = df["turnover_rate"] * df["volume_ratio"] / df["log_circ_mv"]
    # Drop every helper column created with a leading underscore.
    drop_columns = [col for col in df.columns if col.startswith("_")]
    df.drop(columns=drop_columns, inplace=True, errors="ignore")
    new_columns = [col for col in df.columns.tolist()[:] if col not in old_columns]
    return df, new_columns
import pandas as pd
import numpy as np
from scipy.stats import linregress # For factor 4 (if implementing slope directly)
# from hurst import compute_Hc # For factor 18, needs pip install hurst
# import statsmodels.api as sm # For factor 16, needs pip install statsmodels
# --- Constants ---
epsilon = 1e-10  # module-wide guard added to denominators to prevent division by zero
# --- Helper Functions ---
def _safe_divide(a, b, default_val=0):
"""Safe division, returns default_val for division by zero or NaN/inf results."""
with np.errstate(divide="ignore", invalid="ignore"):
result = a / b
# Replace NaN, Inf, -Inf resulting from division or invalid ops
result[~np.isfinite(result)] = default_val
return result
# --- Factor Calculation Functions (In-Place Modification) ---
# Category 1: Large Player Intent & Behavior
def lg_flow_mom_corr(
df: pd.DataFrame, N: int = 20, M: int = 60, factor_name: str = None
):
"""
Calculates Factor 1: Large Flow & Price Momentum Concordance (In-place).
WARNING: Modifies df in-place.
"""
if factor_name is None:
factor_name = f"lg_flow_mom_corr_{N}_{M}"
print(f"Calculating {factor_name}...")
_temp_cols = ["_net_lg_flow_val", "_rolling_net_lg_flow", "_price_mom"]
try:
df["_net_lg_flow_val"] = (
df["buy_lg_vol"]
+ df["buy_elg_vol"]
- df["sell_lg_vol"]
- df["sell_elg_vol"]
) * df["close"]
df["_rolling_net_lg_flow"] = (
df.groupby("ts_code")["_net_lg_flow_val"]
.rolling(N, min_periods=max(1, N // 2))
.sum()
.reset_index(level=0, drop=True)
)
df["_price_mom"] = df.groupby("ts_code")["close"].pct_change(N)
# Calculate correlation on the temporary Series to handle alignment
factor_series = (
df["_rolling_net_lg_flow"]
.rolling(M, min_periods=max(1, M // 2))
.corr(df["_price_mom"])
)
df[factor_name] = factor_series
except Exception as e:
print(f"Error calculating {factor_name}: {e}")
df[factor_name] = np.nan # Assign NaN on error
finally:
# Cleanup intermediate columns
cols_to_drop = [col for col in _temp_cols if col in df.columns]
if cols_to_drop:
df.drop(columns=cols_to_drop, inplace=True)
print(f"Finished {factor_name}.")
def lg_buy_consolidation(
    df: pd.DataFrame, N: int = 20, vol_quantile: float = 0.2, factor_name: str = None
):
    """
    Factor 2: average large-player net inflow during low-volatility consolidation.

    Per stock, computes the N-day mean of net (large + extra-large) flow as a
    share of total volume. The value is kept only where the stock's N-day
    close-price std falls below the ``vol_quantile`` quantile of that trade
    date's cross-section; all other rows become NaN.

    Writes column ``factor_name`` (default f"lg_buy_consolidation_{N}").
    WARNING: Modifies df in-place; temp columns are dropped in ``finally``.
    """
    if factor_name is None:
        factor_name = f"lg_buy_consolidation_{N}"
    print(f"Calculating {factor_name}...")
    _temp_cols = [
        "_rolling_std",
        "_net_lg_flow_ratio",
        "_rolling_net_lg_flow_ratio_mean",
        "_std_threshold",
    ]
    try:
        # N-day price volatility per stock.
        df["_rolling_std"] = (
            df.groupby("ts_code")["close"]
            .rolling(N, min_periods=max(1, N // 2))
            .std()
            .reset_index(level=0, drop=True)
        )
        # Net large-player flow as a fraction of total volume.
        df["_net_lg_flow_ratio"] = _safe_divide(
            (
                df["buy_lg_vol"]
                + df["buy_elg_vol"]
                - df["sell_lg_vol"]
                - df["sell_elg_vol"]
            ),
            df["vol"],
        )
        df["_rolling_net_lg_flow_ratio_mean"] = (
            df.groupby("ts_code")["_net_lg_flow_ratio"]
            .rolling(N, min_periods=max(1, N // 2))
            .mean()
            .reset_index(level=0, drop=True)
        )
        # Cross-sectional volatility threshold, evaluated per trade date.
        df["_std_threshold"] = df.groupby("trade_date")["_rolling_std"].transform(
            lambda x: x.quantile(vol_quantile)
        )
        # Keep the mean-flow value only in the low-volatility regime (else NaN).
        df[factor_name] = df["_rolling_net_lg_flow_ratio_mean"].where(
            df["_rolling_std"] < df["_std_threshold"]
        )
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
def lg_flow_accel(df: pd.DataFrame, factor_name: str = "lg_flow_accel"):
    """
    Factor 3: acceleration (second difference) of the net large + extra-large
    volume flow, computed per stock. Modifies df in-place; on error the
    factor column is filled with NaN.

    Note: only the first difference is taken inside the group; the leading
    NaN it leaves at each group start makes the second (plain) difference
    equivalent to a per-group one.
    """
    print(f"Calculating {factor_name}...")
    try:
        net_flow = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        df[factor_name] = net_flow.groupby(df["ts_code"]).diff(1).diff(1)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    print(f"Finished {factor_name}.")
def intraday_lg_flow_corr(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 4 (placeholder): approximate correlation between the intraday trend
    and the large-player flow.

    A rolling correlation between two per-group rolling series is expensive to
    express in pandas, so this stub only reserves the output column, filled
    with NaN. WARNING: Modifies df in-place.
    """
    if factor_name is None:
        factor_name = f"intraday_lg_flow_corr_{N}"
    print(f"Calculating {factor_name} (Placeholder - complex implementation)...")
    # Reserve the column; a real implementation would replace this.
    df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
# Category 2: Cost Basis & PnL Status
def profit_pressure(df: pd.DataFrame, factor_name: str = "profit_pressure"):
    """
    Factor 5: profit-pressure index — the winner rate weighted by the mean
    excess of the close over the 85% and 95% cost levels. Modifies df
    in-place; on error the factor column is filled with NaN.
    """
    print(f"Calculating {factor_name}...")
    try:
        # Relative gains of today's close over the two upper cost quantiles.
        margin_hi = _safe_divide(df["close"], df["cost_85pct"]) - 1
        margin_top = _safe_divide(df["close"], df["cost_95pct"]) - 1
        avg_margin = 0.5 * (margin_hi + margin_top)
        df[factor_name] = df["winner_rate"] * avg_margin
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    print(f"Finished {factor_name}.")
def underwater_resistance(df: pd.DataFrame, factor_name: str = "underwater_resistance"):
    """
    Factor 6: resistance from underwater positions — the share of losing
    holders times the relative distance the price would need to climb to
    reach the 15% cost line (zero when already above it). Modifies df
    in-place; on error the factor column is filled with NaN.
    """
    print(f"Calculating {factor_name}...")
    try:
        losing_share = 1.0 - df["winner_rate"]
        climb_to_cost15 = np.maximum(0, df["cost_15pct"] - df["close"]) / (
            df["close"] + epsilon
        )
        df[factor_name] = losing_share * climb_to_cost15
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    print(f"Finished {factor_name}.")
def cost_conc_std(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 7: cost-concentration change — the N-day rolling std (per stock)
    of the chip-cost spread (cost_85pct - cost_15pct) normalized by the
    average holding cost.

    Writes column ``factor_name`` (default f"cost_conc_std_{N}").
    WARNING: Modifies df in-place; temp column dropped in ``finally``.
    """
    if factor_name is None:
        factor_name = f"cost_conc_std_{N}"
    print(f"Calculating {factor_name}...")
    _temp_cols = ["_cost_range_norm"]
    try:
        # Both _safe_divide and the epsilon term guard the denominator.
        df["_cost_range_norm"] = _safe_divide(
            (df["cost_85pct"] - df["cost_15pct"]), (df["weight_avg"] + epsilon)
        )
        # The rolling std must run before the temp column is dropped in finally.
        factor_series = (
            df.groupby("ts_code")["_cost_range_norm"]
            .rolling(N, min_periods=max(1, N // 2))
            .std()
            .reset_index(level=0, drop=True)
        )
        df[factor_name] = factor_series
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
def profit_decay(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 8: profit-expectation decay — the N-day return divided by the
    N-day change in winner_rate, per stock. Large values mean price gains
    outpaced growth in the share of profitable holders.

    ``_safe_divide`` maps a zero winner_rate change to 0 rather than inf.
    Writes column ``factor_name`` (default f"profit_decay_{N}").
    WARNING: Modifies df in-place; temp columns dropped in ``finally``.
    """
    if factor_name is None:
        factor_name = f"profit_decay_{N}"
    print(f"Calculating {factor_name}...")
    _temp_cols = ["_ret_N", "_winner_rate_change_N"]
    try:
        # N-day simple return per stock.
        df["_ret_N"] = (
            _safe_divide(df["close"], df.groupby("ts_code")["close"].shift(N)) - 1
        )
        df["_winner_rate_change_N"] = df.groupby("ts_code")["winner_rate"].diff(N)
        df[factor_name] = _safe_divide(df["_ret_N"], df["_winner_rate_change_N"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
# Category 3: Volatility Source & Market State
def vol_amp_loss(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 9: volatility amplified by underwater depth — the N-day std of
    pct_chg (per stock) times the relative shortfall of the close below the
    average holding cost (weight_avg); the shortfall term is zero when the
    price is at or above cost.

    Writes column ``factor_name`` (default f"vol_amp_loss_{N}").
    WARNING: Modifies df in-place; temp columns dropped in ``finally``.
    """
    if factor_name is None:
        factor_name = f"vol_amp_loss_{N}"
    print(f"Calculating {factor_name}...")
    _temp_cols = ["_vol_N", "_loss_degree"]
    try:
        df["_vol_N"] = (
            df.groupby("ts_code")["pct_chg"]
            .rolling(N, min_periods=max(1, N // 2))
            .std()
            .reset_index(level=0, drop=True)
        )
        # How far the price sits below the average cost, relative to price.
        df["_loss_degree"] = np.maximum(0, df["weight_avg"] - df["close"]) / (
            df["close"] + epsilon
        )
        df[factor_name] = df["_vol_N"] * df["_loss_degree"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
def vol_drop_profit_cnt(
    df: pd.DataFrame,
    N: int = 20,
    M: int = 5,
    profit_thresh: float = 0.1,
    drop_thresh: float = -0.03,
    vol_multiple: float = 2.0,
    factor_name: str = None,
):
    """
    Factor 10: count over the last M days of "high-volume drop while
    profitable" events, per stock. An event day requires all three of:

      * close more than ``profit_thresh`` above the average cost (weight_avg);
      * daily pct_chg below ``drop_thresh``;
      * volume above its N-day mean plus ``vol_multiple`` N-day stds.

    Writes column ``factor_name`` (default f"vol_drop_profit_cnt_{M}").
    WARNING: Modifies df in-place; temp columns dropped in ``finally``.
    """
    if factor_name is None:
        factor_name = f"vol_drop_profit_cnt_{M}"
    print(f"Calculating {factor_name}...")
    _temp_cols = [
        "_is_profitable",
        "_is_dropping",
        "_rolling_mean_vol",
        "_rolling_std_vol",
        "_is_high_vol",
        "_event",
    ]
    try:
        df["_is_profitable"] = df["close"] > df["weight_avg"] * (1 + profit_thresh)
        df["_is_dropping"] = df["pct_chg"] < drop_thresh
        df["_rolling_mean_vol"] = (
            df.groupby("ts_code")["vol"]
            .rolling(N, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )
        # std needs >= 2 observations; early NaNs are treated as zero spread.
        df["_rolling_std_vol"] = (
            df.groupby("ts_code")["vol"]
            .rolling(N, min_periods=2)
            .std()
            .reset_index(level=0, drop=True)
            .fillna(0)
        )
        df["_is_high_vol"] = df["vol"] > (
            df["_rolling_mean_vol"] + vol_multiple * df["_rolling_std_vol"]
        )
        df["_event"] = (
            df["_is_profitable"] & df["_is_dropping"] & df["_is_high_vol"]
        ).astype(int)
        # M-day rolling count of event days per stock.
        factor_series = (
            df.groupby("ts_code")["_event"]
            .rolling(M, min_periods=1)
            .sum()
            .reset_index(level=0, drop=True)
        )
        df[factor_name] = factor_series
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
def lg_flow_vol_interact(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 11: large-flow-driven volatility — the N-day std of pct_chg times
    the N-day mean of |net large flow value| / total traded value, per stock.
    High values mean recent volatility coincided with heavy large-player
    participation.

    Writes column ``factor_name`` (default f"lg_flow_vol_interact_{N}").
    WARNING: Modifies df in-place; temp columns dropped in ``finally``.
    """
    if factor_name is None:
        factor_name = f"lg_flow_vol_interact_{N}"
    print(f"Calculating {factor_name}...")
    _temp_cols = [
        "_vol_N",
        "_net_lg_flow_val",
        "_total_val",
        "_abs_net_lg_flow_ratio",
        "_abs_net_lg_flow_ratio_N",
    ]
    try:
        df["_vol_N"] = (
            df.groupby("ts_code")["pct_chg"]
            .rolling(N, min_periods=max(1, N // 2))
            .std()
            .reset_index(level=0, drop=True)
        )
        # Net large-player flow in currency terms (volume delta * close).
        df["_net_lg_flow_val"] = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        ) * df["close"]
        df["_total_val"] = df["vol"] * df["close"]
        df["_abs_net_lg_flow_ratio"] = abs(df["_net_lg_flow_val"]) / (
            df["_total_val"] + epsilon
        )
        df["_abs_net_lg_flow_ratio_N"] = (
            df.groupby("ts_code")["_abs_net_lg_flow_ratio"]
            .rolling(N, min_periods=max(1, N // 2))
            .mean()
            .reset_index(level=0, drop=True)
        )
        df[factor_name] = df["_vol_N"] * df["_abs_net_lg_flow_ratio_N"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
def cost_break_confirm_cnt(df: pd.DataFrame, M: int = 5, factor_name: str = None):
    """
    Factor 12: net count over M days of cost-band breakouts confirmed by
    large-player flow. An up-confirmation is a close above yesterday's 85%
    cost line with positive net large flow; a down-confirmation is a close
    below yesterday's 15% cost line with negative net large flow. The factor
    is the M-day rolling sum of (up - down), per stock.

    Modifies df in-place; on error the factor column is filled with NaN.
    """
    if factor_name is None:
        factor_name = f"cost_break_confirm_cnt_{M}"
    print(f"Calculating {factor_name}...")
    try:
        by_code = df.groupby("ts_code")
        prev_hi = by_code["cost_85pct"].shift(1)
        prev_lo = by_code["cost_15pct"].shift(1)
        net_lg_flow = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        confirmed_up = ((df["close"] > prev_hi) & (net_lg_flow > 0)).astype(int)
        confirmed_down = ((df["close"] < prev_lo) & (net_lg_flow < 0)).astype(int)
        net_confirm = confirmed_up - confirmed_down
        df[factor_name] = (
            net_confirm.groupby(df["ts_code"])
            .rolling(M, min_periods=1)
            .sum()
            .reset_index(level=0, drop=True)
        )
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    print(f"Finished {factor_name}.")
# Category 4: Technical Indicators & Market Behavior
def atr_norm_channel_pos(df: pd.DataFrame, N: int = 14, factor_name: str = None):
    """
    Factor 13: ATR-normalized channel position — how many (simple) ATR units
    the close sits above the N-day rolling low, per stock.

    Note: the ATR here is a plain rolling mean of true range, not the Wilder
    smoothing used by talib.ATR elsewhere in this file.
    Writes column ``factor_name`` (default f"atr_norm_channel_pos_{N}").
    WARNING: Modifies df in-place; temp columns dropped in ``finally``.
    """
    if factor_name is None:
        factor_name = f"atr_norm_channel_pos_{N}"
    print(f"Calculating {factor_name}...")
    _temp_cols = [
        "_prev_close",
        "_h_l",
        "_h_pc",
        "_l_pc",
        "_tr",
        "_atr_N",
        "_roll_low_N",
    ]
    try:
        df["_prev_close"] = df.groupby("ts_code")["close"].shift(1)
        # True range: max of (high-low, |high-prev close|, |low-prev close|).
        df["_h_l"] = df["high"] - df["low"]
        df["_h_pc"] = abs(df["high"] - df["_prev_close"])
        df["_l_pc"] = abs(df["low"] - df["_prev_close"])
        df["_tr"] = df[["_h_l", "_h_pc", "_l_pc"]].max(axis=1)
        df["_atr_N"] = (
            df.groupby("ts_code")["_tr"]
            .rolling(N, min_periods=max(1, N // 2))
            .mean()
            .reset_index(level=0, drop=True)
        )
        df["_roll_low_N"] = (
            df.groupby("ts_code")["low"]
            .rolling(N, min_periods=max(1, N // 2))
            .min()
            .reset_index(level=0, drop=True)
        )
        df[factor_name] = _safe_divide((df["close"] - df["_roll_low_N"]), df["_atr_N"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
def turnover_diff_skew(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 14: rolling skewness of the day-over-day change in turnover rate.
    In-place: writes `factor_name`; the scratch diff column is dropped on exit.
    """
    if factor_name is None:
        factor_name = f"turnover_diff_skew_{N}"
    print(f"Calculating {factor_name}...")
    scratch = ["_turnover_diff"]
    try:
        # turnover_rate is taken to be in percentage points, so a plain diff suffices.
        df["_turnover_diff"] = df.groupby("ts_code")["turnover_rate"].diff(1)
        skew_series = (
            df.groupby("ts_code")["_turnover_diff"]
            .rolling(N, min_periods=max(3, N // 2))
            .skew()
            .reset_index(level=0, drop=True)
        )
        df[factor_name] = skew_series
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def lg_sm_flow_diverge(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 15: spread between the N-day mean large-order and small-order
    net-flow ratios (each net flow scaled by total volume).
    In-place: writes `factor_name`; scratch columns are dropped on exit.
    """
    if factor_name is None:
        factor_name = f"lg_sm_flow_diverge_{N}"
    print(f"Calculating {factor_name}...")
    scratch = [
        "_lg_flow_ratio",
        "_sm_flow_ratio",
        "_lg_flow_ratio_N",
        "_sm_flow_ratio_N",
    ]
    try:
        net_lg = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        df["_lg_flow_ratio"] = _safe_divide(net_lg, df["vol"])
        df["_sm_flow_ratio"] = _safe_divide(
            df["buy_sm_vol"] - df["sell_sm_vol"], df["vol"]
        )
        min_p = max(1, N // 2)
        for src, dst in (
            ("_lg_flow_ratio", "_lg_flow_ratio_N"),
            ("_sm_flow_ratio", "_sm_flow_ratio_N"),
        ):
            df[dst] = (
                df.groupby("ts_code")[src]
                .rolling(N, min_periods=min_p)
                .mean()
                .reset_index(level=0, drop=True)
            )
        df[factor_name] = df["_lg_flow_ratio_N"] - df["_sm_flow_ratio_N"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cap_neutral_cost_metric(
    df: pd.DataFrame, factor_name: str = "cap_neutral_cost_metric"
):
    """
    Factor 16 placeholder: market-cap neutralized cost metric.
    The real computation needs statsmodels regressions; until implemented
    the column is simply filled with NaN. In-place.
    """
    print(f"Calculating {factor_name} (Placeholder - requires statsmodels)...")
    df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
def pullback_strong(
    df: pd.DataFrame,
    N: int = 20,
    M: int = 20,
    gain_thresh: float = 0.2,
    factor_name: str = None,
):
    """
    Factor 17: pullback depth from the N-day high, scaled by the trailing
    M-day gain. NOTE(review): `gain_thresh` is accepted for interface
    compatibility but currently unused.
    In-place: writes `factor_name`; scratch columns are dropped on exit.
    """
    if factor_name is None:
        factor_name = f"pullback_strong_{N}_{M}"
    print(f"Calculating {factor_name}...")
    scratch = ["_high_N", "_pullback_depth", "_recent_gain_M"]
    try:
        df["_high_N"] = (
            df.groupby("ts_code")["high"]
            .rolling(N, min_periods=max(1, N // 2))
            .max()
            .reset_index(level=0, drop=True)
        )
        df["_pullback_depth"] = _safe_divide(
            df["_high_N"] - df["close"], df["_high_N"]
        )
        lagged_close = df.groupby("ts_code")["close"].shift(M)
        df["_recent_gain_M"] = _safe_divide(df["close"], lagged_close) - 1
        df[factor_name] = _safe_divide(df["_pullback_depth"], df["_recent_gain_M"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def hurst_exponent_flow(
    df: pd.DataFrame, N: int = 60, flow_col: str = "net_mf_vol", factor_name: str = None
):
    """
    Factor 18 placeholder: rolling Hurst exponent of a money-flow series.
    Only probes availability of the optional 'hurst' package; the actual
    rolling computation is not implemented, so the column is filled with NaN
    on every path. In-place.
    """
    if factor_name is None:
        factor_name = f"hurst_{flow_col}_{N}"
    print(f"Calculating {factor_name} (Placeholder - requires hurst library)...")
    try:
        from hurst import compute_Hc  # noqa: F401 — availability probe only
    except ImportError:
        print("Error: 'hurst' library not installed. Cannot calculate factor.")
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
    # Every branch (success, missing library, unexpected failure) currently
    # yields NaN, so assign once here.
    df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
def vol_wgt_hist_pos(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 19: position of the close inside the [his_low, his_high] band
    (clipped to [0, 1]), weighted by relative volume strength
    (vol over its N-day rolling mean).
    In-place: writes `factor_name`; scratch columns are dropped on exit.
    """
    if factor_name is None:
        factor_name = f"vol_wgt_hist_pos_{N}"
    print(f"Calculating {factor_name}...")
    scratch = ["_hist_pos", "_rolling_mean_vol", "_vol_rel_strength"]
    try:
        band = df["his_high"] - df["his_low"]
        df["_hist_pos"] = _safe_divide(df["close"] - df["his_low"], band).clip(0, 1)
        df["_rolling_mean_vol"] = (
            df.groupby("ts_code")["vol"]
            .rolling(N, min_periods=max(1, N // 2))
            .mean()
            .reset_index(level=0, drop=True)
        )
        df["_vol_rel_strength"] = _safe_divide(df["vol"], df["_rolling_mean_vol"])
        df[factor_name] = df["_hist_pos"] * df["_vol_rel_strength"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def vol_adj_roc(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 20: N-day rate of change divided by the N-day standard deviation
    of daily percentage changes (volatility-adjusted momentum).
    In-place: writes `factor_name`; scratch columns are dropped on exit.
    """
    if factor_name is None:
        factor_name = f"vol_adj_roc_{N}"
    print(f"Calculating {factor_name}...")
    scratch = ["_roc_N", "_vol_N"]
    try:
        lagged_close = df.groupby("ts_code")["close"].shift(N)
        df["_roc_N"] = _safe_divide(df["close"], lagged_close) - 1
        df["_vol_N"] = (
            df.groupby("ts_code")["pct_chg"]
            .rolling(N, min_periods=max(2, N // 2))
            .std()
            .reset_index(level=0, drop=True)
            .fillna(0)
        )
        df[factor_name] = _safe_divide(df["_roc_N"], df["_vol_N"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def calculate_complex_factor(
    df: pd.DataFrame, factor_name: str = "complex_factor_deap_1"
):
    """
    Evaluate the GP-derived expression sub(protected_div(A, B), C) in place.

    Components (all divisions via _safe_divide, left-to-right order preserved):
        D = pullback_strong_20_20 * (log_close / industry_return_5)
            - (vol_adj_roc_20 + vol_drop_profit_cnt_5) / (nonlinear_mv_volume - alpha_007)
        A = D * lg_buy_consolidation_20 + lg_buy_consolidation_20 + pullback_strong_20_20
        F = (net_mf_vol + std_return_5) * (arbr - industry_act_factor5)
        H = (industry_act_factor1 + low_cost_dev) + mv_weighted_turnover * act_factor4
        B = ((F + vol) + H) / lg_elg_buy_prop
        C = (intraday_lg_flow_corr_20 / lg_elg_buy_prop) / lg_elg_buy_prop
        factor = A / B - C

    On a missing column or any other error, `factor_name` is filled with NaN.
    WARNING: modifies df in place.
    """
    print(f"开始计算因子: {factor_name} (原地修改)...")
    # Only these names are materialized as DataFrame columns; the previous
    # version also listed plain local-variable names here by mistake.
    _temp_cols_list = ["_temp_D", "_temp_A", "_temp_F", "_temp_H", "_temp_B", "_temp_C"]
    try:
        # Component D
        d_left = df["pullback_strong_20_20"] * _safe_divide(
            df["log_close"], df["industry_return_5"]
        )
        d_right = _safe_divide(
            df["vol_adj_roc_20"] + df["vol_drop_profit_cnt_5"],
            df["nonlinear_mv_volume"] - df["alpha_007"],
        )
        df["_temp_D"] = d_left - d_right
        # Component A
        df["_temp_A"] = (
            df["_temp_D"] * df["lg_buy_consolidation_20"]
            + df["lg_buy_consolidation_20"]
            + df["pullback_strong_20_20"]
        )
        # Component F
        df["_temp_F"] = (df["net_mf_vol"] + df["std_return_5"]) * (
            df["arbr"] - df["industry_act_factor5"]
        )
        # Component H
        df["_temp_H"] = (
            df["industry_act_factor1"]
            + df["low_cost_dev"]
            + df["mv_weighted_turnover"] * df["act_factor4"]
        )
        # Component B
        df["_temp_B"] = _safe_divide(
            df["_temp_F"] + df["vol"] + df["_temp_H"], df["lg_elg_buy_prop"]
        )
        # Component C (.get tolerates a missing intraday correlation column)
        c_inner = _safe_divide(
            df.get("intraday_lg_flow_corr_20", np.nan), df["lg_elg_buy_prop"]
        )
        df["_temp_C"] = _safe_divide(c_inner, df["lg_elg_buy_prop"])
        # Final expression: A / B - C
        df[factor_name] = _safe_divide(df["_temp_A"], df["_temp_B"]) - df["_temp_C"]
        print(f"因子 {factor_name} 计算成功。")
    except KeyError as e:
        print(f"错误: 计算 {factor_name} 时缺少必需的列: {e}")
        print("请确保输入的 DataFrame 包含所有表达式中引用的因子列。")
        # Bug fix: this message previously lacked the f-prefix and printed
        # the literal text "{factor_name}".
        print(f"将为因子 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"错误: 计算 {factor_name} 时发生意外错误: {e}")
        print(f"将为因子 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
    finally:
        # Clean up every intermediate column that was actually created.
        cols_to_drop = [col for col in _temp_cols_list if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
        print(f"因子 {factor_name} 计算流程结束。")
import pandas as pd
import numpy as np
# from scipy.stats import rankdata # rankdata is not needed if using pandas rank
# import statsmodels.api as sm # Needed for factor 19
# --- Constants ---
epsilon = 1e-10 # Prevent division by zero
# --- Helper Functions ---
def _safe_divide(numerator, denominator, default_val=0):
"""
安全的除法函数处理分母为零或接近零以及NaN/Inf的情况。
Args:
numerator (pd.Series): 分子.
denominator (pd.Series): 分母.
default_val (float): 当分母为零或结果无效时返回的默认值.
Returns:
pd.Series: 除法结果.
"""
with np.errstate(divide="ignore", invalid="ignore"):
# Convert inputs to numeric, coercing errors to NaN before division
num = pd.to_numeric(numerator, errors="coerce")
den = pd.to_numeric(denominator, errors="coerce")
# Perform division where denominator is not close to zero and inputs are valid numbers
result = np.where(np.abs(den) > epsilon, num / den, default_val)
# Ensure result is float, handle potential NaNs from coercion or division
result = pd.to_numeric(result, errors="coerce")
# Fill remaining NaNs if necessary
result = np.nan_to_num(
result, nan=default_val, posinf=default_val, neginf=default_val
)
# Ensure result index matches numerator's index if numerator is a Series
if isinstance(numerator, pd.Series):
return pd.Series(result, index=numerator.index)
else:
return pd.Series(result) # Fallback if numerator is not a Series (less likely)
# --- Cross-Sectional Factor Calculation Functions (In-Place Modification) ---
# Category 1: Cross-Sectional Flow & Behavior Strength
def cs_rank_net_lg_flow_val(
    df: pd.DataFrame, factor_name: str = "cs_rank_net_lg_flow_val"
):
    """
    Factor 1: daily cross-sectional percentile rank of net large-order flow
    in value terms ((lg+elg buys - lg+elg sells) * close).
    In-place: writes `factor_name`; the scratch column is dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_net_lg_flow_val"]
    try:
        net_vol = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        df["_net_lg_flow_val"] = net_vol * df["close"]
        df[factor_name] = df.groupby("trade_date")["_net_lg_flow_val"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_flow_divergence(
    df: pd.DataFrame, factor_name: str = "cs_rank_flow_divergence"
):
    """
    Factor 2: daily cross-sectional rank of the large-vs-small order flow
    divergence (each net flow scaled by total volume).
    In-place: writes `factor_name`; scratch columns are dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_lg_ratio", "_sm_ratio", "_divergence"]
    try:
        net_lg = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        df["_lg_ratio"] = _safe_divide(net_lg, df["vol"])
        df["_sm_ratio"] = _safe_divide(
            df["buy_sm_vol"] - df["sell_sm_vol"], df["vol"]
        )
        df["_divergence"] = df["_lg_ratio"] - df["_sm_ratio"]
        df[factor_name] = df.groupby("trade_date")["_divergence"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_industry_adj_lg_flow(
    df: pd.DataFrame, factor_name: str = "cs_rank_ind_adj_lg_flow"
):
    """
    Factor 3: daily rank of large-order flow value relative to the stock's
    L2-industry mean. Requires a 'cat_l2_code' column; otherwise NaN.
    In-place: writes `factor_name`; scratch columns are dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_net_lg_flow_vol", "_industry_avg_flow", "_deviation"]
    if "cat_l2_code" not in df.columns:
        print(
            f"Error calculating {factor_name}: Missing 'cat_l2_code' column. Assigning NaN."
        )
        df[factor_name] = np.nan
        return
    try:
        net_vol = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        df["_net_lg_flow_vol"] = net_vol * df["close"]  # value proxy; vol also works
        df["_industry_avg_flow"] = df.groupby(["trade_date", "cat_l2_code"])[
            "_net_lg_flow_vol"
        ].transform("mean")
        df["_deviation"] = df["_net_lg_flow_vol"] - df["_industry_avg_flow"]
        df[factor_name] = df.groupby("trade_date")["_deviation"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_elg_buy_ratio(df: pd.DataFrame, factor_name: str = "cs_rank_elg_buy_ratio"):
    """
    Factor 4: daily cross-sectional rank of extra-large buy volume share
    (buy_elg_vol / vol). In-place; scratch column dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_elg_buy_ratio"]
    try:
        df["_elg_buy_ratio"] = _safe_divide(df["buy_elg_vol"], df["vol"])
        df[factor_name] = df.groupby("trade_date")["_elg_buy_ratio"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
# Category 2: Cross-Sectional Cost Basis & PnL Status
def cs_rank_rel_profit_margin(
    df: pd.DataFrame, factor_name: str = "cs_rank_rel_profit_margin"
):
    """
    Factor 5: daily cross-sectional rank of relative profit margin
    ((close - weight_avg) / close). In-place; scratch column dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_profit_margin"]
    try:
        df["_profit_margin"] = _safe_divide(
            df["close"] - df["weight_avg"], df["close"]
        )
        df[factor_name] = df.groupby("trade_date")["_profit_margin"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_cost_breadth(df: pd.DataFrame, factor_name: str = "cs_rank_cost_breadth"):
    """
    Factor 6: daily cross-sectional rank of the cost-distribution width
    ((cost_85pct - cost_15pct) / weight_avg). In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_cost_breadth"]
    try:
        df["_cost_breadth"] = _safe_divide(
            df["cost_85pct"] - df["cost_15pct"], df["weight_avg"]
        )
        df[factor_name] = df.groupby("trade_date")["_cost_breadth"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_dist_to_upper_cost(
    df: pd.DataFrame, factor_name: str = "cs_rank_dist_to_upper_cost"
):
    """
    Factor 7: daily cross-sectional rank of close relative to the 95th-pct
    cost level (close / cost_95pct). In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_dist_to_95"]
    try:
        df["_dist_to_95"] = _safe_divide(df["close"], df["cost_95pct"])
        df[factor_name] = df.groupby("trade_date")["_dist_to_95"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_winner_rate(df: pd.DataFrame, factor_name: str = "cs_rank_winner_rate"):
    """
    Factor 8: daily cross-sectional percentile rank of the winner rate
    (share of in-profit holders). In-place; no scratch columns needed.
    """
    print(f"Calculating {factor_name}...")
    try:
        ranked = df.groupby("trade_date")["winner_rate"].rank(pct=True)
        df[factor_name] = ranked
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        print(f"Finished {factor_name}.")
# Category 3: Cross-Sectional Price Action & Volatility
def cs_rank_intraday_range(
    df: pd.DataFrame, factor_name: str = "cs_rank_intraday_range"
):
    """
    Factor 9: daily cross-sectional rank of the normalized intraday range
    ((high - low) / close). In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_norm_range"]
    try:
        df["_norm_range"] = _safe_divide(df["high"] - df["low"], df["close"])
        df[factor_name] = df.groupby("trade_date")["_norm_range"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_close_pos_in_range(
    df: pd.DataFrame, factor_name: str = "cs_rank_close_pos_in_range"
):
    """
    Factor 10: daily cross-sectional rank of the close's position inside the
    day's range ((close - low) / (high - low); 0.5 when high == low).
    In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_close_pos"]
    try:
        # default_val=0.5 places limit days (high == low) mid-range.
        df["_close_pos"] = _safe_divide(
            df["close"] - df["low"], df["high"] - df["low"], default_val=0.5
        )
        df[factor_name] = df.groupby("trade_date")["_close_pos"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_opening_gap(df: pd.DataFrame, factor_name: str = "cs_rank_opening_gap"):
    """
    Factor 11: daily cross-sectional rank of the opening gap
    (open / pre_close - 1). Requires a 'pre_close' column; otherwise NaN.
    In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_gap"]
    if "pre_close" not in df.columns:
        print(
            f"Error calculating {factor_name}: Missing 'pre_close' column. Assigning NaN."
        )
        df[factor_name] = np.nan
        return
    try:
        df["_gap"] = _safe_divide(df["open"], df["pre_close"]) - 1
        df[factor_name] = df.groupby("trade_date")["_gap"].rank(pct=True)
    except KeyError as e:
        print(
            f"Error calculating {factor_name}: Missing column {e} (likely 'open'). Assigning NaN."
        )
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_pos_in_hist_range(
    df: pd.DataFrame, factor_name: str = "cs_rank_pos_in_hist_range"
):
    """
    Factor 12: daily cross-sectional rank of the close's position inside
    the historical [his_low, his_high] band, clipped to [0, 1].
    In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_hist_pos"]
    try:
        band = df["his_high"] - df["his_low"]
        df["_hist_pos"] = _safe_divide(df["close"] - df["his_low"], band).clip(0, 1)
        df[factor_name] = df.groupby("trade_date")["_hist_pos"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
# Category 4: Cross-Sectional Interaction & Composite Indicators
def cs_rank_vol_x_profit_margin(
    df: pd.DataFrame, factor_name: str = "cs_rank_vol_x_profit_margin"
):
    """
    Factor 13: daily cross-sectional rank of |pct_chg| times the relative
    profit margin ((close - weight_avg) / close).
    In-place; scratch columns dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_daily_vol", "_profit_margin", "_interaction"]
    try:
        df["_daily_vol"] = df["pct_chg"].abs()
        df["_profit_margin"] = _safe_divide(
            df["close"] - df["weight_avg"], df["close"]
        )
        df["_interaction"] = df["_daily_vol"] * df["_profit_margin"]
        df[factor_name] = df.groupby("trade_date")["_interaction"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_lg_flow_price_concordance(
    df: pd.DataFrame, factor_name: str = "cs_rank_lg_flow_price_concordance"
):
    """
    Factor 14: daily cross-sectional rank of net large-order volume times the
    day's percentage change (flow/price agreement).
    In-place; scratch columns dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_net_lg_flow_vol", "_concordance"]
    try:
        df["_net_lg_flow_vol"] = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        df["_concordance"] = df["_net_lg_flow_vol"] * df["pct_chg"]
        df[factor_name] = df.groupby("trade_date")["_concordance"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_turnover_per_winner(
    df: pd.DataFrame, factor_name: str = "cs_rank_turnover_per_winner"
):
    """
    Factor 15: daily cross-sectional rank of turnover per unit of winner
    rate (turnover_rate / winner_rate). In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_turnover_per_winner"]
    try:
        df["_turnover_per_winner"] = _safe_divide(
            df["turnover_rate"], df["winner_rate"]
        )
        df[factor_name] = df.groupby("trade_date")["_turnover_per_winner"].rank(
            pct=True
        )
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_ind_cap_neutral_pe(
    df: pd.DataFrame, factor_name: str = "cs_rank_ind_cap_neutral_pe"
):
    """
    Factor 16 placeholder: industry/market-cap neutralized PE rank.
    Needs statsmodels cross-sectional regressions; until implemented the
    column is simply filled with NaN. In-place.
    """
    print(f"Calculating {factor_name} (Placeholder - requires statsmodels)...")
    df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
def cs_rank_volume_ratio(df: pd.DataFrame, factor_name: str = "cs_rank_volume_ratio"):
    """
    Factor 17: daily cross-sectional percentile rank of the pre-existing
    'volume_ratio' column. In-place; no scratch columns needed.
    """
    print(f"Calculating {factor_name}...")
    try:
        # The volume-ratio indicator must already be present on df.
        ranked = df.groupby("trade_date")["volume_ratio"].rank(pct=True)
        df[factor_name] = ranked
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        print(f"Finished {factor_name}.")
def cs_rank_elg_buy_sell_sm_ratio(
    df: pd.DataFrame, factor_name: str = "cs_rank_elg_buy_sell_sm_ratio"
):
    """
    Factor 18: daily cross-sectional rank of extra-large buys relative to
    small-order sells (buy_elg_vol / sell_sm_vol).
    In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_ratio"]
    try:
        df["_ratio"] = _safe_divide(df["buy_elg_vol"], df["sell_sm_vol"])
        df[factor_name] = df.groupby("trade_date")["_ratio"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_cost_dist_vol_ratio(
    df: pd.DataFrame, factor_name: str = "cs_rank_cost_dist_vol_ratio"
):
    """
    Factor 19: daily cross-sectional rank of |close - weight_avg| / close
    scaled by 'volume_ratio'. Requires a 'volume_ratio' column; otherwise NaN.
    In-place; scratch columns dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_dist", "_interaction"]
    if "volume_ratio" not in df.columns:
        print(
            f"Error calculating {factor_name}: Missing 'volume_ratio' column. Assigning NaN."
        )
        df[factor_name] = np.nan
        return
    try:
        df["_dist"] = (df["close"] - df["weight_avg"]).abs() / (df["close"] + epsilon)
        df["_interaction"] = df["_dist"] * df["volume_ratio"]
        df[factor_name] = df.groupby("trade_date")["_interaction"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def cs_rank_size(df: pd.DataFrame, factor_name: str = "cs_rank_size"):
    """
    Factor 20: daily cross-sectional rank of log(1 + circulating market cap).
    log1p keeps zero / tiny caps well-defined. In-place; scratch dropped on exit.
    """
    print(f"Calculating {factor_name}...")
    scratch = ["_log_circ_mv"]
    try:
        df["_log_circ_mv"] = np.log1p(df["circ_mv"])
        df[factor_name] = df.groupby("trade_date")["_log_circ_mv"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in scratch if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
def add_financial_factor(
    main_df: pd.DataFrame,
    financial_df: pd.DataFrame,
    ts_code_col: str = "ts_code",
    trade_date_col: str = "trade_date",
    ann_date_col: str = "ann_date",  # 公告日期 (fallback)
    f_ann_date_col: str = "f_ann_date",  # 实际公告日期 (preferred)
    factor_value_col: str = "undist_profit_ps",
    new_factor_col_name: str = "retained_profit_per_share",
) -> pd.DataFrame:
    """
    Attach the most recently announced financial metric to every trading day.

    For each (ts_code, trade_date) row in main_df, pd.merge_asof picks the
    financial_df row of the same stock whose effective announcement date
    (f_ann_date_col preferred, ann_date_col as fallback) is the latest one
    <= trade_date.

    Args:
        main_df: time-series frame; must contain ts_code_col and trade_date_col.
        financial_df: financial metrics; must contain ts_code_col,
            factor_value_col, and one of f_ann_date_col / ann_date_col.
        ts_code_col / trade_date_col / ann_date_col / f_ann_date_col:
            column-name configuration.
        factor_value_col: column in financial_df holding the metric
            (default 'undist_profit_ps', matching the signature).
        new_factor_col_name: name of the factor column in the result.

    Returns:
        A NEW DataFrame: main_df (sorted by date, then code) plus the factor
        column. Neither input DataFrame is modified.

    Raises:
        ValueError: if required columns are missing.
    """
    # --- validation ---
    required_main_cols = [ts_code_col, trade_date_col]
    if not all(col in main_df.columns for col in required_main_cols):
        raise ValueError(f"主 DataFrame 必须包含列: {required_main_cols}")
    required_financial_cols = [ts_code_col, factor_value_col]
    if f_ann_date_col and f_ann_date_col in financial_df.columns:
        effective_date_col = f_ann_date_col
    elif ann_date_col and ann_date_col in financial_df.columns:
        effective_date_col = ann_date_col
    else:
        raise ValueError(
            f"财务指标 DataFrame 必须包含列 '{f_ann_date_col}' 或 '{ann_date_col}' 作为数据生效日期"
        )
    required_financial_cols.append(effective_date_col)
    if not all(col in financial_df.columns for col in required_financial_cols):
        raise ValueError(f"财务指标 DataFrame 必须包含列: {required_financial_cols}")
    # --- preprocessing (on copies, so callers' frames stay untouched) ---
    main_df = main_df.copy()
    main_df[trade_date_col] = pd.to_datetime(main_df[trade_date_col])
    main_df[ts_code_col] = main_df[ts_code_col].astype(str)
    # Bug fix: the previous version converted dtypes directly on the caller's
    # financial_df; take the needed subset first and convert the copy.
    financial_data_subset = financial_df[
        [ts_code_col, effective_date_col, factor_value_col]
    ].copy()
    financial_data_subset[effective_date_col] = pd.to_datetime(
        financial_data_subset[effective_date_col]
    )
    financial_data_subset[ts_code_col] = financial_data_subset[ts_code_col].astype(str)
    # merge_asof requires the on-key to be globally monotonic even when `by`
    # is given, so sort by the date first (code only as a tiebreaker).
    # Sorting by [code, date] raises "left keys must be sorted" with >1 stock.
    main_df = main_df.sort_values(by=[trade_date_col, ts_code_col])
    financial_data_subset = financial_data_subset.sort_values(
        by=[effective_date_col, ts_code_col]
    )
    # --- as-of merge: latest announcement on or before each trade date ---
    df_with_factor = pd.merge_asof(
        main_df,
        financial_data_subset,
        left_on=trade_date_col,
        right_on=effective_date_col,
        by=ts_code_col,
        direction="backward",
    )
    # Drop the matching date column brought in from the right side.
    if (
        effective_date_col in df_with_factor.columns
        and effective_date_col != trade_date_col
    ):
        df_with_factor = df_with_factor.drop(columns=[effective_date_col])
    # Rename the merged metric column to the requested factor name.
    if factor_value_col != new_factor_col_name:
        if factor_value_col in df_with_factor.columns:
            df_with_factor = df_with_factor.rename(
                columns={factor_value_col: new_factor_col_name}
            )
        else:
            print(
                f"警告: 合并后未找到列 '{factor_value_col}',无法重命名为 '{new_factor_col_name}'。"
            )
    return df_with_factor
# --- ARBR 因子计算函数 ---
def calculate_arbr(df: pd.DataFrame, N: int = 26):
    """
    Compute the AR and BR sentiment indicators and add them to ``df`` in place.

    Over a rolling N-day window per stock:
        AR = 100 * sum(high - open) / sum(open - low)
        BR = 100 * sum(max(0, high - prev_close)) / sum(max(0, prev_close - low))
    An "AR_BR" spread column (AR - BR) is added as well. Windows with fewer
    than N observations yield NaN (min_periods=N); zero denominators yield NaN
    via _safe_divide. On missing columns or unexpected errors the AR/BR
    columns are filled with NaN instead of raising.

    Args:
        df (pd.DataFrame): must contain 'ts_code', 'trade_date', 'open',
            'high', 'low', 'close'; should already be sorted by
            (ts_code, trade_date) so the rolling windows are chronological.
        N (int): AR/BR window length, default 26.

    WARNING: this function modifies the input DataFrame 'df' in place.
    """
    ar_col_name = "AR"
    br_col_name = "BR"
    print(f"开始计算因子: {ar_col_name}, {br_col_name} (原地修改)...")
    _temp_cols = []  # helper columns created below; dropped in `finally`
    try:
        # 0. Sorting is assumed (groupby copes either way, but ordered rows
        #    are what makes the rolling sums meaningful).
        # df.sort_values(['ts_code', 'trade_date'], inplace=True)
        # 1. Intermediate terms of the AR/BR formulas.
        df["_h_minus_o"] = df["high"] - df["open"]
        df["_o_minus_l"] = df["open"] - df["low"]
        df["_prev_close"] = df.groupby("ts_code")["close"].shift(1)
        # BR uses max(0, H - prev_close) and max(0, prev_close - L).
        df["_h_minus_pc_pos"] = np.maximum(0, df["high"] - df["_prev_close"])
        df["_pc_minus_l_pos"] = np.maximum(0, df["_prev_close"] - df["low"])
        _temp_cols.extend(
            [
                "_h_minus_o",
                "_o_minus_l",
                "_prev_close",
                "_h_minus_pc_pos",
                "_pc_minus_l_pos",
            ]
        )
        # 2. Rolling sums. min_periods=N demands a complete window; relax to
        #    e.g. N // 2 if partial windows are acceptable.
        min_p = N  # strictly require N days of data
        grouped = df.groupby("ts_code")
        sum_h_minus_o = (
            grouped["_h_minus_o"]
            .rolling(N, min_periods=min_p)
            .sum()
            .reset_index(level=0, drop=True)
        )
        sum_o_minus_l = (
            grouped["_o_minus_l"]
            .rolling(N, min_periods=min_p)
            .sum()
            .reset_index(level=0, drop=True)
        )
        sum_h_minus_pc_pos = (
            grouped["_h_minus_pc_pos"]
            .rolling(N, min_periods=min_p)
            .sum()
            .reset_index(level=0, drop=True)
        )
        sum_pc_minus_l_pos = (
            grouped["_pc_minus_l_pos"]
            .rolling(N, min_periods=min_p)
            .sum()
            .reset_index(level=0, drop=True)
        )
        # 3. AR and BR.
        df[ar_col_name] = (
            _safe_divide(sum_h_minus_o, sum_o_minus_l, default_val=np.nan) * 100
        )  # AR is conventionally scaled by 100
        df[br_col_name] = (
            _safe_divide(sum_h_minus_pc_pos, sum_pc_minus_l_pos, default_val=np.nan)
            * 100
        )  # BR is conventionally scaled by 100
        df[f"{ar_col_name}_{br_col_name}"] = df[ar_col_name] - df[br_col_name]
        print(f"因子 {ar_col_name}, {br_col_name} 计算成功。")
    except KeyError as e:
        print(f"错误: 计算 ARBR 时缺少必需的列: {e}")
        print(f"将为因子 {ar_col_name}, {br_col_name} 填充 NaN。")
        if ar_col_name not in df.columns:
            df[ar_col_name] = np.nan
        if br_col_name not in df.columns:
            df[br_col_name] = np.nan
    except Exception as e:
        print(f"错误: 计算 ARBR 时发生意外错误: {e}")
        print(f"将为因子 {ar_col_name}, {br_col_name} 填充 NaN。")
        if ar_col_name not in df.columns:
            df[ar_col_name] = np.nan
        if br_col_name not in df.columns:
            df[br_col_name] = np.nan
    finally:
        # 4. Drop the helper columns whether or not the computation succeeded.
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"因子 {ar_col_name}, {br_col_name} 计算流程结束。")
def add_financial_factor(
    main_df: pd.DataFrame,
    financial_df: pd.DataFrame,
    factor_value_col: str,  # name of the financial-indicator column to attach
    ts_code_col: str = "ts_code",
    trade_date_col: str = "trade_date",
    ann_date_col: str = "ann_date",  # announcement date (fallback key)
    f_ann_date_col: str = "f_ann_date",  # actual announcement date (preferred key)
) -> pd.DataFrame:
    """
    Attach a financial-statement indicator to a daily time-series frame as a
    point-in-time factor.

    For each (stock, trade date) row, pd.merge_asof picks the most recent
    financial record whose announcement date is on or before the trade date
    (direction='backward'), so no look-ahead information leaks in. If
    `factor_value_col` already exists in `main_df`, the frame is returned
    unchanged. Rows of `financial_df` with a null code or announcement date
    are dropped first; if nothing remains, or the merge fails, the factor
    column is filled with NaN instead of raising.

    Args:
        main_df: daily frame with at least `ts_code_col` and `trade_date_col`.
        financial_df: indicator frame with `ts_code_col`, `factor_value_col`
            and one of `f_ann_date_col` / `ann_date_col`.
        factor_value_col: indicator column to attach; the output column keeps
            this same name.
        ts_code_col: stock-code column name shared by both frames.
        trade_date_col: trade-date column in `main_df`.
        ann_date_col: announcement-date column (fallback).
        f_ann_date_col: actual announcement-date column (used when present).

    Returns:
        `main_df` (a copy) with the factor column added.

    Raises:
        ValueError: when required columns are missing from either frame.
    """
    if factor_value_col in main_df.columns:
        return main_df
    new_factor_col_name = factor_value_col
    # --- validation ---
    required_main_cols = [ts_code_col, trade_date_col]
    if not all(col in main_df.columns for col in required_main_cols):
        raise ValueError(f"主 DataFrame 必须包含列: {required_main_cols}")
    required_financial_cols = [ts_code_col, factor_value_col]
    if f_ann_date_col and f_ann_date_col in financial_df.columns:
        effective_date_col = f_ann_date_col
        print(f"使用 '{f_ann_date_col}' 作为财务数据生效日期。")
    elif ann_date_col and ann_date_col in financial_df.columns:
        effective_date_col = ann_date_col
        print(f"使用 '{ann_date_col}' 作为财务数据生效日期。")
    else:
        raise ValueError(
            f"财务指标 DataFrame 必须包含列 '{f_ann_date_col}' 或 '{ann_date_col}' 作为数据生效日期"
        )
    required_financial_cols.append(effective_date_col)
    if not all(col in financial_df.columns for col in required_financial_cols):
        raise ValueError(f"财务指标 DataFrame 必须包含列: {required_financial_cols}")
    # --- preparation and cleaning ---
    # Work on copies so the caller's frames are untouched
    # (also avoids SettingWithCopyWarning).
    main_df = main_df.copy()
    financial_df = financial_df.copy()
    main_df[trade_date_col] = pd.to_datetime(main_df[trade_date_col], errors="coerce")
    financial_df[effective_date_col] = pd.to_datetime(
        financial_df[effective_date_col], errors="coerce"
    )
    # Align the merge-key dtype on both sides.
    main_df[ts_code_col] = main_df[ts_code_col].astype(str)
    financial_df[ts_code_col] = financial_df[ts_code_col].astype(str)
    # Keep only the columns needed for the as-of merge.
    financial_data_subset = financial_df[
        [ts_code_col, effective_date_col, factor_value_col]
    ].copy()
    # Drop rows whose merge keys are null — merge_asof rejects them.
    initial_rows_financial = len(financial_data_subset)
    financial_data_subset = financial_data_subset.dropna(
        subset=[ts_code_col, effective_date_col]
    )
    rows_dropped = initial_rows_financial - len(financial_data_subset)
    if rows_dropped > 0:
        print(
            f"警告: 从 financial_data_subset 中移除了 {rows_dropped} 行,因为其 '{ts_code_col}' 或 '{effective_date_col}' 列存在空值。"
        )
    if financial_data_subset.empty:
        print(
            f"警告: 清理空值后 financial_data_subset 为空,无法添加因子 '{new_factor_col_name}'。将填充 NaN。"
        )
        main_df[new_factor_col_name] = np.nan
        return main_df
    # merge_asof with a `by` column requires both frames globally sorted on
    # the date key, hence date-first ordering (not ts_code-first).
    # main_df = main_df.sort_values(by=[ts_code_col, trade_date_col])
    # financial_data_subset = financial_data_subset.sort_values(by=[ts_code_col, effective_date_col])
    main_df = main_df.sort_values(by=[trade_date_col, ts_code_col])
    financial_data_subset = financial_data_subset.sort_values(
        by=[effective_date_col, ts_code_col]
    )
    # --- as-of merge ---
    try:
        df_with_factor = pd.merge_asof(
            main_df,
            financial_data_subset,
            left_on=trade_date_col,
            right_on=effective_date_col,
            by=ts_code_col,
            direction="backward",
        )
    except Exception as e:
        print(f"merge_asof 执行失败: {e}")
        # Degrade gracefully: emit an all-NaN factor column.
        main_df[new_factor_col_name] = np.nan
        return main_df
    # --- cleanup and renaming ---
    # Drop the right-hand date key when it survived under a distinct name.
    if (
        effective_date_col in df_with_factor.columns
        and effective_date_col != trade_date_col
    ):
        df_with_factor = df_with_factor.drop(columns=[effective_date_col])
    # Rename the attached column to the target name.
    # NOTE: new_factor_col_name == factor_value_col by construction, so this
    # branch is currently never taken; kept for future divergence.
    if factor_value_col != new_factor_col_name:
        if factor_value_col in df_with_factor.columns:
            df_with_factor = df_with_factor.rename(
                columns={factor_value_col: new_factor_col_name}
            )
        else:
            # Should not happen: merge_asof carries non-key right columns over.
            print(f"警告: 合并后未找到原始因子列 '{factor_value_col}',无法重命名。")
    # Safety net: ensure the factor column exists even if the merge produced
    # something unexpected. (With equal names, only the elif below can run.)
    if new_factor_col_name not in df_with_factor.columns:
        # Target name missing too — the merge likely failed or names clash.
        df_with_factor[new_factor_col_name] = np.nan
    # Dead branch: same condition as the `if` above, so it never fires.
    elif new_factor_col_name not in df_with_factor.columns:
        print(f"警告: 合并后未找到目标因子列 '{new_factor_col_name}'。填充 NaN。")
        df_with_factor[new_factor_col_name] = np.nan
    return df_with_factor
def calculate_cashflow_to_ev_factor(
    df: pd.DataFrame,
    cashflow_df: pd.DataFrame,
    balancesheet_df: pd.DataFrame,
    market_cap_col: str = "total_mv",
    date_col: str = "trade_date",
    ts_code_col: str = "ts_code",
) -> pd.DataFrame:
    """
    Compute operating-cash-flow (intended TTM) divided by enterprise value.

    Enterprise value = total market cap + total liabilities - cash holdings.
    The market-cap column is multiplied by 10,000 before use, i.e. it is
    assumed to be quoted in units of 1e4 while the balance-sheet figures are
    in raw currency — TODO confirm against the data feed.

    IMPORTANT: this relies on add_financial_factor for the point-in-time
    merge. If add_financial_factor attaches only the latest single-quarter
    value, 'n_cashflow_act' is NOT a trailing-twelve-month figure and the
    resulting factor is correspondingly inaccurate.

    Args:
        df: main frame with market data (must hold `market_cap_col`),
            dates and stock codes.
        cashflow_df: Tushare cash-flow-statement data.
        balancesheet_df: Tushare balance-sheet data.
        market_cap_col: total-market-cap column name, default 'total_mv'.
        date_col: date column name, default 'trade_date'
            (not referenced directly in this function).
        ts_code_col: stock-code column name, default 'ts_code'
            (not referenced directly in this function).

    Returns:
        pd.DataFrame: a copy of `df` with a 'cashflow_to_ev_factor' column
        (all-NaN when any required input is missing).
    """
    df_factor = df.copy()  # work on a copy; the caller's frame stays intact
    # 0. The market-cap column is mandatory.
    if market_cap_col not in df_factor.columns:
        print(f"错误: DataFrame 中缺少市值列 '{market_cap_col}'。无法计算因子。")
        # Degrade gracefully with an all-NaN factor column.
        df_factor["cashflow_to_ev_factor"] = np.nan
        return df_factor
    # 1. Attach operating cash flow (intended TTM — see caveat above).
    df_factor = add_financial_factor(df_factor, cashflow_df, "n_cashflow_act")
    # Rename the attached column to signal the TTM expectation, even though
    # only a point-in-time value may actually be present.
    cashflow_col_name = "n_cashflow_act"  # column attached above
    ttm_cashflow_col = "ttm_n_cashflow_act"  # name used in the computation
    if cashflow_col_name in df_factor.columns:
        df_factor = df_factor.rename(columns={cashflow_col_name: ttm_cashflow_col})
    else:
        # add_financial_factor failed to attach the column.
        print(f"错误: add_financial_factor 未能成功添加 '{cashflow_col_name}' 列。")
        df_factor["cashflow_to_ev_factor"] = np.nan
        return df_factor
    # 2. Attach total liabilities.
    df_factor = add_financial_factor(df_factor, balancesheet_df, "total_liab")
    liab_col_name = "total_liab"
    if liab_col_name not in df_factor.columns:
        print(f"错误: add_financial_factor 未能成功添加 '{liab_col_name}' 列。")
        df_factor["cashflow_to_ev_factor"] = np.nan
        return df_factor
    # 3. Attach cash holdings.
    df_factor = add_financial_factor(df_factor, balancesheet_df, "money_cap")
    money_col_name = "money_cap"
    if money_col_name not in df_factor.columns:
        print(f"错误: add_financial_factor 未能成功添加 '{money_col_name}' 列。")
        df_factor["cashflow_to_ev_factor"] = np.nan
        return df_factor
    # 4. Enterprise value. NaN in any component propagates to NaN here.
    enterprise_value = (
        df_factor[market_cap_col].astype(float) * 10000
        + df_factor[liab_col_name].astype(float)
        - df_factor[money_col_name].astype(float)
    )
    # 5. Final factor: operating cash flow / enterprise value (safe division).
    df_factor["cashflow_to_ev_factor"] = _safe_divide(
        df_factor[ttm_cashflow_col], enterprise_value
    )
    # 6. Drop the temporarily attached statement columns.
    cols_to_drop = [ttm_cashflow_col, liab_col_name, money_col_name]
    df_factor = df_factor.drop(
        columns=[col for col in cols_to_drop if col in df_factor.columns]
    )
    return df_factor
def caculate_book_to_price_ratio(
    df: pd.DataFrame, fina_indicator_df: pd.DataFrame
) -> pd.DataFrame:
    """Add 'book_to_price_ratio' (= bps / close) and drop the helper 'bps' column."""
    # Pull per-share book value from the financial-indicator table only when
    # the caller has not already merged it in.
    if "bps" not in df.columns:
        df = add_financial_factor(df, fina_indicator_df, factor_value_col="bps")
    ratio = df["bps"] / df["close"]
    df["book_to_price_ratio"] = ratio
    return df.drop(columns=["bps"])
def turnover_rate_n(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """Add the n-day rolling mean of turnover_rate per stock as 'turnover_rate_mean_{n}'."""
    rolling_mean = (
        df.groupby("ts_code", group_keys=False)["turnover_rate"].rolling(n).mean()
    )
    # Strip the ts_code level added by the grouped rolling so the series
    # lines back up with df's row index.
    df[f"turnover_rate_mean_{n}"] = rolling_mean.reset_index(level=0, drop=True)
    return df
def variance_n(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """Add the n-day rolling variance of daily returns per stock as 'variance_{n}'."""
    grouped_returns = df.groupby("ts_code", group_keys=False)["pct_chg"]
    rolling_var = grouped_returns.rolling(n).var()
    # Strip the ts_code index level so the result aligns with df's rows.
    df[f"variance_{n}"] = rolling_var.reset_index(level=0, drop=True)
    return df
def bbi_ratio_factor(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add 'bbi_ratio_factor' = BBI / close, where BBI is the mean of the
    3/6/12/24-day simple moving averages of the close price, per stock.
    """
    result = df
    # Close prices captured before sorting, mirroring the original flow
    # (the division below relies on index alignment, not row order).
    prices = result["close"]
    # Chronological order within each stock is required for rolling windows.
    result = result.sort_values(by=["ts_code", "trade_date"])
    by_stock = result.groupby("ts_code", group_keys=False)

    def _sma(window):
        # Flatten the (ts_code, row) index produced by the grouped rolling.
        return by_stock["close"].rolling(window).mean().reset_index(level=0, drop=True)

    print("计算 BBI...")
    # Any window still short of data yields NaN, which propagates to BBI.
    bbi = (_sma(3) + _sma(6) + _sma(12) + _sma(24)) / 4
    result["bbi_ratio_factor"] = _safe_divide(bbi, prices)
    return result
def limit_factor(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add limit-up/limit-down derived factors to ``df`` in place.

    Columns added:
      - cat_up_limit / cat_down_limit: whether the close hit the price limit (bool)
      - up_limit_count_10d / down_limit_count_10d: number of limit hits in the
        trailing 10 sessions, per stock
      - consecutive_up_limit: length of the current run of consecutive
        limit-up days (0 when the latest day is not limit-up)

    Requires 'ts_code', 'close', 'up_limit', 'down_limit' columns; rows should
    be sorted by (ts_code, trade_date) for the rolling windows to make sense.

    Fix vs the previous version: the inline helper computed the identical
    run-length series twice and returned a duplicate tuple (the "down" half
    was dead code); it is replaced by a single run-length helper.
    """
    df["cat_up_limit"] = df["close"] == df["up_limit"]
    df["cat_down_limit"] = df["close"] == df["down_limit"]
    grouped = df.groupby("ts_code", group_keys=False)
    df["up_limit_count_10d"] = (
        grouped["cat_up_limit"]
        .rolling(window=10, min_periods=1)
        .sum()
        .reset_index(level=0, drop=True)
    )
    df["down_limit_count_10d"] = (
        grouped["cat_down_limit"]
        .rolling(window=10, min_periods=1)
        .sum()
        .reset_index(level=0, drop=True)
    )

    def _consecutive_run_length(flags: pd.Series) -> pd.Series:
        # Label each run of identical values, count the position within the
        # run (1-based), then zero out positions where the flag is False.
        run_id = (flags != flags.shift()).cumsum()
        return flags * (flags.groupby(run_id).cumcount() + 1)

    # Length of the current streak of consecutive limit-up days per stock.
    df["consecutive_up_limit"] = grouped["cat_up_limit"].apply(
        _consecutive_run_length
    )
    return df
import pandas as pd
import numpy as np
# 假设 df 已经加载并包含 'ts_code', 'trade_date', 'pct_chg' 列
# 并且已经按照 'ts_code' 和 'trade_date' 进行了排序
def daily_momentum_benchmark(df):
    """
    Compute daily market momentum benchmarks from the existing 'pct_chg' column.

    For every trading day, the positive benchmark is the mean pct_chg over all
    stocks that rose (pct_chg > 0) and the negative benchmark the mean over all
    stocks that fell (pct_chg < 0). Both are merged back onto every row; days
    with no rising (falling) stock get a benchmark of 0.

    Args:
        df (pd.DataFrame): daily stock data with 'ts_code', 'trade_date' and
            'pct_chg' columns, sorted by 'ts_code' and 'trade_date'.

    Returns:
        pd.DataFrame: input with 'daily_positive_benchmark' and
        'daily_negative_benchmark' columns added; returned unchanged when
        'pct_chg' is missing.
    """
    print("--- 计算日级别动量基准 (使用 pct_chg) ---")
    if "pct_chg" not in df.columns:
        print("错误: DataFrame中没有'pct_chg'列,无法计算日级别动量基准。")
        return df
    # Per-day cross-sectional means of the positive / negative movers.
    daily_benchmarks = (
        df.groupby("trade_date")["pct_chg"]
        .agg(
            daily_positive_benchmark=lambda x: x[x > 0].mean(),
            daily_negative_benchmark=lambda x: x[x < 0].mean(),
        )
        .reset_index()
    )
    df = pd.merge(df, daily_benchmarks, on="trade_date", how="left")
    # Direct assignment instead of the chained `.fillna(..., inplace=True)`
    # pattern, which is deprecated and silently ineffective under pandas
    # copy-on-write.
    df["daily_positive_benchmark"] = df["daily_positive_benchmark"].fillna(0)
    df["daily_negative_benchmark"] = df["daily_negative_benchmark"].fillna(0)
    print("日级别动量基准计算完成 (使用 pct_chg)。")
    return df
def daily_deviation(df):
    """
    Compute each stock's daily deviation from the same-direction market benchmark.

    A rising stock (pct_chg > 0) is compared against the positive benchmark and
    a falling one against the negative benchmark; every other case (flat days,
    missing same-direction benchmark) yields 0. The benchmark columns are
    produced via daily_momentum_benchmark and dropped again before returning.
    """
    print("--- 计算日级别偏离度 (使用 pct_chg) ---")
    df = daily_momentum_benchmark(df)
    required_cols = ["pct_chg", "daily_positive_benchmark", "daily_negative_benchmark"]
    if not all(col in df.columns for col in required_cols):
        print(
            f"错误: 计算日级别偏离度需要以下列: {required_cols}。请先运行 daily_momentum_benchmark(df)。"
        )
        return df
    # Same-direction comparison only; everything else defaults to 0.
    rising = (df["pct_chg"] > 0) & (df["daily_positive_benchmark"] > 0)
    falling = (df["pct_chg"] < 0) & (df["daily_negative_benchmark"] < 0)
    gaps = [
        df["pct_chg"] - df["daily_positive_benchmark"],
        df["pct_chg"] - df["daily_negative_benchmark"],
    ]
    df["daily_deviation"] = np.select([rising, falling], gaps, default=0)
    df = df.drop(columns=["daily_positive_benchmark", "daily_negative_benchmark"])
    print("日级别偏离度计算完成 (使用 pct_chg)。")
    return df
def daily_industry_momentum_benchmark(df):
    """
    Compute per-industry daily momentum benchmarks from 'pct_chg' and the
    industry code 'cat_l2_code'.

    For each (trade_date, cat_l2_code) pair, the positive benchmark is the mean
    pct_chg over the industry's rising stocks and the negative benchmark the
    mean over its falling stocks; pairs with no rising (falling) stock get 0.

    Args:
        df (pd.DataFrame): daily data with 'ts_code', 'trade_date', 'pct_chg'
            and 'cat_l2_code' columns, sorted by 'ts_code' and 'trade_date'.

    Returns:
        pd.DataFrame: input with 'daily_industry_positive_benchmark' and
        'daily_industry_negative_benchmark' columns added; returned unchanged
        when a required column is missing.
    """
    print("--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---")
    required_cols = ["pct_chg", "cat_l2_code", "trade_date", "ts_code"]
    if not all(col in df.columns for col in required_cols):
        print(f"错误: 计算日级别行业动量基准需要以下列: {required_cols}。")
        return df
    # Per-day, per-industry means of the positive / negative movers.
    industry_daily_benchmarks = (
        df.groupby(["trade_date", "cat_l2_code"])["pct_chg"]
        .agg(
            daily_industry_positive_benchmark=lambda x: x[x > 0].mean(),
            daily_industry_negative_benchmark=lambda x: x[x < 0].mean(),
        )
        .reset_index()
    )
    df = pd.merge(
        df, industry_daily_benchmarks, on=["trade_date", "cat_l2_code"], how="left"
    )
    # Direct assignment instead of the chained `.fillna(..., inplace=True)`
    # pattern, which is deprecated and silently ineffective under pandas
    # copy-on-write.
    df["daily_industry_positive_benchmark"] = df[
        "daily_industry_positive_benchmark"
    ].fillna(0)
    df["daily_industry_negative_benchmark"] = df[
        "daily_industry_negative_benchmark"
    ].fillna(0)
    print("日级别行业动量基准计算完成 (使用 pct_chg 和 cat_l2_code)。")
    return df
def daily_industry_deviation(df):
    """
    Compute each stock's daily deviation from its industry's same-direction
    momentum benchmark.

    A rising stock is measured against the industry's positive benchmark and a
    falling one against the negative benchmark; all other cases yield 0. The
    benchmark columns are created via daily_industry_momentum_benchmark and
    removed again before returning.
    """
    print("--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---")
    df = daily_industry_momentum_benchmark(df)
    required_cols = [
        "pct_chg",
        "daily_industry_positive_benchmark",
        "daily_industry_negative_benchmark",
    ]
    if not all(col in df.columns for col in required_cols):
        print(
            f"错误: 计算日级别行业偏离度需要以下列: {required_cols}。请先运行 daily_industry_momentum_benchmark(df)。"
        )
        return df
    # Same-direction comparison only; everything else defaults to 0.
    rising = (df["pct_chg"] > 0) & (df["daily_industry_positive_benchmark"] > 0)
    falling = (df["pct_chg"] < 0) & (df["daily_industry_negative_benchmark"] < 0)
    gaps = [
        df["pct_chg"] - df["daily_industry_positive_benchmark"],
        df["pct_chg"] - df["daily_industry_negative_benchmark"],
    ]
    df["daily_industry_deviation"] = np.select([rising, falling], gaps, default=0)
    df = df.drop(
        columns=[
            "daily_industry_positive_benchmark",
            "daily_industry_negative_benchmark",
        ]
    )
    print("日级别行业偏离度计算完成 (使用 pct_chg 和行业基准)。")
    return df
def sentiment_panic_greed_index(
    df: pd.DataFrame,
    window_atr: int = 14,
    window_smooth: int = 5,
    factor_name: str = "senti_panic_greed",
):
    """
    Compute a market panic/greed sentiment index (in-place).

    Raw signal: (true range / ATR) signed by the day's direction, plus twice
    the overnight gap; the result is smoothed with a `window_smooth`-day
    rolling mean.

    Fixes vs the previous version:
      * `epsilon` is now a local constant — the code referenced a name that
        does not exist at module scope, so every call raised a NameError that
        the broad `except` swallowed, leaving the factor all-NaN;
      * the missing-column branch returns `df` like the normal path instead
        of `None`.

    WARNING: Modifies df in-place.
    NOTE(review): TRANGE/ATR are computed over the whole frame, not per
    ts_code — with a multi-stock frame, values at stock boundaries mix
    adjacent stocks. Confirm the caller passes one instrument at a time.
    """
    print(f"Calculating {factor_name}...")
    epsilon = 1e-8  # guards divisions by a zero/NaN ATR
    _temp_cols = [
        "_prev_close",
        "_atr",
        "_true_range",
        "_upper_shadow",
        "_lower_shadow",
        "_body",
        "_gap",
        "_volatility_surprise",
    ]
    if not all(col in df.columns for col in ["open", "high", "low", "close", "vol"]):
        print(f"Error: DataFrame 缺少必需的 OHLCV 列。将为 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
        return df
    try:
        df["_prev_close"] = df["close"].shift(1)
        # True range and its ATR smoothing.
        df["_true_range"] = talib.TRANGE(df["high"], df["low"], df["_prev_close"])
        df["_atr"] = talib.ATR(
            df["high"], df["low"], df["_prev_close"], timeperiod=window_atr
        )
        # Candle anatomy: shadows and real body.
        df["_upper_shadow"] = df["high"] - np.maximum(df["open"], df["close"])
        df["_lower_shadow"] = np.minimum(df["open"], df["close"]) - df["low"]
        df["_body"] = np.abs(df["close"] - df["open"])
        # Overnight gap as a fraction of the previous close.
        df["_gap"] = (df["open"] / df["_prev_close"] - 1).fillna(0)
        # Volatility surprise: how far today's true range exceeds the recent
        # ATR, signed by the day's direction (kept for downstream inspection).
        df["_volatility_surprise"] = (
            df["_true_range"] / (df["_atr"] + epsilon) - 1
        ) * np.sign(df["pct_chg"].fillna(0))
        # Raw sentiment: relative true range signed by direction, plus the
        # gap with doubled weight.
        raw_senti = (df["_true_range"] / (df["_atr"] + epsilon)) * np.sign(
            df["pct_chg"].fillna(0)
        ) + df["_gap"] * 2
        df[factor_name] = raw_senti.rolling(window_smooth, min_periods=1).mean()
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
    return df
def sentiment_market_breadth_proxy(
    df: pd.DataFrame,
    window_vol: int = 20,
    window_smooth: int = 3,
    factor_name: str = "senti_breadth_proxy",
):
    """
    Market-breadth sentiment proxy based on price/volume agreement (in-place).

    raw = pct_chg * (vol / rolling_mean(vol, window_vol)); the raw signal is
    smoothed with a `window_smooth`-day rolling mean.

    Fixes vs the previous version:
      * `epsilon` is now a local constant — the code referenced a name that
        does not exist at module scope, so every call raised a NameError that
        the broad `except` swallowed, leaving the factor all-NaN;
      * the missing-column branch returns `df` like the normal path instead
        of `None`.

    WARNING: Modifies df in-place.
    """
    print(f"Calculating {factor_name}...")
    epsilon = 1e-8  # guards the division when the rolling volume mean is 0
    _temp_cols = ["_rolling_avg_vol"]
    if not all(col in df.columns for col in ["pct_chg", "vol"]):
        print(
            f"Error: DataFrame 缺少 'pct_chg' 或 'vol' 列。将为 {factor_name} 填充 NaN。"
        )
        df[factor_name] = np.nan
        return df
    try:
        df["_rolling_avg_vol"] = (
            df["vol"].rolling(window_vol, min_periods=max(1, window_vol // 2)).mean()
        )
        # Price/volume agreement: the day's move weighted by relative volume.
        raw_breadth = df["pct_chg"] * (df["vol"] / (df["_rolling_avg_vol"] + epsilon))
        df[factor_name] = raw_breadth.rolling(
            window_smooth, min_periods=1
        ).mean()  # smooth the raw signal
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
    return df
def sentiment_reversal_indicator(
    df: pd.DataFrame,
    window_ret: int = 5,
    window_vol: int = 5,
    factor_name: str = "senti_reversal",
):
    """
    Short-horizon sentiment-reversal factor (modifies df in-place).

    factor = -(window_ret-day cumulative return) * (window_vol-day realized
    volatility of pct_chg): recent strong moves on high volatility are bet
    against. Temporary helper columns are always removed, even on failure.
    """
    print(f"Calculating {factor_name}...")
    _temp_cols = ["_return_M", "_volatility_M"]
    if "pct_chg" not in df.columns:
        print(f"Error: DataFrame 缺少 'pct_chg' 列。将为 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
        return
    try:
        # Cumulative return over the lookback window, from the close ratio.
        shifted_close = df["close"].shift(window_ret)
        df["_return_M"] = df["close"] / shifted_close - 1
        # Realized volatility of daily returns over the volatility window.
        min_obs = max(1, window_vol // 2)
        df["_volatility_M"] = (
            df["pct_chg"].rolling(window_vol, min_periods=min_obs).std()
        )
        df[factor_name] = -(df["_return_M"] * df["_volatility_M"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [c for c in _temp_cols if c in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
    print(f"Finished {factor_name}.")
    return df
def price_minus_deduction_price(df, n=10):
    """
    Quantitative factor: close minus the "deduction price" — the oldest price
    still inside the n-period moving-average window (close shifted by n-1
    rows, per stock).

    A positive value means today's price exceeds the observation about to
    drop out of the MA window, so the MA is biased upward next period if the
    price holds.

    Args:
        df (pd.DataFrame): daily data with 'ts_code' and 'close' columns.
        n (int): moving-average period; the deduction price lags n-1 rows.

    Returns:
        pd.DataFrame: df with a 'price_minus_deduction_price_{n}' column.
    """
    col = f"price_minus_deduction_price_{n}"
    if "close" not in df.columns:
        print("错误: DataFrame中没有'close'列,无法计算抵扣价相关因子。")
        return df
    if n <= 1:
        print("错误: 均线周期 n 必须大于 1。")
        df[col] = np.nan
        return df
    # For SMA(n) at time t the window is P_{t-n+1}..P_t; the value about to
    # leave the window is the close shifted by n-1 rows within each stock.
    lagged = df.groupby("ts_code", group_keys=False)["close"].shift(n - 1)
    df[col] = df["close"] - lagged
    print(f"因子 price_minus_deduction_price_{n} 计算完成。")
    return df
def price_deduction_price_diff_ratio_to_sma(df, n=10):
    """
    Quantitative factor: (close - deduction price) expressed as a fraction of
    the n-period SMA.

    Scales the raw deduction-price gap by the moving-average level so the
    factor is comparable across price regimes. Rows where the SMA is exactly
    zero (or where inputs are NaN) yield NaN.

    Args:
        df (pd.DataFrame): daily data with 'ts_code' and 'close' columns.
        n (int): moving-average period; the deduction price lags n-1 rows.

    Returns:
        pd.DataFrame: df with a 'price_deduction_price_diff_ratio_to_sma_{n}'
        column.
    """
    col = f"price_deduction_price_diff_ratio_to_sma_{n}"
    if "close" not in df.columns:
        print("错误: DataFrame中没有'close'列,无法计算抵扣价相关因子。")
        return df
    if n <= 1:
        print("错误: 均线周期 n 必须大于 1。")
        df[col] = np.nan
        return df
    by_stock = df.groupby("ts_code", group_keys=False)
    sma = by_stock["close"].transform(lambda s: s.rolling(window=n).mean())
    gap = df["close"] - by_stock["close"].shift(n - 1)
    # Divide only where the SMA is non-zero; untouched slots stay NaN.
    ratio = np.full_like(gap, np.nan)
    np.divide(gap, sma, out=ratio, where=(sma != 0))
    df[col] = ratio
    print(f"因子 price_deduction_price_diff_ratio_to_sma_{n} 计算完成。")
    return df
def cat_price_vs_sma_vs_deduction_price(df, n=10):
    """
    Categorical factor describing where the current close and the deduction
    price sit relative to the n-period SMA.

    Categories:
        0: insufficient data (SMA or deduction price is NaN)
        1: close > SMA and deduction price > SMA (both above)
        2: close < SMA and deduction price < SMA (both below)
        3: close > SMA and deduction price <= SMA (possible upward cross)
        4: close <= SMA and deduction price > SMA (possible downward cross)

    Args:
        df (pd.DataFrame): daily data with 'ts_code' and 'close' columns.
        n (int): moving-average period; the deduction price lags n-1 rows.

    Returns:
        pd.DataFrame: df with a 'cat_price_vs_sma_vs_deduction_price_{n}'
        column.
    """
    col = f"cat_price_vs_sma_vs_deduction_price_{n}"
    if "close" not in df.columns:
        print("错误: DataFrame中没有'close'列,无法计算抵扣价相关因子。")
        return df
    if n <= 1:
        print("错误: 均线周期 n 必须大于 1。")
        df[col] = np.nan
        return df
    by_stock = df.groupby("ts_code", group_keys=False)
    sma = by_stock["close"].transform(lambda s: s.rolling(window=n).mean())
    deduction = by_stock["close"].shift(n - 1)
    close = df["close"]
    # NaN inputs make every comparison False, so those rows fall to the
    # default category 0 (comparisons kept explicit — no negations — so NaN
    # handling stays intact).
    conditions = [
        (close > sma) & (deduction > sma),
        (close < sma) & (deduction < sma),
        (close > sma) & (deduction <= sma),
        (close <= sma) & (deduction > sma),
    ]
    df[col] = np.select(conditions, [1, 2, 3, 4], default=0)
    print(f"因子 cat_price_vs_sma_vs_deduction_price_{n} 计算完成。")
    return df
def cat_is_on_top_list(df: pd.DataFrame, top_list: pd.DataFrame) -> pd.DataFrame:
    """
    Derive 'cat_is_on_top_list' from the encoded dragon-tiger-list reason:
    the reason code is kept only on days the stock also closed up
    (pct_chg > 0); otherwise the factor is 0.

    `top_list` is accepted for signature parity but not used here; the reason
    codes must already be present as the 'cat_reason' column (see cat_reason).
    """
    if "cat_reason" not in df.columns:
        print("计算因子cat_is_on_top_list失败缺少cat_reason列")
        return df
    rising_mask = (df["pct_chg"] > 0).astype(int)
    df["cat_is_on_top_list"] = df["cat_reason"] * rising_mask
    return df
def cat_reason(df: pd.DataFrame, top_list: pd.DataFrame) -> pd.DataFrame:
    """
    Encode the dragon-tiger-list 'reason' text as a numeric factor column.

    Each distinct reason string is mapped to a stable positive integer code
    (1-based, in first-appearance order); (stock, day) pairs absent from the
    list get 0.

    Fix vs the previous version: `top_list` is no longer mutated — a
    'cat_reason' column and a trade_date dtype change used to leak back into
    the caller's frame.

    Args:
        df (pd.DataFrame): full stock panel with 'ts_code' and 'trade_date'
            (its 'trade_date' column is coerced to datetime in place, as
            before).
        top_list (pd.DataFrame): daily list data with 'ts_code', 'trade_date'
            and 'reason' columns.

    Returns:
        pd.DataFrame: df merged with the integer 'cat_reason' column.
    """
    # 1-based dense code per distinct reason, in first-appearance order.
    unique_reasons = top_list["reason"].unique()
    reason_mapping = {reason: i + 1 for i, reason in enumerate(unique_reasons)}
    # Build a slim lookup table instead of writing into the caller's top_list.
    top_list_slim = top_list[["ts_code", "trade_date"]].copy()
    top_list_slim["cat_reason"] = (
        top_list["reason"].map(reason_mapping).astype("Int64")
    )
    top_list_slim["trade_date"] = pd.to_datetime(top_list_slim["trade_date"])
    # Align the merge-key dtype on the left side.
    df["trade_date"] = pd.to_datetime(df["trade_date"])
    merged_df = df.merge(top_list_slim, on=["ts_code", "trade_date"], how="left")
    # Stocks/days not on the list get code 0.
    merged_df["cat_reason"] = merged_df["cat_reason"].fillna(0).astype("int")
    return merged_df
def ts_volatility_slope_20_5(df: pd.DataFrame) -> pd.DataFrame:
    """
    Factor: 5-day linear-regression slope of the 20-day rolling standard
    deviation of daily returns ('pct_chg'), per stock.

    Args:
        df (pd.DataFrame): frame with 'ts_code', 'trade_date' and 'pct_chg'.

    Returns:
        pd.DataFrame: df with the new 'ts_volatility_slope_20_5' column (the
        rolling-volatility helper column is removed again).
    """
    print(f"计算因子 ts_volatility_slope_20_5")
    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    def _ols_slope(window):
        # A single observation has no defined slope.
        if len(window) < 2:
            return 0
        xs = np.arange(len(window))
        return linregress(xs, window)[0]

    # 20-day volatility per stock (partial windows allowed).
    df["volatility_20"] = (
        df.groupby("ts_code")["pct_chg"]
        .rolling(window=20, min_periods=1)
        .std()
        .reset_index(level=0, drop=True)
    )
    # Slope of that volatility over the trailing 5 sessions.
    df["ts_volatility_slope_20_5"] = (
        df.groupby("ts_code")["volatility_20"]
        .rolling(window=5, min_periods=2)
        .apply(_ols_slope)
        .reset_index(level=0, drop=True)
    )
    df.drop(columns=["volatility_20"], inplace=True)
    return df
def ts_turnover_rate_acceleration_5_20(df: pd.DataFrame) -> pd.DataFrame:
    """
    Factor: short-term (5-day) minus long-term (20-day) mean turnover rate
    per stock — a simple turnover "acceleration" measure.

    Args:
        df (pd.DataFrame): frame with 'ts_code', 'trade_date' and
            'turnover_rate' columns.

    Returns:
        pd.DataFrame: df with the new 'ts_turnover_rate_acceleration_5_20'
        column.
    """
    print(f"计算因子 ts_turnover_rate_acceleration_5_20")
    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)
    by_stock = df.groupby("ts_code")["turnover_rate"]
    fast = by_stock.rolling(window=5, min_periods=1).mean().reset_index(level=0, drop=True)
    slow = by_stock.rolling(window=20, min_periods=1).mean().reset_index(level=0, drop=True)
    df["ts_turnover_rate_acceleration_5_20"] = fast - slow
    return df
def ts_vol_sustain_10_30(df: pd.DataFrame) -> pd.DataFrame:
    """
    Factor: share of the last 10 sessions whose volume exceeded the stock's
    own 30-day average volume.

    Improvement vs the previous version: the per-group DataFrame
    ``groupby(...).apply`` that mutated each group (deprecated pattern in
    pandas >= 2.2, and slow) is replaced by vectorized grouped rollings with
    identical output; the helper column never touches the frame.

    Args:
        df (pd.DataFrame): frame with 'ts_code', 'trade_date' and 'vol'.

    Returns:
        pd.DataFrame: df with the new 'ts_vol_sustain_10_30' column.
    """
    print(f"计算因子 ts_vol_sustain_10_30")
    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)
    # 30-day average volume per stock (partial windows allowed).
    long_avg = (
        df.groupby("ts_code")["vol"]
        .rolling(window=30, min_periods=1)
        .mean()
        .reset_index(level=0, drop=True)
    )
    # Fraction of the trailing 10 sessions with above-average volume, per stock.
    above = df["vol"] > long_avg
    df["ts_vol_sustain_10_30"] = (
        above.groupby(df["ts_code"])
        .rolling(window=10, min_periods=1)
        .mean()
        .reset_index(level=0, drop=True)
    )
    return df
def cs_turnover_rate_relative_strength_20(df: pd.DataFrame) -> pd.DataFrame:
    """
    Factor: cross-sectional dense rank (descending) of each stock's turnover
    ratio versus its own 20-day average, per trading day. Rank 1 is the
    strongest relative turnover on that day.

    Improvement vs the previous version: two frame-mutating
    ``groupby(...).apply`` passes (deprecated pattern in pandas >= 2.2) are
    replaced by ``transform`` plus a grouped ``rank`` — same values, no
    temporary columns, rows stay sorted by (ts_code, trade_date).

    Args:
        df (pd.DataFrame): frame with 'ts_code', 'trade_date' and
            'turnover_rate'.

    Returns:
        pd.DataFrame: df with the new 'cs_turnover_rate_relative_strength_20'
        column.
    """
    print(f"计算因子 cs_turnover_rate_relative_strength_20")
    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)
    # Today's turnover relative to the stock's own trailing 20-day mean.
    avg_20 = df.groupby("ts_code")["turnover_rate"].transform(
        lambda s: s.rolling(window=20, min_periods=1).mean()
    )
    ratio = df["turnover_rate"] / avg_20
    # Cross-sectional dense rank per day; highest ratio -> rank 1.
    df["cs_turnover_rate_relative_strength_20"] = ratio.groupby(
        df["trade_date"]
    ).rank(method="dense", ascending=False)
    return df
def cs_amount_outlier_10(df: pd.DataFrame) -> pd.DataFrame:
    """
    Factor: cross-sectional z-score, per trading day, of (today's amount minus
    the stock's own 10-day average amount).

    Semantics preserved from the original apply-based version: a day whose
    cross-sectional std is exactly zero yields 0 for every stock; a
    single-stock day yields NaN (sample std of one observation).

    Improvement vs the previous version: two frame-mutating
    ``groupby(...).apply`` passes (deprecated pattern in pandas >= 2.2) are
    replaced by ``transform``-based vectorized standardization.

    Args:
        df (pd.DataFrame): frame with 'ts_code', 'trade_date' and 'amount'.

    Returns:
        pd.DataFrame: df with the new 'cs_amount_outlier_10' column.
    """
    print(f"计算因子 cs_amount_outlier_10")
    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)
    # Today's amount minus the stock's own trailing 10-day mean.
    avg_10 = df.groupby("ts_code")["amount"].transform(
        lambda s: s.rolling(window=10, min_periods=1).mean()
    )
    diff = df["amount"] - avg_10
    # Daily cross-sectional standardization (sample std, ddof=1 — same as
    # Series.std in the original).
    daily = diff.groupby(df["trade_date"])
    mean_d = daily.transform("mean")
    std_d = daily.transform("std")
    df["cs_amount_outlier_10"] = np.where(std_d == 0, 0.0, (diff - mean_d) / std_d)
    return df
def ts_ff_to_total_turnover_ratio(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ratio of free-float turnover rate to total turnover rate.

    Args:
        df (pd.DataFrame): must contain 'turnover_rate'; should also contain
            the free-float turnover column 'turnover_rate_f' (Tushare naming
            — TODO confirm against the upstream data feed).

    Returns:
        pd.DataFrame: same frame with a new 'ts_ff_to_total_turnover_ratio'
            column. Falls back to the old degenerate behaviour (~1.0) when no
            free-float column is present, to stay backward compatible.
    """
    print(f"计算因子 ts_ff_to_total_turnover_ratio")
    # BUG FIX: the original divided turnover_rate by itself, so the factor was
    # always ~1.0 and carried no information. Use the free-float column when
    # it exists.
    ff_col = 'turnover_rate_f' if 'turnover_rate_f' in df.columns else 'turnover_rate'
    df['ts_ff_to_total_turnover_ratio'] = df[ff_col] / (df['turnover_rate'] + 1e-8)  # 避免除零
    return df
def ts_price_volume_trend_coherence_5_20(df: pd.DataFrame) -> pd.DataFrame:
    """
    Product of (share of up-days over the last 5 sessions) and (share of the
    last 5 sessions whose volume exceeds the 20-day average volume).

    Args:
        df (pd.DataFrame): must contain 'ts_code', 'trade_date', 'close' and
            'vol' columns.

    Returns:
        pd.DataFrame: same frame with a new
            'ts_price_volume_trend_coherence_5_20' column.
    """
    print(f"计算因子 ts_price_volume_trend_coherence_5_20")
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    df.sort_values(['ts_code', 'trade_date'], inplace=True)
    # Share of the last 5 sessions with a positive close-to-close change.
    df['price_up_ratio'] = df.groupby('ts_code')['close'].transform(
        lambda s: (s.diff() > 0).rolling(window=5, min_periods=1).mean())
    # 20-day average volume, per stock.
    df['vol_avg_20'] = df.groupby('ts_code')['vol'].transform(
        lambda s: s.rolling(window=20, min_periods=1).mean())
    # BUG FIX: the original applied the 5-day rolling mean to the ungrouped
    # boolean series, so the window leaked across stock boundaries whenever
    # consecutive rows belonged to different ts_codes. Roll within each stock.
    df['vol_above_flag'] = (df['vol'] > df['vol_avg_20']).astype(float)
    df['vol_above_avg'] = df.groupby('ts_code')['vol_above_flag'].transform(
        lambda s: s.rolling(window=5, min_periods=1).mean())
    df['ts_price_volume_trend_coherence_5_20'] = df['price_up_ratio'] * df['vol_above_avg']
    df.drop(columns=['price_up_ratio', 'vol_avg_20', 'vol_above_flag', 'vol_above_avg'], inplace=True)
    return df
def ts_turnover_rate_trend_strength_5(df: pd.DataFrame) -> pd.DataFrame:
    """
    Linear-regression slope of the turnover rate over the trailing 5 sessions.

    Args:
        df (pd.DataFrame): must contain 'ts_code', 'trade_date' and
            'turnover_rate' columns.

    Returns:
        pd.DataFrame: same frame with a new 'ts_turnover_rate_trend_strength_5'
            column (NaN until a window holds at least 2 observations).
    """
    print(f"计算因子 ts_turnover_rate_trend_strength_5")
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    df.sort_values(['ts_code', 'trade_date'], inplace=True)
    def turnover_slope(window):
        # BUG FIX: the original called scipy's linregress without importing it,
        # raising NameError on first use. Compute the ordinary least-squares
        # slope directly: cov(x, y) / var(x).
        y = np.asarray(window, dtype=float)
        if len(y) < 2:
            return 0.0
        x = np.arange(len(y))
        x_centered = x - x.mean()
        denom = (x_centered ** 2).sum()
        return float((x_centered * (y - y.mean())).sum() / denom)
    df['ts_turnover_rate_trend_strength_5'] = (
        df.groupby('ts_code')['turnover_rate']
        .rolling(window=5, min_periods=2)
        .apply(turnover_slope, raw=True)
        .reset_index(level=0, drop=True)
    )
    return df
def ts_ff_turnover_rate_surge_10(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ratio of today's free-float turnover rate to its trailing 10-day mean.

    Args:
        df (pd.DataFrame): must contain 'ts_code', 'trade_date' and
            'turnover_rate' columns.

    Returns:
        pd.DataFrame: same frame with a new 'ts_ff_turnover_rate_surge_10'
            column.
    """
    print(f"计算因子 ts_ff_turnover_rate_surge_10")
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    df.sort_values(['ts_code', 'trade_date'], inplace=True)
    # Per-stock trailing 10-day mean turnover (window shrinks at the start).
    baseline = df.groupby('ts_code')['turnover_rate'].transform(
        lambda s: s.rolling(window=10, min_periods=1).mean())
    # Small constant in the denominator guards against division by zero.
    df['ts_ff_turnover_rate_surge_10'] = df['turnover_rate'] / (baseline + 1e-8)
    return df
# --- Factor 1: 近期积极动量与成交量激增 (简化版催化剂代理) ---
def cat_senti_mom_vol_spike(df_input: pd.DataFrame,
                            return_period: int = 3,
                            return_threshold: float = 0.05,
                            volume_ratio_threshold: float = 1.5,
                            current_pct_chg_min: float = -0.01,
                            current_pct_chg_max: float = 0.03,
                            factor_name: str = 'cat_senti_mom_vol_spike') -> pd.DataFrame:
    """
    Recent positive momentum plus a volume spike (simplified catalyst proxy).

    Idea: a significant positive N-day return, clearly elevated volume, and a
    flat-to-mildly-up current session. The factor is a 3-character '0'/'1'
    string: momentum flag + volume flag + current-price flag.

    Args:
        df_input: DataFrame with 'ts_code', 'close', 'vol', 'pct_chg' and
            optionally a precomputed 'volume_ratio' column.
        return_period: lookback (days) for the momentum return.
        return_threshold: minimum N-day return for the momentum flag.
        volume_ratio_threshold: minimum volume ratio for the volume flag.
        current_pct_chg_min/current_pct_chg_max: open interval for today's
            pct_chg to count as "flat to mildly up".
        factor_name: name of the output column.

    Returns:
        df_input with the string-valued column `factor_name` added
        (set to 0 if a required column is missing or an error occurs).
    """
    df = df_input
    print(f"Calculating {factor_name}...")
    # BUG FIX: `epsilon` was referenced below but never defined in this scope
    # (the only definition in the module is local to another function), so the
    # volume-ratio-proxy path raised NameError.
    epsilon = 1e-8
    _temp_cols = []
    try:
        # 1. N-day return (computed only if not already present).
        return_col = f'_return_{return_period}d'
        if return_col not in df.columns:
            df[return_col] = df.groupby('ts_code')['close'].pct_change(periods=return_period)
            # Only mark for cleanup when we created it, so a caller-supplied
            # column of the same name is not destroyed.
            _temp_cols.append(return_col)
        # 2. Use 'volume_ratio' when available; otherwise fall back to a
        #    today's-volume / 5-day-average-volume proxy.
        if 'volume_ratio' not in df.columns:
            print(f"Warning: 'volume_ratio' column not found. Calculating a proxy for {factor_name}.")
            df['_avg_vol_5d'] = df.groupby('ts_code')['vol'].rolling(window=5, min_periods=1).mean().reset_index(level=0, drop=True)
            df['_volume_ratio_proxy'] = df['vol'] / (df['_avg_vol_5d'] + epsilon)
            volume_metric_col = '_volume_ratio_proxy'
            _temp_cols.extend(['_avg_vol_5d', '_volume_ratio_proxy'])
        else:
            volume_metric_col = 'volume_ratio'
        # Individual condition flags (NaN comparisons evaluate to False).
        cond_momentum = df[return_col] > return_threshold
        cond_volume = df[volume_metric_col] > volume_ratio_threshold
        cond_current_price = (df['pct_chg'] > current_pct_chg_min) & (df['pct_chg'] < current_pct_chg_max)
        df[factor_name] = (cond_momentum.astype(int).astype(str) + cond_volume.astype(int).astype(str) + cond_current_price.astype(int).astype(str))
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    except Exception as e:
        print(f"An unexpected error occurred in {factor_name}: {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    finally:
        # Drop intermediate columns created above.
        df.drop(columns=[col for col in _temp_cols if col in df.columns], inplace=True, errors='ignore')
        print(f"Finished {factor_name}.")
    return df
# --- Factor 2: 强主力资金流入信号(未实现) ---
def calculate_strong_inflow_signal(df_input: pd.DataFrame,
                                    intensity_avg_N: int = 3,
                                    intensity_threshold: float = 0.01,  # assumed reasonable level for flow_lg_elg_intensity -- TODO confirm against data
                                    consecutive_buy_N: int = 2,
                                    accel_positive_M: int = 1,
                                    factor_name: str = 'senti_strong_inflow') -> pd.DataFrame:
    """
    Strong institutional (large / extra-large order) inflow signal.

    Idea: large-order money has been flowing in persistently and significantly
    over recent days, with signs of acceleration. All three conditions below
    must hold for the flag to be 1.

    Depends on: 'flow_lg_elg_intensity' and 'flow_lg_elg_accel' already being
    present in the DataFrame (computed earlier in this module).

    Args:
        df_input: DataFrame with 'ts_code' and the two flow-factor columns.
        intensity_avg_N: lookback (days) for the mean inflow intensity.
        intensity_threshold: minimum average intensity to count as "strong".
        consecutive_buy_N: required run length of net-buy days.
        accel_positive_M: required run length of positive-acceleration days.
        factor_name: name of the output 0/1 column.

    Returns:
        df_input with the binary column `factor_name` added (all zeros when
        required columns are missing or an error occurs).
    """
    df = df_input
    print(f"Calculating {factor_name}...")
    _temp_cols = []
    required_flow_cols = ['flow_lg_elg_intensity', 'flow_lg_elg_accel']
    if not all(col in df.columns for col in required_flow_cols):
        missing = [col for col in required_flow_cols if col not in df.columns]
        print(f"Error: DataFrame 缺少必需的资金流因子列: {missing} for {factor_name}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
        return df
    try:
        # 1. Mean large-order flow intensity over the last N days.
        avg_intensity_col = f'_avg_flow_intensity_{intensity_avg_N}d'
        df[avg_intensity_col] = df.groupby('ts_code')['flow_lg_elg_intensity'].rolling(window=intensity_avg_N, min_periods=1).mean().reset_index(level=0, drop=True)
        _temp_cols.append(avg_intensity_col)
        cond_avg_intensity = df[avg_intensity_col] > intensity_threshold
        # 2. Run of consecutive net-buy days (proxy: flow_lg_elg_intensity > 0).
        # Could also use lg_elg_net_buy_vol > 0 if that column exists.
        df['_lg_elg_is_net_buy'] = (df['flow_lg_elg_intensity'] > 0).astype(int)  # alternatively threshold on absolute volume
        _temp_cols.append('_lg_elg_is_net_buy')
        # Flag = 1 only when every day in the window was a net-buy day.
        def count_consecutive_positive(series):
            return series.rolling(window=consecutive_buy_N, min_periods=consecutive_buy_N).apply(lambda x: x.sum() == consecutive_buy_N, raw=True)
        df['_consecutive_buy_days_flag'] = df.groupby('ts_code')['_lg_elg_is_net_buy'].apply(count_consecutive_positive).reset_index(level=0, drop=True).fillna(0)
        _temp_cols.append('_consecutive_buy_days_flag')
        cond_consecutive_buy = df['_consecutive_buy_days_flag'] == 1
        # 3. Flow acceleration positive on each of the last M days.
        df['_accel_is_positive'] = (df['flow_lg_elg_accel'] > 0).astype(int)
        _temp_cols.append('_accel_is_positive')
        def check_all_positive_recent_M(series):
            return series.rolling(window=accel_positive_M, min_periods=accel_positive_M).apply(lambda x: x.sum() == accel_positive_M, raw=True)
        df['_accel_positive_M_flag'] = df.groupby('ts_code')['_accel_is_positive'].apply(check_all_positive_recent_M).reset_index(level=0, drop=True).fillna(0)
        _temp_cols.append('_accel_positive_M_flag')
        cond_accel_positive = df['_accel_positive_M_flag'] == 1
        # All three conditions must hold simultaneously.
        df[factor_name] = (cond_avg_intensity & cond_consecutive_buy & cond_accel_positive).astype(int)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    except Exception as e:
        print(f"An unexpected error occurred in {factor_name}: {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    finally:
        # Drop all intermediate columns created above.
        df.drop(columns=[col for col in _temp_cols if col in df.columns], inplace=True, errors='ignore')
        print(f"Finished {factor_name}.")
    return df
# --- Factor 3: 突破前盘整模式 ---
def cat_senti_pre_breakout(df_input: pd.DataFrame,
                           atr_short_N: int = 10,
                           atr_long_M: int = 40,
                           vol_atrophy_N: int = 10,  # window for the short-term average volume
                           vol_atrophy_M: int = 40,  # window for the long-term average volume
                           price_stab_N: int = 5,
                           price_stab_threshold: float = 0.05,
                           current_pct_chg_min_signal: float = 0.005,  # today up at least 0.5%
                           current_pct_chg_max_signal: float = 0.07,   # today up at most 7% (avoid chasing big moves)
                           volume_ratio_signal_threshold: float = 1.2,
                           factor_name: str = 'cat_senti_pre_breakout') -> pd.DataFrame:
    """
    Pre-breakout consolidation pattern.

    Idea: volatility contraction + volume atrophy + recent price stability,
    followed by a mildly-up, moderately-higher-volume session today. The
    factor is a 4-character '0'/'1' string of those four condition flags.

    Args:
        df_input: DataFrame with 'ts_code', 'high', 'low', 'close', 'vol',
            'pct_chg' and optionally 'volume_ratio' / precomputed ATR columns.
        atr_short_N / atr_long_M: ATR windows for the contraction test.
        vol_atrophy_N / vol_atrophy_M: average-volume windows.
        price_stab_N: window for the high-low range stability test.
        price_stab_threshold: max (range / close) to count as stable.
        current_pct_chg_min_signal / current_pct_chg_max_signal: open interval
            for today's pct_chg signal.
        volume_ratio_signal_threshold: minimum volume ratio for the signal.
        factor_name: name of the output column.

    Returns:
        df_input with the string-valued column `factor_name` added
        (set to 0 if a required column is missing or an error occurs).
    """
    df = df_input
    print(f"Calculating {factor_name}...")
    # BUG FIX: `epsilon` was referenced below but never defined in this scope
    # (the only definition in the module is local to another function).
    epsilon = 1e-8
    _temp_cols = []
    try:
        # 1. Volatility contraction (short-term ATR well below long-term ATR).
        atr_short_col = f'atr_{atr_short_N}'
        atr_long_col = f'atr_{atr_long_M}'
        for N, col_name in [(atr_short_N, atr_short_col), (atr_long_M, atr_long_col)]:
            if col_name not in df.columns:
                print(f"Calculating {col_name} as it's missing...")
                # TA-Lib wants numpy arrays per stock; the groupby/apply below
                # is slow on large frames — prefer precomputing ATR upstream.
                temp_atr = df.groupby('ts_code', group_keys=False).apply(
                    lambda x: pd.Series(talib.ATR(x['high'].values, x['low'].values, x['close'].values, timeperiod=N), index=x.index)
                )
                df[col_name] = temp_atr
                # BUG FIX: only schedule for cleanup when computed here, so
                # caller-supplied ATR columns are not silently deleted.
                _temp_cols.append(col_name)
        cond_vol_contraction = df[atr_short_col] < (0.7 * df[atr_long_col])
        # 2. Volume atrophy (short-term average volume well below long-term).
        avg_vol_short_col = f'_avg_vol_{vol_atrophy_N}'
        avg_vol_long_col = f'_avg_vol_{vol_atrophy_M}'
        df[avg_vol_short_col] = df.groupby('ts_code')['vol'].rolling(window=vol_atrophy_N, min_periods=1).mean().reset_index(level=0, drop=True)
        df[avg_vol_long_col] = df.groupby('ts_code')['vol'].rolling(window=vol_atrophy_M, min_periods=1).mean().reset_index(level=0, drop=True)
        _temp_cols.extend([avg_vol_short_col, avg_vol_long_col])
        cond_vol_atrophy = df[avg_vol_short_col] < (0.7 * df[avg_vol_long_col])
        # 3. Recent price stability: N-day high-low range small vs the close.
        rolling_max_h_col = f'_rolling_max_h_{price_stab_N}'
        rolling_min_l_col = f'_rolling_min_l_{price_stab_N}'
        df[rolling_max_h_col] = df.groupby('ts_code')['high'].rolling(window=price_stab_N, min_periods=1).max().reset_index(level=0, drop=True)
        df[rolling_min_l_col] = df.groupby('ts_code')['low'].rolling(window=price_stab_N, min_periods=1).min().reset_index(level=0, drop=True)
        _temp_cols.extend([rolling_max_h_col, rolling_min_l_col])
        cond_price_stability = ((df[rolling_max_h_col] - df[rolling_min_l_col]) / (df['close'] + epsilon)) < price_stab_threshold
        # 4. Today's signal: mild rise on moderately elevated volume.
        if 'volume_ratio' not in df.columns:
            print(f"Warning: 'volume_ratio' column not found for {factor_name}. Using a proxy.")
            # Proxy: today's volume > 1.2x the short-term average volume.
            if avg_vol_short_col not in df.columns:  # make sure the short average exists
                df[avg_vol_short_col] = df.groupby('ts_code')['vol'].rolling(window=vol_atrophy_N, min_periods=1).mean().reset_index(level=0, drop=True)
            cond_vol_signal = df['vol'] > (1.2 * df[avg_vol_short_col])
        else:
            cond_vol_signal = df['volume_ratio'] > volume_ratio_signal_threshold
        cond_price_signal = (df['pct_chg'] > current_pct_chg_min_signal) & (df['pct_chg'] < current_pct_chg_max_signal)
        cond_current_day_signal = cond_price_signal & cond_vol_signal
        df[factor_name] = (cond_vol_contraction.astype(int).astype(str) + cond_vol_atrophy.astype(int).astype(str) + cond_price_stability.astype(int).astype(str) + cond_current_day_signal.astype(int).astype(str))
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    except Exception as e:
        print(f"An unexpected error occurred in {factor_name}: {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    finally:
        df.drop(columns=[col for col in _temp_cols if col in df.columns], inplace=True, errors='ignore')
        print(f"Finished {factor_name}.")
    return df