3576 lines
140 KiB
Python
3576 lines
140 KiB
Python
import numpy as np
|
||
import pandas as pd
|
||
import talib
|
||
|
||
|
||
|
||
def get_rolling_factor(df):
    """Add per-stock rolling, money-flow, chip-distribution and TA-Lib factors.

    The input is copied by sorting on ``["ts_code", "trade_date"]``; the
    caller's frame is not modified. The frame index must be unique because
    intermediate results are aligned back by index label.

    Columns read include: ``ts_code``, ``trade_date``, ``open``, ``high``,
    ``low``, ``close``, ``vol``, ``pct_chg``, ``turnover_rate``,
    ``volume_ratio``, ``circ_mv``, ``winner_rate``, ``weight_avg``,
    ``cost_5pct``/``15``/``50``/``85``/``95`` and the ``buy_*_vol`` /
    ``sell_*_vol`` money-flow columns.

    Behaviour notes (differences from the previous revision):

    * ``cat_volume_breakout`` / ``cat_volume_ratio_breakout`` now compare
      today's value with the rolling 5-day maximum *as of the previous
      trading day*. Comparing with a window that contains today's value can
      never be true.
    * ``cat_turnover_spike`` now uses the previous day's 3-day mean/std. With
      a 3-point window that includes the current value, the sample z-score
      is bounded by (n-1)/sqrt(n) ~= 1.155, so ``> mean + 2*std`` was always
      False.
    * Multi-column per-stock calculations are stitched with ``pd.concat``
      instead of ``DataFrameGroupBy.apply`` so a frame holding a single
      ``ts_code`` works, and every groupby is created fresh so columns added
      along the way are always visible.
    * The debugging ``print(df.columns)`` and an unused column list are gone.

    Returns:
        tuple: ``(df, new_columns)`` where ``new_columns`` lists, in frame
        order, the columns that were not present in the input.
    """
    old_columns = list(df.columns)

    # Sort by stock and date so every stock's rows are contiguous and ordered.
    df = df.sort_values(by=["ts_code", "trade_date"])
    stock_codes = df["ts_code"]
    epsilon = 1e-8

    # ------------------------------------------------------------------
    # Local helpers
    # ------------------------------------------------------------------
    def by_stock():
        # A fresh groupby each time: never rely on an object built before
        # later columns were added to df.
        return df.groupby("ts_code", group_keys=False)

    def by_date():
        return df.groupby("trade_date", group_keys=False)

    def flat(rolled):
        # groupby(...).rolling(...) prepends ts_code as index level 0;
        # drop it so the result aligns with df's own index.
        return rolled.reset_index(level=0, drop=True)

    def roll(column, window, **kwargs):
        return by_stock()[column].rolling(window, **kwargs)

    def lag_one(series):
        # Previous trading day's value within the same stock.
        return series.groupby(stock_codes).shift(1)

    def per_stock(func):
        # Run func on each stock's sub-frame and concatenate the Series.
        pieces = [func(group) for _, group in df.groupby("ts_code", sort=False)]
        if not pieces:
            return pd.Series(index=df.index, dtype="float64")
        return pd.concat(pieces)

    def ta(func, group, *columns, **kwargs):
        # TA-Lib only accepts float64 arrays.
        arrays = [group[col].to_numpy(dtype="float64") for col in columns]
        return pd.Series(func(*arrays, **kwargs), index=group.index)

    # ------------------------------------------------------------------
    # Money-flow factors
    # ------------------------------------------------------------------
    # 1. Net large + extra-large buying, and its intensity relative to volume.
    df["lg_elg_net_buy_vol"] = (
        df["buy_lg_vol"] + df["buy_elg_vol"] - df["sell_lg_vol"] - df["sell_elg_vol"]
    )
    df["flow_lg_elg_intensity"] = df["lg_elg_net_buy_vol"] / (df["vol"] + epsilon)

    # 2. Retail vs. institutional divergence (difference and ratio forms).
    df["sm_net_buy_vol"] = df["buy_sm_vol"] - df["sell_sm_vol"]
    df["flow_divergence_diff"] = df["sm_net_buy_vol"] - df["lg_elg_net_buy_vol"]
    # The sign-aware epsilon keeps the denominator away from zero (avoids 0/0).
    df["flow_divergence_ratio"] = df["sm_net_buy_vol"] / (
        df["lg_elg_net_buy_vol"] + np.sign(df["lg_elg_net_buy_vol"]) * epsilon + epsilon
    )

    # 3. Flow structure change: one-day change in the large-order share of buys.
    df["total_buy_vol"] = df["buy_sm_vol"] + df["buy_lg_vol"] + df["buy_elg_vol"]
    df["lg_elg_buy_prop"] = (df["buy_lg_vol"] + df["buy_elg_vol"]) / (
        df["total_buy_vol"] + epsilon
    )
    df["flow_struct_buy_change"] = by_stock()["lg_elg_buy_prop"].diff(1)

    # 4. Flow acceleration: second difference of net large flow.
    df["lg_elg_net_buy_vol_change"] = by_stock()["lg_elg_net_buy_vol"].diff(1)
    df["flow_lg_elg_accel"] = by_stock()["lg_elg_net_buy_vol_change"].diff(1)

    # ------------------------------------------------------------------
    # Chip-distribution factors
    # ------------------------------------------------------------------
    # 6. Chip concentration: 95%-5% cost range normalised by price.
    df["chip_concentration_range"] = (df["cost_95pct"] - df["cost_5pct"]) / (
        df["close"] + epsilon
    )

    # 7. Skewness proxy: weighted-average cost vs. median cost.
    df["chip_skewness"] = (df["weight_avg"] - df["cost_50pct"]) / (
        df["cost_50pct"] + epsilon
    )

    # 8. Floating-chip proxy: winner rate times the (non-negative) distance
    #    of price above the 15% cost line.
    price_dist_cost15 = (df["close"] - df["cost_15pct"]) / (df["close"] + epsilon)
    df["floating_chip_proxy"] = df["winner_rate"] * np.maximum(0, price_dist_cost15)

    # 9. Cost-support change: percentage change of the 15% cost line.
    df["cost_support_15pct_change"] = by_stock()["cost_15pct"].pct_change(1) * 100

    # 10. Winner-rate / price-position zone (first matching rule wins).
    #     1: high & mostly winners, 2: low & mostly losers,
    #     3: above median & majority winners, 4: below median & majority
    #     losers, 0: anything else.
    zone_rules = [
        (df["close"] > df["cost_85pct"]) & (df["winner_rate"] > 0.8),
        (df["close"] < df["cost_15pct"]) & (df["winner_rate"] < 0.2),
        (df["close"] > df["cost_50pct"]) & (df["winner_rate"] > 0.5),
        (df["close"] < df["cost_50pct"]) & (df["winner_rate"] < 0.5),
    ]
    df["cat_winner_price_zone"] = np.select(zone_rules, [1, 2, 3, 4], default=0)

    # ------------------------------------------------------------------
    # Combined flow / chip factors
    # ------------------------------------------------------------------
    # 11. Net large flow, kept only while price sits between the 15% and 50%
    #     cost lines.
    price_near_low_support = (df["close"] > df["cost_15pct"]) & (
        df["close"] < df["cost_50pct"]
    )
    df["flow_chip_consistency"] = df["lg_elg_net_buy_vol"] * price_near_low_support.astype(int)

    # 12. Net large flow while winner_rate > 0.7: positive = absorption,
    #     negative = profit taking.
    high_winner_rate_flag = (df["winner_rate"] > 0.7).astype(int)
    df["profit_taking_vs_absorb"] = df["lg_elg_net_buy_vol"] * high_winner_rate_flag

    # ------------------------------------------------------------------
    # Upside / downside volatility over a 20-day window
    # ------------------------------------------------------------------
    window = 20
    min_obs = max(1, window // 2)
    gains = df["pct_chg"] > 0
    losses = df["pct_chg"] < 0

    df["_is_positive"] = gains.astype(int)
    df["_is_negative"] = losses.astype(int)
    df["cat_is_positive"] = gains.astype(int)
    # Returns on the "other side" are zeroed so rolling sums only see one side.
    df["_pos_returns"] = df["pct_chg"].where(gains, 0)
    df["_neg_returns"] = df["pct_chg"].where(losses, 0)
    df["_pos_returns_sq"] = np.square(df["_pos_returns"])
    df["_neg_returns_sq"] = np.square(df["_neg_returns"])

    def one_sided_vol(flag_col, ret_col, sq_col):
        # Var = E[X^2] - E[X]^2 over the days counted by flag_col; requires at
        # least two such days, and is clipped at 0 against rounding error.
        count = roll(flag_col, window, min_periods=min_obs).sum()
        total = roll(ret_col, window, min_periods=min_obs).sum()
        total_sq = roll(sq_col, window, min_periods=min_obs).sum()
        variance = total_sq / count - np.square(total / count)
        variance = variance.where(count >= 2, np.nan).clip(lower=0)
        return flat(np.sqrt(variance))

    df["upside_vol"] = one_sided_vol("_is_positive", "_pos_returns", "_pos_returns_sq")
    df["downside_vol"] = one_sided_vol("_is_negative", "_neg_returns", "_neg_returns_sq")
    df["vol_ratio"] = (
        (df["upside_vol"] / df["downside_vol"])
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

    df["return_skew"] = flat(roll("pct_chg", 5).skew())
    df["return_kurtosis"] = flat(roll("pct_chg", 5).kurt())

    # ------------------------------------------------------------------
    # Volume / turnover factors
    # ------------------------------------------------------------------
    # Short-term volume change: 2-day mean over 10-day mean, minus one.
    df["volume_change_rate"] = flat(roll("vol", 2).mean() / roll("vol", 10).mean() - 1)

    # Volume breakout: today's volume above the prior 5-day high.
    df["cat_volume_breakout"] = df["vol"] > lag_one(flat(roll("vol", 5).max()))

    # Turnover z-score inside a 3-day window.
    mean_turnover = flat(roll("turnover_rate", 3).mean())
    std_turnover = flat(roll("turnover_rate", 3).std())
    df["turnover_deviation"] = (df["turnover_rate"] - mean_turnover) / std_turnover

    # Turnover spike: more than 2 std above the prior window's mean.
    df["cat_turnover_spike"] = df["turnover_rate"] > (
        lag_one(mean_turnover) + 2 * lag_one(std_turnover)
    )

    df["avg_volume_ratio"] = flat(roll("volume_ratio", 3).mean())
    df["cat_volume_ratio_breakout"] = df["volume_ratio"] > lag_one(
        flat(roll("volume_ratio", 5).max())
    )

    # NOTE: despite its name, vol_spike is the 20-day mean volume.
    df["vol_spike"] = flat(roll("vol", 20).mean())
    vol_pct_change = by_stock()["vol"].pct_change()
    df["vol_std_5"] = flat(vol_pct_change.groupby(stock_codes).rolling(window=5).std())

    # ------------------------------------------------------------------
    # TA-Lib indicators (computed stock by stock)
    # ------------------------------------------------------------------
    df["atr_14"] = per_stock(
        lambda g: ta(talib.ATR, g, "high", "low", "close", timeperiod=14)
    )
    df["atr_6"] = per_stock(
        lambda g: ta(talib.ATR, g, "high", "low", "close", timeperiod=6)
    )
    df["obv"] = per_stock(lambda g: ta(talib.OBV, g, "close", "vol"))
    df["maobv_6"] = per_stock(lambda g: ta(talib.SMA, g, "obv", timeperiod=6))
    df["rsi_3"] = per_stock(lambda g: ta(talib.RSI, g, "close", timeperiod=3))

    # ------------------------------------------------------------------
    # Returns and return volatility
    # ------------------------------------------------------------------
    df["return_5"] = df["close"] / by_stock()["close"].shift(5) - 1
    df["return_20"] = df["close"] / by_stock()["close"].shift(20) - 1

    df["std_return_5"] = by_stock()["close"].transform(
        lambda s: s.pct_change().rolling(window=5).std()
    )
    df["std_return_90"] = by_stock()["close"].transform(
        lambda s: s.pct_change().rolling(window=90).std()
    )
    # Same 90-day statistic, measured 10 trading days earlier.
    df["std_return_90_2"] = by_stock()["close"].transform(
        lambda s: s.shift(10).pct_change().rolling(window=90).std()
    )

    # ------------------------------------------------------------------
    # EMA slope ("act") factors
    # ------------------------------------------------------------------
    ema_periods = (5, 13, 20, 60)
    for period in ema_periods:
        df[f"_ema_{period}"] = per_stock(
            lambda g, p=period: ta(talib.EMA, g, "close", timeperiod=p)
        )

    # Slope angle of each EMA (arctan of the daily % change, scaled by 57.3
    # and a per-period divisor).
    slope_scales = (50, 40, 21, 10)
    for number, (period, scale) in enumerate(zip(ema_periods, slope_scales), start=1):
        ema_col = f"_ema_{period}"
        daily_change = df[ema_col] / by_stock()[ema_col].shift(1) - 1
        df[f"act_factor{number}"] = np.arctan(daily_change * 100) * 57.3 / scale

    # Cross-sectional (per trade_date) percentile ranks, largest first.
    for number in (1, 2, 3):
        df[f"rank_act_factor{number}"] = by_date()[f"act_factor{number}"].rank(
            ascending=False, pct=True
        )

    df["log_circ_mv"] = np.log(df["circ_mv"])

    # ------------------------------------------------------------------
    # Alpha-style factors
    # ------------------------------------------------------------------
    window_high_volume = 5
    window_close_stddev = 20
    period_delta = 5

    # Rolling covariance of high and volume, its 5-day change, and the
    # cross-sectional rank of 20-day close volatility.
    df["cov"] = per_stock(
        lambda g: g["high"].rolling(window_high_volume).cov(g["vol"])
    )
    df["delta_cov"] = by_stock()["cov"].diff(period_delta)
    df["_stddev_close"] = flat(roll("close", window_close_stddev).std())
    df["_rank_stddev"] = df.groupby("trade_date")["_stddev_close"].rank(pct=True)
    df["alpha_22_improved"] = -1 * df["delta_cov"] * df["_rank_stddev"]

    # Candle body relative to the day's range (0 when high == low).
    df["alpha_003"] = np.where(
        df["high"] != df["low"],
        (df["close"] - df["open"]) / (df["high"] - df["low"]),
        0,
    )

    # 5-day close/volume correlation, then ranked within each trade_date.
    df["alpha_007"] = per_stock(lambda g: g["close"].rolling(5).corr(g["vol"]))
    df["alpha_007"] = by_date()["alpha_007"].rank(ascending=True, pct=True)

    # 5-day close sum minus 20-day close sum, then ranked within each date.
    df["alpha_013"] = by_stock()["close"].transform(
        lambda s: s.rolling(5).sum() - s.rolling(20).sum()
    )
    df["alpha_013"] = by_date()["alpha_013"].rank(ascending=True, pct=True)

    df["vol_break"] = np.where(
        (df["close"] > df["cost_85pct"]) & (df["volume_ratio"] > 2), 1, 0
    )

    df["weight_roc5"] = by_stock()["weight_avg"].pct_change(5)

    def price_cost_corr(group):
        # 10-day correlation between daily price change and cost change.
        roc_close = group["close"].pct_change()
        roc_weight = group["weight_avg"].pct_change()
        return roc_close.rolling(10).corr(roc_weight)

    df["price_cost_divergence"] = per_stock(price_cost_corr)

    df["smallcap_concentration"] = (1 / df["log_circ_mv"]) * (
        df["cost_85pct"] - df["cost_15pct"]
    )

    # 16. Cost stability: 20-day coefficient of variation of weight_avg.
    weight_std20 = flat(roll("weight_avg", 20).std())
    df["cost_stability"] = weight_std20 / flat(roll("weight_avg", 20).mean())

    # 17. Number of the last 5 days that closed above the 95% cost line.
    closes_above_95 = df["close"].gt(df["cost_95pct"]).astype("float64")
    df["high_cost_break_days"] = flat(
        closes_above_95.groupby(stock_codes).rolling(5).sum()
    )

    # 20. Chip range scaled by the inverse of 10-day mean volume.
    df["liquidity_risk"] = (df["cost_95pct"] - df["cost_5pct"]) * (
        1 / flat(roll("vol", 10).mean())
    )

    # 7. Turnover volatility scaled by log market cap.
    df["turnover_std"] = flat(roll("turnover_rate", 20).std())
    df["mv_volatility"] = df["turnover_std"] / df["log_circ_mv"]

    # 8. 20-day volume growth scaled by log market cap.
    df["volume_growth"] = by_stock()["vol"].pct_change(periods=20)
    df["mv_growth"] = df["volume_growth"] / df["log_circ_mv"]

    # Drop scratch columns. The "_ema_*", "_stddev_close" and "_rank_stddev"
    # columns are intentionally left in place, as before.
    df.drop(
        columns=[
            "weight_std20",
            "_is_positive",
            "_is_negative",
            "_pos_returns",
            "_neg_returns",
            "_pos_returns_sq",
            "_neg_returns_sq",
        ],
        inplace=True,
        errors="ignore",
    )

    new_columns = [col for col in df.columns if col not in old_columns]
    return df, new_columns
|
||
|
||
|
||
def get_simple_factor(df):
    """Add row-wise (non-rolling) factor columns derived from existing ones.

    The frame is copied by sorting on ``["ts_code", "trade_date"]``. Besides
    raw market, money-flow and chip columns, it reads columns such as
    ``volume_change_rate``, ``turnover_deviation``, ``vol_spike``, ``obv``,
    ``maobv_6``, ``std_return_*``, ``act_factor1..4`` and ``atr_14``, which
    must already be present. Every column whose name starts with ``"_"`` is
    dropped before returning.

    Returns:
        tuple: ``(df, new_columns)`` where ``new_columns`` lists, in frame
        order, the columns that were not present in the input.
    """
    existing = list(df.columns)
    df = df.sort_values(by=["ts_code", "trade_date"])

    # --- volume / price blends -----------------------------------------
    turnover_weight = 0.5
    df["momentum_factor"] = (
        df["volume_change_rate"] + turnover_weight * df["turnover_deviation"]
    )
    df["resonance_factor"] = df["volume_ratio"] * df["pct_chg"]
    df["log_close"] = np.log(df["close"])

    df["cat_vol_spike"] = df["vol"] > 2 * df["vol_spike"]

    # Upper / lower candle shadows relative to the close.
    candle_body = df[["close", "open"]]
    df["up"] = (df["high"] - candle_body.max(axis=1)) / df["close"]
    df["down"] = (candle_body.min(axis=1) - df["low"]) / df["close"]

    df["obv_maobv_6"] = df["obv"] - df["maobv_6"]

    # --- volatility comparisons ----------------------------------------
    df["std_return_5_over_std_return_90"] = df["std_return_5"] / df["std_return_90"]
    df["std_return_90_minus_std_return_90_2"] = (
        df["std_return_90"] - df["std_return_90_2"]
    )

    # --- EMA-slope ("act") factor relations ----------------------------
    # cat_afK is True when the slower EMA slope exceeds the next faster one.
    for slower in (2, 3, 4):
        df[f"cat_af{slower}"] = (
            df[f"act_factor{slower}"] > df[f"act_factor{slower - 1}"]
        )

    df["act_factor5"] = (
        df["act_factor1"] + df["act_factor2"] + df["act_factor3"] + df["act_factor4"]
    )
    df["act_factor6"] = (df["act_factor1"] - df["act_factor2"]) / np.sqrt(
        df["act_factor1"] ** 2 + df["act_factor2"] ** 2
    )

    # --- money flow relative to net flow -------------------------------
    df["active_buy_volume_large"] = df["buy_lg_vol"] / df["net_mf_vol"]
    df["active_buy_volume_big"] = df["buy_elg_vol"] / df["net_mf_vol"]
    df["active_buy_volume_small"] = df["buy_sm_vol"] / df["net_mf_vol"]

    net_large = df["buy_lg_vol"] - df["sell_lg_vol"]
    df["buy_lg_vol_minus_sell_lg_vol"] = net_large / df["net_mf_vol"]
    net_extra_large = df["buy_elg_vol"] - df["sell_elg_vol"]
    df["buy_elg_vol_minus_sell_elg_vol"] = net_extra_large / df["net_mf_vol"]

    df["log_circ_mv"] = np.log(df["circ_mv"])

    # --- chip-distribution ratios --------------------------------------
    df["ctrl_strength"] = (df["cost_85pct"] - df["cost_15pct"]) / (
        df["his_high"] - df["his_low"]
    )
    df["low_cost_dev"] = (df["close"] - df["cost_5pct"]) / (
        df["cost_50pct"] - df["cost_5pct"]
    )
    df["asymmetry"] = (df["cost_95pct"] - df["cost_50pct"]) / (
        df["cost_50pct"] - df["cost_5pct"]
    )
    df["lock_factor"] = df["turnover_rate"] * (
        1 - (df["cost_95pct"] - df["cost_5pct"]) / (df["his_high"] - df["his_low"])
    )

    df["cat_vol_break"] = (df["close"] > df["cost_85pct"]) & (df["volume_ratio"] > 2)

    df["cost_atr_adj"] = (df["cost_95pct"] - df["cost_5pct"]) / df["atr_14"]

    # 12. Small-cap chip concentration.
    df["smallcap_concentration"] = (1 / df["log_circ_mv"]) * (
        df["cost_85pct"] - df["cost_15pct"]
    )

    df["cat_golden_resonance"] = (
        (df["close"] > df["weight_avg"])
        & (df["volume_ratio"] > 1.5)
        & (df["winner_rate"] > 0.7)
    )

    # --- market-cap scaled quantities ----------------------------------
    df["mv_turnover_ratio"] = df["turnover_rate"] / df["log_circ_mv"]
    df["mv_adjusted_volume"] = df["vol"] / df["log_circ_mv"]
    df["mv_weighted_turnover"] = df["turnover_rate"] * (1 / df["log_circ_mv"])
    df["nonlinear_mv_volume"] = df["vol"] / df["log_circ_mv"]
    df["mv_volume_ratio"] = df["volume_ratio"] / df["log_circ_mv"]
    df["mv_momentum"] = df["turnover_rate"] * df["volume_ratio"] / df["log_circ_mv"]

    # Underscore-prefixed columns are scratch data and are removed.
    scratch = [name for name in df.columns if name.startswith("_")]
    df.drop(columns=scratch, inplace=True, errors="ignore")

    added = [name for name in df.columns if name not in existing]
    return df, added
|
||
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
from scipy.stats import linregress # For factor 4 (if implementing slope directly)
|
||
|
||
# from hurst import compute_Hc # For factor 18, needs pip install hurst
|
||
# import statsmodels.api as sm # For factor 16, needs pip install statsmodels
|
||
|
||
# --- Constants ---
|
||
epsilon = 1e-10  # Small constant added to denominators to prevent division by zero
|
||
|
||
|
||
# --- Helper Functions ---
|
||
def _safe_divide(a, b, default_val=0):
    """Divide ``a`` by ``b``, replacing non-finite results with ``default_val``.

    Works for array-likes that support ``/`` and boolean-mask assignment
    (NumPy arrays, pandas Series/DataFrames) and for plain scalars. The
    previous version raised on scalars: ``ZeroDivisionError`` for Python
    numbers, or ``TypeError`` when the scalar result was item-assigned.

    Args:
        a: Numerator.
        b: Denominator.
        default_val: Value used wherever the quotient is NaN, +inf or -inf.

    Returns:
        The quotient, of the same kind as ``a / b`` for array inputs. For
        scalar inputs a NumPy float, or ``default_val`` itself when the
        quotient is not finite.
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        if np.ndim(a) == 0 and np.ndim(b) == 0:
            # np.divide yields inf/nan instead of raising on a zero divisor.
            quotient = np.divide(a, b)
            return quotient if np.isfinite(quotient) else default_val
        result = a / b
    # ``result`` is a fresh object, so the inputs are never modified.
    result[~np.isfinite(result)] = default_val
    return result
|
||
|
||
|
||
# --- Factor Calculation Functions (In-Place Modification) ---
|
||
|
||
|
||
# Category 1: Large Player Intent & Behavior
|
||
def lg_flow_mom_corr(
    df: pd.DataFrame, N: int = 20, M: int = 60, factor_name: str = None
):
    """
    Calculates Factor 1: Large Flow & Price Momentum Concordance (In-place).

    For every stock, the N-day rolling sum of net large-order money flow
    ((large + extra-large buys - sells) * close) is correlated over an M-day
    rolling window with the N-day price change.

    The correlation is computed stock by stock. Previously the M-day window
    ran over the whole frame, so the first rows of each stock were
    correlated against the preceding stock's data.

    Args:
        df: Frame with ``ts_code``, ``close`` and the ``buy_*``/``sell_*``
            large and extra-large volume columns; its index must be unique.
        N: Window for the flow sum and the price momentum.
        M: Window for the rolling correlation.
        factor_name: Output column; defaults to ``lg_flow_mom_corr_{N}_{M}``.

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    if factor_name is None:
        factor_name = f"lg_flow_mom_corr_{N}_{M}"
    print(f"Calculating {factor_name}...")
    _temp_cols = ["_net_lg_flow_val", "_rolling_net_lg_flow", "_price_mom"]
    try:
        df["_net_lg_flow_val"] = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        ) * df["close"]
        df["_rolling_net_lg_flow"] = (
            df.groupby("ts_code")["_net_lg_flow_val"]
            .rolling(N, min_periods=max(1, N // 2))
            .sum()
            .reset_index(level=0, drop=True)
        )
        df["_price_mom"] = df.groupby("ts_code")["close"].pct_change(N)

        # Correlate within each stock so windows never span two stocks.
        corr_min_periods = max(1, M // 2)
        per_stock_corr = [
            group["_rolling_net_lg_flow"]
            .rolling(M, min_periods=corr_min_periods)
            .corr(group["_price_mom"])
            for _, group in df.groupby("ts_code", sort=False)
        ]
        if per_stock_corr:
            # Assignment aligns the concatenated pieces on df's index.
            df[factor_name] = pd.concat(per_stock_corr)
        else:
            df[factor_name] = np.nan
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan  # Assign NaN on error
    finally:
        # Cleanup intermediate columns
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
def lg_buy_consolidation(
    df: pd.DataFrame, N: int = 20, vol_quantile: float = 0.2, factor_name: str = None
):
    """
    Calculates Factor 2: Large Buying during Consolidation (In-place).

    The factor is the per-stock N-day mean of (net large + extra-large
    volume) / total volume. It is kept only on rows whose N-day rolling std
    of ``close`` is below that trade date's ``vol_quantile`` cross-sectional
    quantile; all other rows are NaN.

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    if factor_name is None:
        factor_name = f"lg_buy_consolidation_{N}"
    print(f"Calculating {factor_name}...")
    scratch = (
        "_rolling_std",
        "_net_lg_flow_ratio",
        "_rolling_net_lg_flow_ratio_mean",
        "_std_threshold",
    )
    try:
        # Per-stock rolling std of close.
        close_windows = df.groupby("ts_code")["close"].rolling(
            N, min_periods=max(1, N // 2)
        )
        df["_rolling_std"] = close_windows.std().reset_index(level=0, drop=True)

        # Net large-order volume as a share of total volume.
        net_large_vol = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        df["_net_lg_flow_ratio"] = _safe_divide(net_large_vol, df["vol"])

        ratio_windows = df.groupby("ts_code")["_net_lg_flow_ratio"].rolling(
            N, min_periods=max(1, N // 2)
        )
        df["_rolling_net_lg_flow_ratio_mean"] = ratio_windows.mean().reset_index(
            level=0, drop=True
        )

        # Cross-sectional std threshold for each trade date.
        df["_std_threshold"] = df.groupby("trade_date")["_rolling_std"].transform(
            lambda x: x.quantile(vol_quantile)
        )

        below_threshold = df["_rolling_std"] < df["_std_threshold"]
        df[factor_name] = df["_rolling_net_lg_flow_ratio_mean"].where(below_threshold)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        # Remove whichever scratch columns were created.
        df.drop(columns=list(scratch), inplace=True, errors="ignore")
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
def lg_flow_accel(df: pd.DataFrame, factor_name: str = "lg_flow_accel"):
    """
    Calculates Factor 3: Large Flow Acceleration (In-place).

    The factor is the second difference, within each ``ts_code``, of net
    large + extra-large order volume (buys minus sells).

    Both differences are taken per stock. Previously only the first one was
    grouped, so the result was only right when each stock's rows were
    contiguous; with interleaved rows it mixed values from different stocks.

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    print(f"Calculating {factor_name}...")
    _temp_cols = ["_net_lg_flow_vol"]
    try:
        df["_net_lg_flow_vol"] = (
            df["buy_lg_vol"]
            + df["buy_elg_vol"]
            - df["sell_lg_vol"]
            - df["sell_elg_vol"]
        )
        first_diff = df.groupby("ts_code")["_net_lg_flow_vol"].diff(1)
        # Re-group by stock: the first diff is a plain Series, so a bare
        # .diff() would run across stock boundaries.
        df[factor_name] = first_diff.groupby(df["ts_code"]).diff(1)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
def intraday_lg_flow_corr(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Factor 4 placeholder: (Approx) Intraday Trend & Large Flow Correlation.

    No computation is implemented yet. The function only creates the output
    column, filled with NaN, so downstream code can rely on it existing.
    ``N`` is used solely to build the default column name
    ``intraday_lg_flow_corr_{N}``.

    WARNING: Modifies df in-place.
    """
    factor_name = (
        f"intraday_lg_flow_corr_{N}" if factor_name is None else factor_name
    )
    print(f"Calculating {factor_name} (Placeholder - complex implementation)...")
    # Placeholder value until a real implementation exists.
    df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
|
||
|
||
|
||
# Category 2: Cost Basis & PnL Status
|
||
def profit_pressure(df: pd.DataFrame, factor_name: str = "profit_pressure"):
    """
    Calculates Factor 5: Profit Pressure Index (In-place).

    The factor is ``winner_rate`` times the average of the close's margin
    over the 85% and 95% cost lines, where each margin is
    ``close / cost - 1``.

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    print(f"Calculating {factor_name}...")
    cost_levels = (85, 95)
    scratch = [f"_profit_margin_{level}" for level in cost_levels]
    try:
        # Margin of the close over each high cost percentile.
        for level in cost_levels:
            ratio = _safe_divide(df["close"], df[f"cost_{level}pct"])
            df[f"_profit_margin_{level}"] = ratio - 1

        margin_total = df["_profit_margin_85"] + df["_profit_margin_95"]
        df[factor_name] = df["winner_rate"] * 0.5 * margin_total
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        # Remove whichever scratch columns were created.
        df.drop(columns=scratch, inplace=True, errors="ignore")
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
def underwater_resistance(df: pd.DataFrame, factor_name: str = "underwater_resistance"):
    """
    Calculates Factor 6: Resistance from Underwater Positions (In-place).

    The factor multiplies ``1 - winner_rate`` by how far the close sits
    below the 15% cost line, relative to the close. The distance is zero
    when the close is at or above that line.

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    print(f"Calculating {factor_name}...")
    scratch = ("_underwater_ratio", "_dist_to_cost_15")
    try:
        df["_underwater_ratio"] = 1.0 - df["winner_rate"]

        # Only a close below the 15% cost line contributes a distance.
        shortfall = np.maximum(0, df["cost_15pct"] - df["close"])
        df["_dist_to_cost_15"] = shortfall / (df["close"] + epsilon)

        df[factor_name] = df["_underwater_ratio"] * df["_dist_to_cost_15"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        # Remove whichever scratch columns were created.
        df.drop(columns=list(scratch), inplace=True, errors="ignore")
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cost_conc_std(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Calculates Factor 7: Cost Concentration Change (Std Dev) (In-place).

    The factor is the per-stock N-day rolling standard deviation of the
    85%-15% cost spread divided by the weighted-average cost.

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    if factor_name is None:
        factor_name = f"cost_conc_std_{N}"
    print(f"Calculating {factor_name}...")
    scratch = ("_cost_range_norm",)
    try:
        cost_spread = df["cost_85pct"] - df["cost_15pct"]
        df["_cost_range_norm"] = _safe_divide(cost_spread, df["weight_avg"] + epsilon)

        # The scratch column must exist while the grouped rolling std runs;
        # it is removed afterwards in ``finally``.
        windows = df.groupby("ts_code")["_cost_range_norm"].rolling(
            N, min_periods=max(1, N // 2)
        )
        df[factor_name] = windows.std().reset_index(level=0, drop=True)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        df.drop(columns=list(scratch), inplace=True, errors="ignore")
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
def profit_decay(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Calculates Factor 8: Profit Expectation Decay (In-place).

    The factor is the N-day price return divided by the N-day change in
    ``winner_rate``, both measured within each stock. Non-finite quotients
    are replaced by ``_safe_divide``'s default value.

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    if factor_name is None:
        factor_name = f"profit_decay_{N}"
    print(f"Calculating {factor_name}...")
    scratch = ("_ret_N", "_winner_rate_change_N")
    try:
        # Close N rows earlier for the same stock.
        earlier_close = df.groupby("ts_code")["close"].shift(N)
        df["_ret_N"] = _safe_divide(df["close"], earlier_close) - 1

        df["_winner_rate_change_N"] = df.groupby("ts_code")["winner_rate"].diff(N)

        df[factor_name] = _safe_divide(df["_ret_N"], df["_winner_rate_change_N"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        df.drop(columns=list(scratch), inplace=True, errors="ignore")
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
# Category 3: Volatility Source & Market State
|
||
def vol_amp_loss(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """
    Calculates Factor 9: Volatility Amplification when Underwater (In-place).

    The factor multiplies the per-stock N-day rolling std of ``pct_chg`` by
    the loss degree: how far the close sits below ``weight_avg``, relative
    to the close (zero when the close is at or above ``weight_avg``).

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    if factor_name is None:
        factor_name = f"vol_amp_loss_{N}"
    print(f"Calculating {factor_name}...")
    scratch = ("_vol_N", "_loss_degree")
    try:
        return_windows = df.groupby("ts_code")["pct_chg"].rolling(
            N, min_periods=max(1, N // 2)
        )
        df["_vol_N"] = return_windows.std().reset_index(level=0, drop=True)

        # Only a close below the average cost counts as a loss.
        below_cost = np.maximum(0, df["weight_avg"] - df["close"])
        df["_loss_degree"] = below_cost / (df["close"] + epsilon)

        df[factor_name] = df["_vol_N"] * df["_loss_degree"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        df.drop(columns=list(scratch), inplace=True, errors="ignore")
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
def vol_drop_profit_cnt(
    df: pd.DataFrame,
    N: int = 20,
    M: int = 5,
    profit_thresh: float = 0.1,
    drop_thresh: float = -0.03,
    vol_multiple: float = 2.0,
    factor_name: str = None,
):
    """
    Calculates Factor 10: High Volume Drop when Profitable (Count over M days) (In-place).

    A row counts as an event when all of the following hold:
      * ``close > weight_avg * (1 + profit_thresh)``
      * ``pct_chg < drop_thresh``
      * ``vol`` exceeds its per-stock N-day rolling mean by more than
        ``vol_multiple`` rolling standard deviations. The window includes
        the current row, and a missing std is treated as 0.

    The factor is the per-stock number of events in the last M rows.

    WARNING: Modifies df in-place. On any error the factor column is set to
    NaN and the error is printed.
    """
    if factor_name is None:
        factor_name = f"vol_drop_profit_cnt_{M}"
    print(f"Calculating {factor_name}...")
    scratch = (
        "_is_profitable",
        "_is_dropping",
        "_rolling_mean_vol",
        "_rolling_std_vol",
        "_is_high_vol",
        "_event",
    )
    try:
        profit_line = df["weight_avg"] * (1 + profit_thresh)
        df["_is_profitable"] = df["close"] > profit_line
        df["_is_dropping"] = df["pct_chg"] < drop_thresh

        # Per-stock volume statistics over the trailing N rows.
        vol_by_stock = df.groupby("ts_code")["vol"]
        mean_vol = vol_by_stock.rolling(N, min_periods=1).mean()
        df["_rolling_mean_vol"] = mean_vol.reset_index(level=0, drop=True)
        std_vol = vol_by_stock.rolling(N, min_periods=2).std()
        df["_rolling_std_vol"] = std_vol.reset_index(level=0, drop=True).fillna(0)

        vol_ceiling = df["_rolling_mean_vol"] + vol_multiple * df["_rolling_std_vol"]
        df["_is_high_vol"] = df["vol"] > vol_ceiling

        all_three = df["_is_profitable"] & df["_is_dropping"] & df["_is_high_vol"]
        df["_event"] = all_three.astype(int)

        # Count events over the trailing M rows of each stock.
        event_windows = df.groupby("ts_code")["_event"].rolling(M, min_periods=1)
        df[factor_name] = event_windows.sum().reset_index(level=0, drop=True)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        df.drop(columns=list(scratch), inplace=True, errors="ignore")
    print(f"Finished {factor_name}.")
|
||
|
||
|
||
def lg_flow_vol_interact(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 11: return volatility times the average large-order flow share.

    Per ``ts_code`` the function multiplies

    * the rolling standard deviation of ``pct_chg`` over ``N`` rows, and
    * the rolling mean over ``N`` rows of ``|net large flow value|`` divided
      by total traded value, where net large flow is
      ``buy_lg_vol + buy_elg_vol - sell_lg_vol - sell_elg_vol`` and both
      values are volume times ``close``.

    Both windows need at least ``max(1, N // 2)`` observations.

    WARNING: modifies ``df`` in place and returns ``None``. Any failure is
    printed and the factor column is filled with NaN instead of raising.

    Args:
        df: Frame with ``ts_code``, ``pct_chg``, ``close``, ``vol`` and the
            four large / extra-large buy and sell volume columns.
        N: Rolling window length.
        factor_name: Output column; defaults to ``lg_flow_vol_interact_{N}``.
    """
    if factor_name is None:
        factor_name = f"lg_flow_vol_interact_{N}"
    print(f"Calculating {factor_name}...")
    scratch_cols = (
        "_vol_N",
        "_net_lg_flow_val",
        "_total_val",
        "_abs_net_lg_flow_ratio",
        "_abs_net_lg_flow_ratio_N",
    )

    def _rolling_by_code(column):
        # Fresh groupby per call so newly written scratch columns are visible.
        return df.groupby("ts_code")[column].rolling(N, min_periods=max(1, N // 2))

    try:
        return_std = _rolling_by_code("pct_chg").std()
        df["_vol_N"] = return_std.reset_index(level=0, drop=True)

        net_large_volume = (
            df["buy_lg_vol"] + df["buy_elg_vol"] - df["sell_lg_vol"] - df["sell_elg_vol"]
        )
        df["_net_lg_flow_val"] = net_large_volume * df["close"]
        df["_total_val"] = df["vol"] * df["close"]

        # `epsilon` is the module-level constant; it keeps the ratio finite
        # when no value was traded.
        df["_abs_net_lg_flow_ratio"] = df["_net_lg_flow_val"].abs() / (
            df["_total_val"] + epsilon
        )

        mean_ratio = _rolling_by_code("_abs_net_lg_flow_ratio").mean()
        df["_abs_net_lg_flow_ratio_N"] = mean_ratio.reset_index(level=0, drop=True)

        df[factor_name] = df["_vol_N"] * df["_abs_net_lg_flow_ratio_N"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [name for name in scratch_cols if name in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cost_break_confirm_cnt(df: pd.DataFrame, M: int = 5, factor_name: str = None):
    """Factor 12: net count of cost-band breakouts confirmed by large flow.

    For each row, using the previous row's cost band of the same ``ts_code``:

    * +1 when ``close`` is above the previous ``cost_85pct`` and net large
      flow volume is positive;
    * -1 when ``close`` is below the previous ``cost_15pct`` and net large
      flow volume is negative;
    * 0 otherwise.

    Net large flow volume is
    ``buy_lg_vol + buy_elg_vol - sell_lg_vol - sell_elg_vol``. The factor is
    the per-``ts_code`` rolling sum of these scores over ``M`` rows.

    WARNING: modifies ``df`` in place and returns ``None``. Any failure is
    printed and the factor column is filled with NaN instead of raising.

    Args:
        df: Frame with ``ts_code``, ``close``, ``cost_85pct``, ``cost_15pct``
            and the four large / extra-large volume columns.
        M: Rolling window length for the score sum.
        factor_name: Output column; defaults to ``cost_break_confirm_cnt_{M}``.
    """
    factor_name = f"cost_break_confirm_cnt_{M}" if factor_name is None else factor_name
    print(f"Calculating {factor_name}...")
    scratch_cols = (
        "_prev_cost_85",
        "_prev_cost_15",
        "_break_up",
        "_break_down",
        "_net_lg_flow_vol",
        "_confirm_up",
        "_confirm_down",
        "_net_confirm",
    )
    try:
        # Yesterday's band: the first row of each stock has no reference and
        # therefore never registers a breakout (comparison with NaN is False).
        df["_prev_cost_85"] = df.groupby("ts_code")["cost_85pct"].shift(1)
        df["_prev_cost_15"] = df.groupby("ts_code")["cost_15pct"].shift(1)
        df["_break_up"] = df["close"] > df["_prev_cost_85"]
        df["_break_down"] = df["close"] < df["_prev_cost_15"]

        df["_net_lg_flow_vol"] = (
            df["buy_lg_vol"] + df["buy_elg_vol"] - df["sell_lg_vol"] - df["sell_elg_vol"]
        )
        inflow = df["_net_lg_flow_vol"] > 0
        outflow = df["_net_lg_flow_vol"] < 0

        df["_confirm_up"] = (df["_break_up"] & inflow).astype(int)
        df["_confirm_down"] = (df["_break_down"] & outflow).astype(int)
        df["_net_confirm"] = df["_confirm_up"] - df["_confirm_down"]

        score_sum = df.groupby("ts_code")["_net_confirm"].rolling(M, min_periods=1).sum()
        df[factor_name] = score_sum.reset_index(level=0, drop=True)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [name for name in scratch_cols if name in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
# Category 4: Technical Indicators & Market Behavior
|
||
def atr_norm_channel_pos(df: pd.DataFrame, N: int = 14, factor_name: str = None):
    """Factor 13: distance of close above the rolling low, in ATR units.

    True range is the row-wise maximum of ``high - low``,
    ``|high - previous close|`` and ``|low - previous close|`` (previous
    close taken within the same ``ts_code``). ATR is the simple rolling mean
    of true range over ``N`` rows. The factor is
    ``(close - rolling min of low over N rows) / ATR`` computed through
    ``_safe_divide``. Both windows need ``max(1, N // 2)`` observations.

    WARNING: modifies ``df`` in place and returns ``None``. Any failure is
    printed and the factor column is filled with NaN instead of raising.

    Args:
        df: Frame with ``ts_code``, ``high``, ``low`` and ``close``.
        N: Rolling window length.
        factor_name: Output column; defaults to ``atr_norm_channel_pos_{N}``.
    """
    factor_name = f"atr_norm_channel_pos_{N}" if factor_name is None else factor_name
    print(f"Calculating {factor_name}...")
    scratch_cols = (
        "_prev_close",
        "_h_l",
        "_h_pc",
        "_l_pc",
        "_tr",
        "_atr_N",
        "_roll_low_N",
    )

    def _rolling_by_code(column):
        # Fresh groupby per call so newly written scratch columns are visible.
        return df.groupby("ts_code")[column].rolling(N, min_periods=max(1, N // 2))

    try:
        df["_prev_close"] = df.groupby("ts_code")["close"].shift(1)
        df["_h_l"] = df["high"] - df["low"]
        df["_h_pc"] = (df["high"] - df["_prev_close"]).abs()
        df["_l_pc"] = (df["low"] - df["_prev_close"]).abs()
        # max() skips NaN, so a row without a previous close falls back to high - low.
        df["_tr"] = df[["_h_l", "_h_pc", "_l_pc"]].max(axis=1)

        atr = _rolling_by_code("_tr").mean()
        df["_atr_N"] = atr.reset_index(level=0, drop=True)

        channel_floor = _rolling_by_code("low").min()
        df["_roll_low_N"] = channel_floor.reset_index(level=0, drop=True)

        height_above_floor = df["close"] - df["_roll_low_N"]
        df[factor_name] = _safe_divide(height_above_floor, df["_atr_N"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [name for name in scratch_cols if name in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def turnover_diff_skew(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 14: rolling skewness of the day-over-day turnover change.

    The one-row difference of ``turnover_rate`` is taken within each
    ``ts_code``; the factor is its rolling skewness over ``N`` rows, which
    needs at least ``max(3, N // 2)`` non-NaN observations.

    WARNING: modifies ``df`` in place and returns ``None``. Any failure is
    printed and the factor column is filled with NaN instead of raising.

    Args:
        df: Frame with ``ts_code`` and ``turnover_rate``.
            NOTE(review): the original author assumed ``turnover_rate`` is in
            percentage points; a plain difference is used either way.
        N: Rolling window length.
        factor_name: Output column; defaults to ``turnover_diff_skew_{N}``.
    """
    factor_name = f"turnover_diff_skew_{N}" if factor_name is None else factor_name
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_turnover_diff",)
    try:
        df["_turnover_diff"] = df.groupby("ts_code")["turnover_rate"].diff(1)

        windows = df.groupby("ts_code")["_turnover_diff"].rolling(
            N, min_periods=max(3, N // 2)
        )
        df[factor_name] = windows.skew().reset_index(level=0, drop=True)
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [name for name in scratch_cols if name in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def lg_sm_flow_diverge(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 15: average large-order flow share minus small-order flow share.

    Two daily ratios are built with ``_safe_divide``:

    * large: ``(buy_lg_vol + buy_elg_vol - sell_lg_vol - sell_elg_vol) / vol``
    * small: ``(buy_sm_vol - sell_sm_vol) / vol``

    Each is averaged per ``ts_code`` over ``N`` rows (at least
    ``max(1, N // 2)`` observations) and the factor is large minus small.

    WARNING: modifies ``df`` in place and returns ``None``. Any failure is
    printed and the factor column is filled with NaN instead of raising.

    Args:
        df: Frame with ``ts_code``, ``vol`` and the large, extra-large and
            small buy / sell volume columns.
        N: Rolling window length.
        factor_name: Output column; defaults to ``lg_sm_flow_diverge_{N}``.
    """
    factor_name = f"lg_sm_flow_diverge_{N}" if factor_name is None else factor_name
    print(f"Calculating {factor_name}...")
    scratch_cols = (
        "_lg_flow_ratio",
        "_sm_flow_ratio",
        "_lg_flow_ratio_N",
        "_sm_flow_ratio_N",
    )

    def _rolling_mean_by_code(column):
        # Fresh groupby per call so newly written scratch columns are visible.
        windows = df.groupby("ts_code")[column].rolling(N, min_periods=max(1, N // 2))
        return windows.mean().reset_index(level=0, drop=True)

    try:
        net_large = (
            df["buy_lg_vol"] + df["buy_elg_vol"] - df["sell_lg_vol"] - df["sell_elg_vol"]
        )
        df["_lg_flow_ratio"] = _safe_divide(net_large, df["vol"])

        net_small = df["buy_sm_vol"] - df["sell_sm_vol"]
        df["_sm_flow_ratio"] = _safe_divide(net_small, df["vol"])

        df["_lg_flow_ratio_N"] = _rolling_mean_by_code("_lg_flow_ratio")
        df["_sm_flow_ratio_N"] = _rolling_mean_by_code("_sm_flow_ratio")

        df[factor_name] = df["_lg_flow_ratio_N"] - df["_sm_flow_ratio_N"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [name for name in scratch_cols if name in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cap_neutral_cost_metric(
    df: pd.DataFrame, factor_name: str = "cap_neutral_cost_metric"
):
    """Factor 16 placeholder: market-cap neutralised cost metric.

    No computation is implemented yet (the original note says it requires
    statsmodels). The function only creates ``factor_name`` filled with NaN.

    WARNING: modifies ``df`` in place and returns ``None``.

    Args:
        df: Frame that receives the placeholder column.
        factor_name: Name of the column to create.
    """
    start_note = f"Calculating {factor_name} (Placeholder - requires statsmodels)..."
    print(start_note)
    df[factor_name] = np.nan
    end_note = f"Finished {factor_name} (Placeholder)."
    print(end_note)
|
||
|
||
|
||
def pullback_strong(
    df: pd.DataFrame,
    N: int = 20,
    M: int = 20,
    gain_thresh: float = 0.2,
    factor_name: str = None,
):
    """Factor 17: pullback depth from the recent high relative to recent gain.

    Per ``ts_code``:

    * pullback depth = ``(rolling max of high over N rows - close)`` divided
      by that rolling max (at least ``max(1, N // 2)`` observations);
    * recent gain = ``close / close M rows earlier - 1``;
    * factor = pullback depth / recent gain, via ``_safe_divide`` (so a zero
      or unusable gain yields 0).

    Rows without a usable close ``M`` rows back (the first ``M`` rows of each
    stock, or a missing / zero price) are given a neutral gain of 0. The
    previous implementation let ``_safe_divide`` return 0 for the price ratio
    there, which turned into a fake -100% gain and a spurious negative factor.

    WARNING: modifies ``df`` in place and returns ``None``. Any failure is
    printed and the factor column is filled with NaN instead of raising.

    Args:
        df: Frame with ``ts_code``, ``high`` and ``close``.
        N: Window for the recent high.
        M: Look-back, in rows, for the recent gain.
        gain_thresh: Currently unused; accepted for interface compatibility.
            NOTE(review): presumably meant to restrict the factor to stocks
            whose recent gain exceeds it -- confirm intent before wiring in.
        factor_name: Output column; defaults to ``pullback_strong_{N}_{M}``.
    """
    if factor_name is None:
        factor_name = f"pullback_strong_{N}_{M}"
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_high_N", "_pullback_depth", "_recent_gain_M")
    try:
        recent_high = (
            df.groupby("ts_code")["high"]
            .rolling(N, min_periods=max(1, N // 2))
            .max()
            .reset_index(level=0, drop=True)
        )
        df["_high_N"] = recent_high
        df["_pullback_depth"] = _safe_divide(df["_high_N"] - df["close"], df["_high_N"])

        earlier_close = df.groupby("ts_code")["close"].shift(M)
        # default_val=1.0 is the neutral price ratio: an undefined ratio then
        # maps to a gain of 0 rather than -1.
        df["_recent_gain_M"] = (
            _safe_divide(df["close"], earlier_close, default_val=1.0) - 1
        )

        df[factor_name] = _safe_divide(df["_pullback_depth"], df["_recent_gain_M"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [name for name in scratch_cols if name in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def hurst_exponent_flow(
    df: pd.DataFrame, N: int = 60, flow_col: str = "net_mf_vol", factor_name: str = None
):
    """Factor 18 placeholder: Hurst exponent of a money-flow column.

    No computation is implemented yet. The function only checks that the
    third-party ``hurst`` package can be imported and then fills
    ``factor_name`` with NaN on every path (success, missing package, or any
    other error).

    WARNING: modifies ``df`` in place and returns ``None``.

    Args:
        df: Frame that receives the placeholder column.
        N: Intended rolling window; only used in the default column name.
        flow_col: Intended input column; only used in the default column name.
        factor_name: Output column; defaults to ``hurst_{flow_col}_{N}``.
    """
    factor_name = f"hurst_{flow_col}_{N}" if factor_name is None else factor_name
    print(f"Calculating {factor_name} (Placeholder - requires hurst library)...")
    try:
        # Imported only to verify availability; the real logic would likely be
        # a (slow) per-ts_code rolling apply of a Hurst estimator on flow_col.
        from hurst import compute_Hc  # noqa: F401

        df[factor_name] = np.nan  # Placeholder
    except ImportError:
        print("Error: 'hurst' library not installed. Cannot calculate factor.")
        df[factor_name] = np.nan
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    print(f"Finished {factor_name} (Placeholder).")
|
||
|
||
|
||
def vol_wgt_hist_pos(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 19: position in the historical price range weighted by volume.

    * historical position = ``(close - his_low) / (his_high - his_low)``,
      computed with ``_safe_divide`` and clipped to ``[0, 1]``;
    * relative volume = ``vol`` divided by its per-``ts_code`` rolling mean
      over ``N`` rows (at least ``max(1, N // 2)`` observations);
    * factor = historical position * relative volume.

    WARNING: modifies ``df`` in place and returns ``None``. Any failure is
    printed and the factor column is filled with NaN instead of raising.

    Args:
        df: Frame with ``ts_code``, ``close``, ``his_low``, ``his_high``, ``vol``.
        N: Rolling window length for the volume mean.
        factor_name: Output column; defaults to ``vol_wgt_hist_pos_{N}``.
    """
    factor_name = f"vol_wgt_hist_pos_{N}" if factor_name is None else factor_name
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_hist_pos", "_rolling_mean_vol", "_vol_rel_strength")
    try:
        above_low = df["close"] - df["his_low"]
        full_range = df["his_high"] - df["his_low"]
        df["_hist_pos"] = _safe_divide(above_low, full_range).clip(0, 1)

        vol_windows = df.groupby("ts_code")["vol"].rolling(
            N, min_periods=max(1, N // 2)
        )
        df["_rolling_mean_vol"] = vol_windows.mean().reset_index(level=0, drop=True)

        df["_vol_rel_strength"] = _safe_divide(df["vol"], df["_rolling_mean_vol"])
        df[factor_name] = df["_hist_pos"] * df["_vol_rel_strength"]
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [name for name in scratch_cols if name in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def vol_adj_roc(df: pd.DataFrame, N: int = 20, factor_name: str = None):
    """Factor 20: N-row rate of change divided by N-row return volatility.

    Per ``ts_code``:

    * rate of change = ``close / close N rows earlier - 1``;
    * volatility = rolling standard deviation of ``pct_chg`` over ``N`` rows
      (at least ``max(2, N // 2)`` observations, NaN replaced by 0);
    * factor = rate of change / volatility via ``_safe_divide`` (0 when the
      volatility is zero or unavailable).

    Rows without a usable close ``N`` rows back (the first ``N`` rows of each
    stock, or a missing / zero price) are given a neutral rate of change of 0.
    The previous implementation let ``_safe_divide`` return 0 for the price
    ratio there, which became a fake -100% move divided by volatility.

    WARNING: modifies ``df`` in place and returns ``None``. Any failure is
    printed and the factor column is filled with NaN instead of raising.

    Args:
        df: Frame with ``ts_code``, ``close`` and ``pct_chg``.
        N: Look-back / window length in rows.
        factor_name: Output column; defaults to ``vol_adj_roc_{N}``.
    """
    if factor_name is None:
        factor_name = f"vol_adj_roc_{N}"
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_roc_N", "_vol_N")
    try:
        earlier_close = df.groupby("ts_code")["close"].shift(N)
        # default_val=1.0 is the neutral price ratio: an undefined ratio then
        # maps to a rate of change of 0 rather than -1.
        df["_roc_N"] = _safe_divide(df["close"], earlier_close, default_val=1.0) - 1

        df["_vol_N"] = (
            df.groupby("ts_code")["pct_chg"]
            .rolling(N, min_periods=max(2, N // 2))
            .std()
            .reset_index(level=0, drop=True)
            .fillna(0)
        )

        df[factor_name] = _safe_divide(df["_roc_N"], df["_vol_N"])
    except Exception as e:
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan
    finally:
        leftovers = [name for name in scratch_cols if name in df.columns]
        if leftovers:
            df.drop(columns=leftovers, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def calculate_complex_factor(
    df: pd.DataFrame, factor_name: str = "complex_factor_deap_1"
):
    """Evaluate the composite expression ``sub(protected_div(A, B), C)``.

    The components are built from pre-computed factor columns of ``df``
    (every division goes through ``_safe_divide``):

    * ``D = pullback_strong_20_20 * (log_close / industry_return_5)
      - (vol_adj_roc_20 + vol_drop_profit_cnt_5) / (nonlinear_mv_volume - alpha_007)``
    * ``A = D * lg_buy_consolidation_20 + lg_buy_consolidation_20 + pullback_strong_20_20``
    * ``F = (net_mf_vol + std_return_5) * (arbr - industry_act_factor5)``
    * ``H = (industry_act_factor1 + low_cost_dev) + mv_weighted_turnover * act_factor4``
    * ``B = (F + vol + H) / lg_elg_buy_prop``
    * ``C = (intraday_lg_flow_corr_20 / lg_elg_buy_prop) / lg_elg_buy_prop``

    ``intraday_lg_flow_corr_20`` is optional; when the column is absent it is
    treated as all-NaN.

    WARNING: modifies ``df`` in place (adds ``factor_name``) and returns
    ``None``. If a required column is missing, or any other error occurs, a
    message is printed and the factor column is filled with NaN.

    Args:
        df: Frame containing all factor columns referenced above.
        factor_name: Name of the column to create.
    """
    print(f"开始计算因子: {factor_name} (原地修改)...")

    try:
        # Intermediate results stay in local Series, so no scratch columns are
        # written to df and no caller columns can be clobbered.

        # --- Component D ---
        d_ratio = _safe_divide(df["log_close"], df["industry_return_5"])
        d_left = df["pullback_strong_20_20"] * d_ratio
        d_denominator = df["nonlinear_mv_volume"] - df["alpha_007"]
        d_numerator = df["vol_adj_roc_20"] + df["vol_drop_profit_cnt_5"]
        component_d = d_left - _safe_divide(d_numerator, d_denominator)

        # --- Component A ---
        consolidation = df["lg_buy_consolidation_20"]
        component_a = (
            component_d * consolidation + consolidation + df["pullback_strong_20_20"]
        )

        # --- Component F ---
        f_left = df["net_mf_vol"] + df["std_return_5"]
        f_right = df["arbr"] - df["industry_act_factor5"]
        component_f = f_left * f_right

        # --- Component H ---
        h_left = df["industry_act_factor1"] + df["low_cost_dev"]
        h_right = df["mv_weighted_turnover"] * df["act_factor4"]
        component_h = h_left + h_right

        # --- Component B ---
        b_numerator = component_f + df["vol"] + component_h
        buy_prop = df["lg_elg_buy_prop"]
        component_b = _safe_divide(b_numerator, buy_prop)

        # --- Component C ---
        # Fall back to an index-aligned NaN Series (not a bare scalar) so the
        # result keeps df's index even when the optional column is missing.
        if "intraday_lg_flow_corr_20" in df.columns:
            intraday_corr = df["intraday_lg_flow_corr_20"]
        else:
            intraday_corr = pd.Series(np.nan, index=df.index)
        component_c = _safe_divide(_safe_divide(intraday_corr, buy_prop), buy_prop)

        # --- Final expression: A / B - C ---
        df[factor_name] = _safe_divide(component_a, component_b) - component_c

        print(f"因子 {factor_name} 计算成功。")

    except KeyError as e:
        # A required input column is missing.
        print(f"错误: 计算 {factor_name} 时缺少必需的列: {e}")
        print("请确保输入的 DataFrame 包含所有表达式中引用的因子列。")
        print(f"将为因子 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
    except Exception as e:
        # Any other computation error.
        print(f"错误: 计算 {factor_name} 时发生意外错误: {e}")
        print(f"将为因子 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
    finally:
        print(f"因子 {factor_name} 计算流程结束。")
    # Nothing is returned: df is modified in place.
|
||
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
|
||
# from scipy.stats import rankdata # rankdata is not needed if using pandas rank
|
||
# import statsmodels.api as sm # Needed for factor 19
|
||
|
||
# --- Constants ---
# Denominators whose absolute value is <= epsilon are treated as zero by
# _safe_divide; other functions in this module also add it to denominators.
epsilon = 1e-10


# --- Helper Functions ---
def _safe_divide(numerator, denominator, default_val=0):
    """Divide element-wise, substituting ``default_val`` for unusable results.

    ``default_val`` is returned wherever the denominator's absolute value is
    not greater than ``epsilon``, and wherever the quotient is NaN or
    infinite. Note that this includes NaN *inputs*: missing data comes back
    as ``default_val``, not NaN.

    Args:
        numerator: Series (or scalar) to divide. Non-numeric entries are
            coerced to NaN first.
        denominator: Series (or scalar) to divide by; coerced the same way.
        default_val: Replacement for undefined or non-finite results.

    Returns:
        pd.Series: The quotient, indexed like ``numerator`` when it is a
        Series, otherwise like ``denominator`` when that is a Series (so a
        scalar numerator no longer discards the denominator's index),
        otherwise a default-indexed Series.
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        # Coerce anything non-numeric to NaN before dividing.
        num = pd.to_numeric(numerator, errors="coerce")
        den = pd.to_numeric(denominator, errors="coerce")
        # Only keep the quotient where the denominator is safely non-zero.
        result = np.where(np.abs(den) > epsilon, num / den, default_val)
        # Normalise to a numeric array (e.g. after nullable-dtype inputs).
        result = pd.to_numeric(result, errors="coerce")
        # Remaining NaN / +-inf also collapse to the default.
        result = np.nan_to_num(
            result, nan=default_val, posinf=default_val, neginf=default_val
        )

    if isinstance(numerator, pd.Series):
        return pd.Series(result, index=numerator.index)
    if isinstance(denominator, pd.Series):
        # Scalar numerator: the result must line up with the denominator,
        # otherwise later index-aligned arithmetic silently mismatches.
        return pd.Series(result, index=denominator.index)
    return pd.Series(result)
|
||
|
||
|
||
# --- Cross-Sectional Factor Calculation Functions (In-Place Modification) ---
|
||
|
||
|
||
# Category 1: Cross-Sectional Flow & Behavior Strength
|
||
def cs_rank_net_lg_flow_val(
    df: pd.DataFrame, factor_name: str = "cs_rank_net_lg_flow_val"
):
    """Factor 1: cross-sectional rank of net large-order flow value.

    Net large-order value is
    ``(buy_lg_vol + buy_elg_vol - sell_lg_vol - sell_elg_vol) * close``. The
    factor is its percentile rank (``rank(pct=True)``) within each
    ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``close`` and the four large /
            extra-large buy and sell volume columns.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_net_lg_flow_val",)
    try:
        net_volume = (
            df["buy_lg_vol"] + df["buy_elg_vol"] - df["sell_lg_vol"] - df["sell_elg_vol"]
        )
        df["_net_lg_flow_val"] = net_volume * df["close"]
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_net_lg_flow_val"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_flow_divergence(
    df: pd.DataFrame, factor_name: str = "cs_rank_flow_divergence"
):
    """Factor 2: cross-sectional rank of large-vs-small order flow divergence.

    Divergence is the large-order net flow share
    ``(buy_lg_vol + buy_elg_vol - sell_lg_vol - sell_elg_vol) / vol`` minus
    the small-order net flow share ``(buy_sm_vol - sell_sm_vol) / vol``, both
    computed with ``_safe_divide``. The factor is its percentile rank within
    each ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``vol`` and the large, extra-large and
            small buy / sell volume columns.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_lg_ratio", "_sm_ratio", "_divergence")
    try:
        net_large = (
            df["buy_lg_vol"] + df["buy_elg_vol"] - df["sell_lg_vol"] - df["sell_elg_vol"]
        )
        df["_lg_ratio"] = _safe_divide(net_large, df["vol"])

        net_small = df["buy_sm_vol"] - df["sell_sm_vol"]
        df["_sm_ratio"] = _safe_divide(net_small, df["vol"])

        df["_divergence"] = df["_lg_ratio"] - df["_sm_ratio"]
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_divergence"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_industry_adj_lg_flow(
    df: pd.DataFrame, factor_name: str = "cs_rank_ind_adj_lg_flow"
):
    """Factor 3: rank of large-order flow relative to the industry average.

    Net large-order value
    ``(buy_lg_vol + buy_elg_vol - sell_lg_vol - sell_elg_vol) * close`` is
    demeaned by its ``(trade_date, cat_l2_code)`` group mean; the factor is
    the percentile rank of that deviation within each ``trade_date`` (the
    ranking itself is across all industries of the day).

    WARNING: modifies ``df`` in place and returns ``None``. When
    ``cat_l2_code`` is absent the factor is filled with NaN and the function
    returns early. A missing column or any other error is printed and the
    factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``cat_l2_code``, ``close`` and the four
            large / extra-large buy and sell volume columns.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_net_lg_flow_vol", "_industry_avg_flow", "_deviation")

    # Guard clause: industry labels are mandatory for the demeaning step.
    if "cat_l2_code" not in df.columns:
        print(
            f"Error calculating {factor_name}: Missing 'cat_l2_code' column. Assigning NaN."
        )
        df[factor_name] = np.nan
        return

    try:
        net_volume = (
            df["buy_lg_vol"] + df["buy_elg_vol"] - df["sell_lg_vol"] - df["sell_elg_vol"]
        )
        # Despite the "_vol" suffix this is a value: volume times close.
        df["_net_lg_flow_vol"] = net_volume * df["close"]

        per_day_industry = df.groupby(["trade_date", "cat_l2_code"])
        df["_industry_avg_flow"] = per_day_industry["_net_lg_flow_vol"].transform("mean")

        df["_deviation"] = df["_net_lg_flow_vol"] - df["_industry_avg_flow"]
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_deviation"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_elg_buy_ratio(df: pd.DataFrame, factor_name: str = "cs_rank_elg_buy_ratio"):
    """Factor 4: cross-sectional rank of the extra-large-order buy share.

    The share is ``buy_elg_vol / vol`` via ``_safe_divide``; the factor is
    its percentile rank within each ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``buy_elg_vol`` and ``vol``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_elg_buy_ratio",)
    try:
        df["_elg_buy_ratio"] = _safe_divide(df["buy_elg_vol"], df["vol"])
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_elg_buy_ratio"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
# Category 2: Cross-Sectional Cost Basis & PnL Status
|
||
def cs_rank_rel_profit_margin(
    df: pd.DataFrame, factor_name: str = "cs_rank_rel_profit_margin"
):
    """Factor 5: cross-sectional rank of the relative profit margin.

    The margin is ``(close - weight_avg) / close`` via ``_safe_divide``; the
    factor is its percentile rank within each ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``close`` and ``weight_avg``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_profit_margin",)
    try:
        gain_over_cost = df["close"] - df["weight_avg"]
        df["_profit_margin"] = _safe_divide(gain_over_cost, df["close"])
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_profit_margin"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_cost_breadth(df: pd.DataFrame, factor_name: str = "cs_rank_cost_breadth"):
    """Factor 6: cross-sectional rank of the cost-distribution width.

    The width is ``(cost_85pct - cost_15pct) / weight_avg`` via
    ``_safe_divide``; the factor is its percentile rank within each
    ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``cost_85pct``, ``cost_15pct`` and
            ``weight_avg``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_cost_breadth",)
    try:
        band_width = df["cost_85pct"] - df["cost_15pct"]
        df["_cost_breadth"] = _safe_divide(band_width, df["weight_avg"])
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_cost_breadth"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_dist_to_upper_cost(
    df: pd.DataFrame, factor_name: str = "cs_rank_dist_to_upper_cost"
):
    """Factor 7: cross-sectional rank of price relative to the upper cost level.

    The measure is the ratio ``close / cost_95pct`` via ``_safe_divide``; the
    factor is its percentile rank within each ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``close`` and ``cost_95pct``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_dist_to_95",)
    try:
        df["_dist_to_95"] = _safe_divide(df["close"], df["cost_95pct"])
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_dist_to_95"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_winner_rate(df: pd.DataFrame, factor_name: str = "cs_rank_winner_rate"):
    """Factor 8: cross-sectional rank of the winner rate.

    The factor is the percentile rank of ``winner_rate`` within each
    ``trade_date``. No scratch columns are used.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date`` and ``winner_rate``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    try:
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["winner_rate"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
# Category 3: Cross-Sectional Price Action & Volatility
|
||
def cs_rank_intraday_range(
    df: pd.DataFrame, factor_name: str = "cs_rank_intraday_range"
):
    """Factor 9: cross-sectional rank of the relative intraday range.

    The range is ``(high - low) / close`` via ``_safe_divide``; the factor is
    its percentile rank within each ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``high``, ``low`` and ``close``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_norm_range",)
    try:
        day_span = df["high"] - df["low"]
        df["_norm_range"] = _safe_divide(day_span, df["close"])
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_norm_range"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_close_pos_in_range(
    df: pd.DataFrame, factor_name: str = "cs_rank_close_pos_in_range"
):
    """Factor 10: cross-sectional rank of the close's position in the day range.

    The position is ``(close - low) / (high - low)`` via ``_safe_divide``
    with a default of 0.5, so a flat day (``high == low``) counts as
    mid-range. The factor is its percentile rank within each ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``high``, ``low`` and ``close``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_close_pos",)
    try:
        above_low = df["close"] - df["low"]
        day_span = df["high"] - df["low"]
        # 0.5 is the neutral position used when the range is degenerate.
        df["_close_pos"] = _safe_divide(above_low, day_span, default_val=0.5)
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_close_pos"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_opening_gap(df: pd.DataFrame, factor_name: str = "cs_rank_opening_gap"):
    """Factor 11: cross-sectional rank of the relative opening gap.

    The gap is ``open / pre_close - 1``; the factor is its percentile rank
    within each ``trade_date``.

    Rows whose ratio is undefined (missing or zero ``pre_close``, missing
    ``open``) get a neutral gap of 0, in line with the neutral default used
    by ``cs_rank_close_pos_in_range``. The previous implementation let
    ``_safe_divide`` return 0 for the ratio, which became a fake -100% gap
    and pushed those rows to the bottom of the daily ranking.

    WARNING: modifies ``df`` in place and returns ``None``. When ``pre_close``
    is absent the factor is filled with NaN and the function returns early.
    A missing column or any other error is printed and the factor column is
    filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``open`` and ``pre_close``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_gap",)

    # Guard clause: without the previous close no gap can be defined.
    if "pre_close" not in df.columns:
        print(
            f"Error calculating {factor_name}: Missing 'pre_close' column. Assigning NaN."
        )
        df[factor_name] = np.nan
        return

    try:
        # default_val=1.0 is the neutral price ratio, i.e. "no gap".
        df["_gap"] = _safe_divide(df["open"], df["pre_close"], default_val=1.0) - 1
        df[factor_name] = df.groupby("trade_date")["_gap"].rank(pct=True)
    except KeyError as e:
        print(
            f"Error calculating {factor_name}: Missing column {e} (likely 'open'). Assigning NaN."
        )
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_pos_in_hist_range(
    df: pd.DataFrame, factor_name: str = "cs_rank_pos_in_hist_range"
):
    """Factor 12: cross-sectional rank of the position in the historical range.

    The position is ``(close - his_low) / (his_high - his_low)`` via
    ``_safe_divide``, clipped to ``[0, 1]``; the factor is its percentile
    rank within each ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``close``, ``his_low`` and ``his_high``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_hist_pos",)
    try:
        above_low = df["close"] - df["his_low"]
        full_range = df["his_high"] - df["his_low"]
        raw_position = _safe_divide(above_low, full_range)
        # Keep the position inside the 0-1 band.
        df["_hist_pos"] = raw_position.clip(0, 1)
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_hist_pos"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
# Category 4: Cross-Sectional Interaction & Composite Indicators
|
||
def cs_rank_vol_x_profit_margin(
    df: pd.DataFrame, factor_name: str = "cs_rank_vol_x_profit_margin"
):
    """Factor 13: cross-sectional rank of daily move size times profit margin.

    The interaction is ``|pct_chg| * (close - weight_avg) / close`` (the
    division goes through ``_safe_divide``); the factor is its percentile
    rank within each ``trade_date``.

    WARNING: modifies ``df`` in place and returns ``None``. A missing column
    or any other error is printed and the factor column is filled with NaN.

    Args:
        df: Frame with ``trade_date``, ``pct_chg``, ``close`` and ``weight_avg``.
        factor_name: Name of the column to create.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ("_daily_vol", "_profit_margin", "_interaction")
    try:
        # Absolute daily move serves as the one-day volatility proxy.
        df["_daily_vol"] = df["pct_chg"].abs()
        gain_over_cost = df["close"] - df["weight_avg"]
        df["_profit_margin"] = _safe_divide(gain_over_cost, df["close"])
        df["_interaction"] = df["_daily_vol"] * df["_profit_margin"]
        per_day = df.groupby("trade_date")
        df[factor_name] = per_day["_interaction"].rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        stale = [name for name in scratch_cols if name in df.columns]
        if stale:
            df.drop(columns=stale, inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_lg_flow_price_concordance(
    df: pd.DataFrame, factor_name: str = "cs_rank_lg_flow_price_concordance"
):
    """Factor 14: cross-sectional rank of large-order flow x price move (in-place).

    Net large flow is ``buy_lg_vol + buy_elg_vol - sell_lg_vol - sell_elg_vol``;
    it is multiplied by ``pct_chg`` and the product is percentile-ranked within
    each ``trade_date``.

    Args:
        df: Frame holding the four flow columns, ``pct_chg`` and
            ``trade_date``. It is modified in place.
        factor_name: Name of the output column.

    Returns:
        None. On any failure the output column is filled with NaN and the
        error is printed rather than raised.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ["_net_lg_flow_vol", "_concordance"]
    try:
        bought = df["buy_lg_vol"] + df["buy_elg_vol"]
        df["_net_lg_flow_vol"] = bought - df["sell_lg_vol"] - df["sell_elg_vol"]
        df["_concordance"] = df["_net_lg_flow_vol"] * df["pct_chg"]
        per_day = df.groupby("trade_date")["_concordance"]
        df[factor_name] = per_day.rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        # Scratch columns must never survive, whether or not the ranking worked.
        df.drop(columns=scratch_cols, errors="ignore", inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_turnover_per_winner(
    df: pd.DataFrame, factor_name: str = "cs_rank_turnover_per_winner"
):
    """Factor 15: cross-sectional rank of turnover per winner share (in-place).

    The raw signal is ``turnover_rate / winner_rate`` (via ``_safe_divide``),
    percentile-ranked within each ``trade_date``.

    Args:
        df: Frame holding ``turnover_rate``, ``winner_rate`` and
            ``trade_date``. It is modified in place.
        factor_name: Name of the output column.

    Returns:
        None. On any failure the output column is filled with NaN and the
        error is printed rather than raised.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ["_turnover_per_winner"]
    try:
        turnover = df["turnover_rate"]
        winners = df["winner_rate"]
        df["_turnover_per_winner"] = _safe_divide(turnover, winners)
        per_day = df.groupby("trade_date")["_turnover_per_winner"]
        df[factor_name] = per_day.rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        # Scratch columns must never survive, whether or not the ranking worked.
        df.drop(columns=scratch_cols, errors="ignore", inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_ind_cap_neutral_pe(
    df: pd.DataFrame, factor_name: str = "cs_rank_ind_cap_neutral_pe"
):
    """Factor 16: industry/size-neutralised PE rank -- placeholder (in-place).

    No regression is performed here; the output column is simply created and
    filled with NaN so downstream code sees a stable schema.

    Args:
        df: Frame to receive the column. It is modified in place.
        factor_name: Name of the output column.

    Returns:
        None.
    """
    start_msg = f"Calculating {factor_name} (Placeholder - requires statsmodels)..."
    done_msg = f"Finished {factor_name} (Placeholder)."
    print(start_msg)
    df[factor_name] = np.nan
    print(done_msg)
|
||
|
||
|
||
def cs_rank_volume_ratio(df: pd.DataFrame, factor_name: str = "cs_rank_volume_ratio"):
    """Factor 17: cross-sectional rank of ``volume_ratio`` (in-place).

    Args:
        df: Frame holding a precomputed ``volume_ratio`` column and
            ``trade_date``. It is modified in place.
        factor_name: Name of the output column.

    Returns:
        None. On any failure the output column is filled with NaN and the
        error is printed rather than raised.
    """
    print(f"Calculating {factor_name}...")
    try:
        # The ratio itself is expected to exist already; only the per-day
        # percentile rank is computed here.
        per_day = df.groupby("trade_date")["volume_ratio"]
        df[factor_name] = per_day.rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_elg_buy_sell_sm_ratio(
    df: pd.DataFrame, factor_name: str = "cs_rank_elg_buy_sell_sm_ratio"
):
    """Factor 18: cross-sectional rank of extra-large buys over small sells (in-place).

    The raw signal is ``buy_elg_vol / sell_sm_vol`` (via ``_safe_divide``),
    percentile-ranked within each ``trade_date``.

    Args:
        df: Frame holding ``buy_elg_vol``, ``sell_sm_vol`` and ``trade_date``.
            It is modified in place.
        factor_name: Name of the output column.

    Returns:
        None. On any failure the output column is filled with NaN and the
        error is printed rather than raised.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ["_ratio"]
    try:
        elg_buys = df["buy_elg_vol"]
        small_sells = df["sell_sm_vol"]
        df["_ratio"] = _safe_divide(elg_buys, small_sells)
        per_day = df.groupby("trade_date")["_ratio"]
        df[factor_name] = per_day.rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        # Scratch columns must never survive, whether or not the ranking worked.
        df.drop(columns=scratch_cols, errors="ignore", inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_cost_dist_vol_ratio(
    df: pd.DataFrame, factor_name: str = "cs_rank_cost_dist_vol_ratio"
):
    """Factor 19: cross-sectional rank of cost distance x volume ratio (in-place).

    The raw signal is ``abs(close - weight_avg) / (close + epsilon)`` times
    ``volume_ratio``, percentile-ranked within each ``trade_date``.

    NOTE(review): ``epsilon`` is not defined inside this function; it is
    presumably a module-level constant -- confirm it exists, otherwise the
    generic ``except`` branch below silently turns the factor into NaN.

    Args:
        df: Frame holding ``close``, ``weight_avg``, ``volume_ratio`` and
            ``trade_date``. It is modified in place.
        factor_name: Name of the output column.

    Returns:
        None. If ``volume_ratio`` is absent the column is filled with NaN and
        the function returns immediately; any other failure is printed and
        also results in a NaN column.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ["_dist", "_interaction"]

    # Guard clause: without the precomputed ratio there is nothing to rank.
    if "volume_ratio" not in df.columns:
        print(
            f"Error calculating {factor_name}: Missing 'volume_ratio' column. Assigning NaN."
        )
        df[factor_name] = np.nan
        return

    try:
        cost_gap = abs(df["close"] - df["weight_avg"])
        df["_dist"] = cost_gap / (df["close"] + epsilon)
        df["_interaction"] = df["_dist"] * df["volume_ratio"]
        per_day = df.groupby("trade_date")["_interaction"]
        df[factor_name] = per_day.rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        # Scratch columns must never survive, whether or not the ranking worked.
        df.drop(columns=scratch_cols, errors="ignore", inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def cs_rank_size(df: pd.DataFrame, factor_name: str = "cs_rank_size"):
    """Factor 20: cross-sectional rank of log market cap (in-place).

    ``np.log1p(circ_mv)`` is percentile-ranked within each ``trade_date``.

    Args:
        df: Frame holding ``circ_mv`` and ``trade_date``. It is modified in
            place.
        factor_name: Name of the output column.

    Returns:
        None. On any failure the output column is filled with NaN and the
        error is printed rather than raised.
    """
    print(f"Calculating {factor_name}...")
    scratch_cols = ["_log_circ_mv"]
    try:
        # log1p rather than log so a zero market cap maps to 0 instead of -inf.
        df["_log_circ_mv"] = np.log1p(df["circ_mv"])
        per_day = df.groupby("trade_date")["_log_circ_mv"]
        df[factor_name] = per_day.rank(pct=True)
    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Assigning NaN.")
        df[factor_name] = np.nan
    except Exception as e:
        print(
            f"An unexpected error occurred calculating {factor_name}: {e}. Assigning NaN."
        )
        df[factor_name] = np.nan
    finally:
        # Scratch columns must never survive, whether or not the ranking worked.
        df.drop(columns=scratch_cols, errors="ignore", inplace=True)
        print(f"Finished {factor_name}.")
|
||
|
||
|
||
def add_financial_factor(
    main_df: pd.DataFrame,
    financial_df: pd.DataFrame,
    ts_code_col: str = "ts_code",
    trade_date_col: str = "trade_date",
    ann_date_col: str = "ann_date",  # announcement date
    f_ann_date_col: str = "f_ann_date",  # actual announcement date (preferred)
    factor_value_col: str = "undist_profit_ps",  # column holding the metric
    new_factor_col_name: str = "retained_profit_per_share",  # output column name
) -> pd.DataFrame:
    """Attach the latest announced financial metric to every trading row.

    For each row of ``main_df`` the most recent ``financial_df`` record of the
    same stock whose announcement date is on or before the trade date is
    looked up with ``pd.merge_asof(direction="backward")``.

    NOTE: a second function with the same name is defined further down in this
    file; at import time that later definition replaces this one.

    Args:
        main_df: Trading data; must contain ``ts_code_col`` and
            ``trade_date_col``. It is not modified.
        financial_df: Financial data; must contain ``ts_code_col``,
            ``factor_value_col`` and at least one of ``f_ann_date_col`` /
            ``ann_date_col``. It is not modified.
        ts_code_col: Stock-code column present in both frames.
        trade_date_col: Trade-date column of ``main_df``.
        ann_date_col: Fallback announcement-date column of ``financial_df``.
        f_ann_date_col: Preferred announcement-date column of ``financial_df``.
        factor_value_col: Metric column of ``financial_df``
            (default ``'undist_profit_ps'``).
        new_factor_col_name: Name given to the metric column in the result.

    Returns:
        A new frame with the rows of ``main_df`` ordered by
        ``[ts_code_col, trade_date_col]``, a fresh 0..n-1 index and the metric
        stored under ``new_factor_col_name``.

    Raises:
        ValueError: If a required column is missing from either frame.
    """
    # --- validation ---
    required_main_cols = [ts_code_col, trade_date_col]
    if not all(col in main_df.columns for col in required_main_cols):
        raise ValueError(f"主 DataFrame 必须包含列: {required_main_cols}")

    required_financial_cols = [ts_code_col, factor_value_col]
    if f_ann_date_col and f_ann_date_col in financial_df.columns:
        effective_date_col = f_ann_date_col
    elif ann_date_col and ann_date_col in financial_df.columns:
        effective_date_col = ann_date_col
    else:
        raise ValueError(
            f"财务指标 DataFrame 必须包含列 '{f_ann_date_col}' 或 '{ann_date_col}' 作为数据生效日期"
        )
    required_financial_cols.append(effective_date_col)

    if not all(col in financial_df.columns for col in required_financial_cols):
        raise ValueError(f"财务指标 DataFrame 必须包含列: {required_financial_cols}")

    # --- preparation (work on copies so neither input is mutated) ---
    main_df = main_df.copy()
    main_df[trade_date_col] = pd.to_datetime(main_df[trade_date_col])
    main_df[ts_code_col] = main_df[ts_code_col].astype(str)

    financial_data_subset = financial_df[
        [ts_code_col, effective_date_col, factor_value_col]
    ].copy()
    financial_data_subset[effective_date_col] = pd.to_datetime(
        financial_data_subset[effective_date_col]
    )
    financial_data_subset[ts_code_col] = financial_data_subset[ts_code_col].astype(str)
    # merge_asof rejects null merge keys on the right side, and a record
    # without an announcement date can never become effective anyway.
    financial_data_subset = financial_data_subset.dropna(subset=[effective_date_col])

    # merge_asof requires the *date* key to be globally sorted in both frames;
    # grouping per stock is handled by ``by=``. Sorting by stock first breaks
    # this as soon as more than one stock is present.
    main_df = main_df.sort_values(by=[trade_date_col, ts_code_col])
    financial_data_subset = financial_data_subset.sort_values(
        by=[effective_date_col, ts_code_col]
    )

    # --- as-of merge: latest record with effective date <= trade date ---
    df_with_factor = pd.merge_asof(
        main_df,
        financial_data_subset,
        left_on=trade_date_col,
        right_on=effective_date_col,
        by=ts_code_col,
        direction="backward",
    )

    # Hand rows back grouped per stock in chronological order.
    df_with_factor = df_with_factor.sort_values(
        by=[ts_code_col, trade_date_col]
    ).reset_index(drop=True)

    # The right-hand date column is only a matching key; drop it unless it is
    # the trade-date column itself.
    if (
        effective_date_col in df_with_factor.columns
        and effective_date_col != trade_date_col
    ):
        df_with_factor = df_with_factor.drop(columns=[effective_date_col])

    # Expose the metric under the requested name.
    if factor_value_col != new_factor_col_name:
        if factor_value_col in df_with_factor.columns:
            df_with_factor = df_with_factor.rename(
                columns={factor_value_col: new_factor_col_name}
            )
        else:
            print(
                f"警告: 合并后未找到列 '{factor_value_col}',无法重命名为 '{new_factor_col_name}'。"
            )

    return df_with_factor
|
||
|
||
|
||
# --- ARBR 因子计算函数 ---
|
||
|
||
|
||
def calculate_arbr(df: pd.DataFrame, N: int = 26):
    """Compute the AR and BR indicators per stock and add them to ``df`` in place.

    Over a rolling window of ``N`` rows per ``ts_code``:

    * ``AR = 100 * sum(high - open) / sum(open - low)``
    * ``BR = 100 * sum(max(0, high - prev_close)) / sum(max(0, prev_close - low))``
    * ``AR_BR = AR - BR``

    where ``prev_close`` is the previous row's close of the same stock. A full
    window of ``N`` rows is required, so earlier rows are NaN.

    Args:
        df: Frame with ``ts_code``, ``open``, ``high``, ``low`` and ``close``.
            Rows are assumed to be ordered by ``ts_code`` and ``trade_date``;
            no sorting is done here. It is modified in place.
        N: Rolling window length.

    Returns:
        None. On failure the error is printed and any of ``AR``, ``BR`` and
        ``AR_BR`` not yet present are created as NaN columns.
    """
    ar_col_name = "AR"
    br_col_name = "BR"
    diff_col_name = f"{ar_col_name}_{br_col_name}"
    print(f"开始计算因子: {ar_col_name}, {br_col_name} (原地修改)...")

    # Registered up front so a failure half-way through still cleans up
    # whichever intermediate columns were already written.
    _temp_cols = [
        "_h_minus_o",
        "_o_minus_l",
        "_prev_close",
        "_h_minus_pc_pos",
        "_pc_minus_l_pos",
    ]

    def _fill_missing_outputs():
        # Keep the output schema stable on every error path.
        for col in (ar_col_name, br_col_name, diff_col_name):
            if col not in df.columns:
                df[col] = np.nan

    try:
        # 1. Per-row ingredients.
        df["_h_minus_o"] = df["high"] - df["open"]
        df["_o_minus_l"] = df["open"] - df["low"]
        df["_prev_close"] = df.groupby("ts_code")["close"].shift(1)
        # BR only counts moves beyond the previous close, floored at zero.
        df["_h_minus_pc_pos"] = np.maximum(0, df["high"] - df["_prev_close"])
        df["_pc_minus_l_pos"] = np.maximum(0, df["_prev_close"] - df["low"])

        # 2. Rolling sums per stock; a full window is required.
        min_p = N
        grouped = df.groupby("ts_code")

        def _rolling_sum(col):
            # Drop the group level so the result aligns with df's own index.
            return (
                grouped[col]
                .rolling(N, min_periods=min_p)
                .sum()
                .reset_index(level=0, drop=True)
            )

        sum_h_minus_o = _rolling_sum("_h_minus_o")
        sum_o_minus_l = _rolling_sum("_o_minus_l")
        sum_h_minus_pc_pos = _rolling_sum("_h_minus_pc_pos")
        sum_pc_minus_l_pos = _rolling_sum("_pc_minus_l_pos")

        # 3. Ratios, scaled by 100.
        df[ar_col_name] = (
            _safe_divide(sum_h_minus_o, sum_o_minus_l, default_val=np.nan) * 100
        )
        df[br_col_name] = (
            _safe_divide(sum_h_minus_pc_pos, sum_pc_minus_l_pos, default_val=np.nan)
            * 100
        )
        df[diff_col_name] = df[ar_col_name] - df[br_col_name]

        print(f"因子 {ar_col_name}, {br_col_name} 计算成功。")

    except KeyError as e:
        print(f"错误: 计算 ARBR 时缺少必需的列: {e}")
        print(f"将为因子 {ar_col_name}, {br_col_name} 填充 NaN。")
        _fill_missing_outputs()
    except Exception as e:
        print(f"错误: 计算 ARBR 时发生意外错误: {e}")
        print(f"将为因子 {ar_col_name}, {br_col_name} 填充 NaN。")
        _fill_missing_outputs()
    finally:
        # 4. Remove intermediate columns.
        cols_to_drop = [col for col in _temp_cols if col in df.columns]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
        print(f"因子 {ar_col_name}, {br_col_name} 计算流程结束。")
|
||
|
||
|
||
def add_financial_factor(
    main_df: pd.DataFrame,
    financial_df: pd.DataFrame,
    factor_value_col: str,  # column holding the financial metric
    ts_code_col: str = "ts_code",
    trade_date_col: str = "trade_date",
    ann_date_col: str = "ann_date",  # announcement date
    f_ann_date_col: str = "f_ann_date",  # actual announcement date (preferred)
) -> pd.DataFrame:
    """Attach the latest announced value of ``factor_value_col`` to each trading row.

    For each row of ``main_df`` the most recent ``financial_df`` record of the
    same stock whose announcement date is on or before the trade date is
    looked up with ``pd.merge_asof(direction="backward")``. The metric keeps
    its original column name in the result.

    Args:
        main_df: Trading data; must contain ``ts_code_col`` and
            ``trade_date_col``. It is not modified.
        financial_df: Financial data; must contain ``ts_code_col``,
            ``factor_value_col`` and at least one of ``f_ann_date_col`` /
            ``ann_date_col``. It is not modified.
        factor_value_col: Metric column to bring over (required, no default).
        ts_code_col: Stock-code column present in both frames.
        trade_date_col: Trade-date column of ``main_df``.
        ann_date_col: Fallback announcement-date column of ``financial_df``.
        f_ann_date_col: Preferred announcement-date column of ``financial_df``.

    Returns:
        ``main_df`` itself, untouched, when it already has a column named
        ``factor_value_col``. Otherwise a new frame ordered by
        ``[trade_date_col, ts_code_col]`` with the metric column added. Rows
        whose trade date cannot be parsed are kept at the end with a NaN
        metric. If no usable financial rows remain, or the merge itself
        fails, the metric column is all NaN.

    Raises:
        ValueError: If a required column is missing from either frame.
    """
    # Nothing to do when the caller already has this metric.
    if factor_value_col in main_df.columns:
        return main_df

    # --- validation ---
    required_main_cols = [ts_code_col, trade_date_col]
    if not all(col in main_df.columns for col in required_main_cols):
        raise ValueError(f"主 DataFrame 必须包含列: {required_main_cols}")

    required_financial_cols = [ts_code_col, factor_value_col]
    if f_ann_date_col and f_ann_date_col in financial_df.columns:
        effective_date_col = f_ann_date_col
        print(f"使用 '{f_ann_date_col}' 作为财务数据生效日期。")
    elif ann_date_col and ann_date_col in financial_df.columns:
        effective_date_col = ann_date_col
        print(f"使用 '{ann_date_col}' 作为财务数据生效日期。")
    else:
        raise ValueError(
            f"财务指标 DataFrame 必须包含列 '{f_ann_date_col}' 或 '{ann_date_col}' 作为数据生效日期"
        )
    required_financial_cols.append(effective_date_col)

    if not all(col in financial_df.columns for col in required_financial_cols):
        raise ValueError(f"财务指标 DataFrame 必须包含列: {required_financial_cols}")

    # --- preparation (work on copies so neither input is mutated) ---
    main_df = main_df.copy()
    main_df[trade_date_col] = pd.to_datetime(main_df[trade_date_col], errors="coerce")
    main_df[ts_code_col] = main_df[ts_code_col].astype(str)

    financial_data_subset = financial_df[
        [ts_code_col, effective_date_col, factor_value_col]
    ].copy()
    financial_data_subset[effective_date_col] = pd.to_datetime(
        financial_data_subset[effective_date_col], errors="coerce"
    )
    financial_data_subset[ts_code_col] = financial_data_subset[ts_code_col].astype(str)

    # merge_asof rejects null merge keys, so unusable right-hand rows go first.
    initial_rows_financial = len(financial_data_subset)
    financial_data_subset = financial_data_subset.dropna(
        subset=[ts_code_col, effective_date_col]
    )
    rows_dropped = initial_rows_financial - len(financial_data_subset)
    if rows_dropped > 0:
        print(
            f"警告: 从 financial_data_subset 中移除了 {rows_dropped} 行,因为其 '{ts_code_col}' 或 '{effective_date_col}' 列存在空值。"
        )

    if financial_data_subset.empty:
        print(
            f"警告: 清理空值后 financial_data_subset 为空,无法添加因子 '{factor_value_col}'。将填充 NaN。"
        )
        main_df[factor_value_col] = np.nan
        return main_df

    # merge_asof needs the date key globally sorted on both sides; per-stock
    # grouping is handled by ``by=``.
    main_df = main_df.sort_values(by=[trade_date_col, ts_code_col])
    financial_data_subset = financial_data_subset.sort_values(
        by=[effective_date_col, ts_code_col]
    )

    # Unparseable trade dates were coerced to NaT above. merge_asof refuses
    # null keys on the left as well, so those rows are set aside and
    # re-attached with a NaN metric instead of failing the whole merge.
    has_date = main_df[trade_date_col].notna()
    undated_rows = main_df.loc[~has_date]
    if not undated_rows.empty:
        print(
            f"警告: main_df 中有 {len(undated_rows)} 行的 '{trade_date_col}' 无法解析为日期,其因子值将填充 NaN。"
        )
    if not has_date.any():
        main_df[factor_value_col] = np.nan
        return main_df

    # --- as-of merge: latest record with effective date <= trade date ---
    try:
        df_with_factor = pd.merge_asof(
            main_df.loc[has_date],
            financial_data_subset,
            left_on=trade_date_col,
            right_on=effective_date_col,
            by=ts_code_col,
            direction="backward",
        )
    except Exception as e:
        # Best effort by design: report and fall back to an all-NaN column.
        print(f"merge_asof 执行失败: {e}")
        main_df[factor_value_col] = np.nan
        return main_df

    # The right-hand date column is only a matching key; drop it unless it is
    # the trade-date column itself.
    if (
        effective_date_col in df_with_factor.columns
        and effective_date_col != trade_date_col
    ):
        df_with_factor = df_with_factor.drop(columns=[effective_date_col])

    # Guarantee the output column exists even if the merge did not carry it.
    if factor_value_col not in df_with_factor.columns:
        print(f"警告: 合并后未找到目标因子列 '{factor_value_col}'。填充 NaN。")
        df_with_factor[factor_value_col] = np.nan

    if not undated_rows.empty:
        undated_rows = undated_rows.assign(**{factor_value_col: np.nan})
        df_with_factor = pd.concat([df_with_factor, undated_rows], ignore_index=True)

    return df_with_factor
|
||
|
||
|
||
def calculate_cashflow_to_ev_factor(
    df: pd.DataFrame,
    cashflow_df: pd.DataFrame,
    balancesheet_df: pd.DataFrame,
    market_cap_col: str = "total_mv",
    date_col: str = "trade_date",
    ts_code_col: str = "ts_code",
) -> pd.DataFrame:
    """Compute operating cash flow divided by enterprise value.

    ``enterprise value = market_cap * 10000 + total_liab - money_cap`` and the
    factor is ``n_cashflow_act / enterprise value`` (via ``_safe_divide``).

    NOTE: ``n_cashflow_act`` is taken exactly as ``add_financial_factor``
    delivers it. If the cash-flow table holds single-period rather than
    trailing-twelve-month values, the factor is not a true TTM ratio.
    NOTE(review): the ``* 10000`` presumably converts a market cap quoted in
    units of 10,000 to the unit of the balance-sheet items -- confirm against
    the data source.

    Args:
        df: Market data with the market-cap, date and stock-code columns. It
            is not modified.
        cashflow_df: Cash-flow statement data providing ``n_cashflow_act``.
        balancesheet_df: Balance-sheet data providing ``total_liab`` and
            ``money_cap``.
        market_cap_col: Market-cap column of ``df``.
        date_col: Trade-date column of ``df``; forwarded to
            ``add_financial_factor``.
        ts_code_col: Stock-code column; forwarded to ``add_financial_factor``.

    Returns:
        A copy of ``df`` with a ``cashflow_to_ev_factor`` column. The column
        is all NaN when the market-cap column or any financial item is
        unavailable. The three financial helper columns are removed.
    """
    df_factor = df.copy()

    # 0. Without a market cap there is no enterprise value.
    if market_cap_col not in df_factor.columns:
        print(f"错误: DataFrame 中缺少市值列 '{market_cap_col}'。无法计算因子。")
        df_factor["cashflow_to_ev_factor"] = np.nan
        return df_factor

    # The caller's column names are forwarded so non-default names are
    # honoured instead of silently falling back to the defaults.
    key_cols = {"ts_code_col": ts_code_col, "trade_date_col": date_col}

    # 1. Operating cash flow, renamed to flag the TTM expectation.
    df_factor = add_financial_factor(
        df_factor, cashflow_df, "n_cashflow_act", **key_cols
    )
    cashflow_col_name = "n_cashflow_act"
    ttm_cashflow_col = "ttm_n_cashflow_act"
    if cashflow_col_name in df_factor.columns:
        df_factor = df_factor.rename(columns={cashflow_col_name: ttm_cashflow_col})
    else:
        print(f"错误: add_financial_factor 未能成功添加 '{cashflow_col_name}' 列。")
        df_factor["cashflow_to_ev_factor"] = np.nan
        return df_factor

    # 2. Total liabilities.
    df_factor = add_financial_factor(
        df_factor, balancesheet_df, "total_liab", **key_cols
    )
    liab_col_name = "total_liab"
    if liab_col_name not in df_factor.columns:
        print(f"错误: add_financial_factor 未能成功添加 '{liab_col_name}' 列。")
        df_factor["cashflow_to_ev_factor"] = np.nan
        return df_factor

    # 3. Cash and cash equivalents.
    df_factor = add_financial_factor(
        df_factor, balancesheet_df, "money_cap", **key_cols
    )
    money_col_name = "money_cap"
    if money_col_name not in df_factor.columns:
        print(f"错误: add_financial_factor 未能成功添加 '{money_col_name}' 列。")
        df_factor["cashflow_to_ev_factor"] = np.nan
        return df_factor

    # 4. Enterprise value; NaN in any input propagates to the result.
    enterprise_value = (
        df_factor[market_cap_col].astype(float) * 10000
        + df_factor[liab_col_name].astype(float)
        - df_factor[money_col_name].astype(float)
    )

    # 5. Final ratio.
    df_factor["cashflow_to_ev_factor"] = _safe_divide(
        df_factor[ttm_cashflow_col], enterprise_value
    )

    # 6. Remove the helper columns.
    cols_to_drop = [ttm_cashflow_col, liab_col_name, money_col_name]
    df_factor = df_factor.drop(
        columns=[col for col in cols_to_drop if col in df_factor.columns]
    )

    return df_factor
|
||
|
||
|
||
def caculate_book_to_price_ratio(
    df: pd.DataFrame, fina_indicator_df: pd.DataFrame
) -> pd.DataFrame:
    """Add ``book_to_price_ratio = bps / close`` and drop the ``bps`` column.

    Args:
        df: Frame with ``close``. If it has no ``bps`` column, one is fetched
            through ``add_financial_factor``.
        fina_indicator_df: Financial-indicator data used only when ``bps`` has
            to be fetched.

    Returns:
        A frame with ``book_to_price_ratio`` added and ``bps`` removed. When
        ``bps`` was already present, the ratio column is also written onto the
        input frame itself before the drop.
    """
    needs_bps = "bps" not in df.columns
    if needs_bps:
        df = add_financial_factor(df, fina_indicator_df, factor_value_col="bps")

    book_value = df["bps"]
    price = df["close"]
    df["book_to_price_ratio"] = book_value / price
    return df.drop(columns=["bps"])
|
||
|
||
|
||
def turnover_rate_n(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """Add the ``n``-row rolling mean of ``turnover_rate`` per stock (in-place).

    Args:
        df: Frame with ``ts_code`` and ``turnover_rate``. Rows are taken in
            their existing order within each stock.
        n: Rolling window length; the first ``n - 1`` rows of each stock are NaN.

    Returns:
        The same ``df`` object, with column ``turnover_rate_mean_{n}`` added.
    """
    per_stock = df.groupby("ts_code", group_keys=False)["turnover_rate"]
    windowed_mean = per_stock.rolling(n).mean()
    # Drop the outermost index level of the rolling result so the values
    # align with df on the remaining index when assigned.
    df[f"turnover_rate_mean_{n}"] = windowed_mean.reset_index(level=0, drop=True)
    return df
|
||
|
||
|
||
def variance_n(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """Add the ``n``-row rolling variance of ``pct_chg`` per stock (in-place).

    Args:
        df: Frame with ``ts_code`` and ``pct_chg``. Rows are taken in their
            existing order within each stock.
        n: Rolling window length; the first ``n - 1`` rows of each stock are NaN.

    Returns:
        The same ``df`` object, with column ``variance_{n}`` added.
    """
    per_stock = df.groupby("ts_code", group_keys=False)["pct_chg"]
    windowed_var = per_stock.rolling(n).var()
    # Drop the outermost index level of the rolling result so the values
    # align with df on the remaining index when assigned.
    df[f"variance_{n}"] = windowed_var.reset_index(level=0, drop=True)
    return df
|
||
|
||
|
||
def bbi_ratio_factor(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``bbi_ratio_factor``: BBI divided by close, per stock.

    BBI is the mean of the 3-, 6-, 12- and 24-row simple moving averages of
    ``close``; it is NaN until the longest window is full.

    Args:
        df: Frame with ``ts_code``, ``trade_date`` and ``close``.

    Returns:
        A frame sorted by ``ts_code`` and ``trade_date`` with the new column.
    """
    # Rolling windows only make sense on chronologically ordered rows.
    df_factor = df.sort_values(by=["ts_code", "trade_date"])
    close_prices = df_factor["close"]

    per_stock_close = df_factor.groupby("ts_code", group_keys=False)["close"]
    # One moving average per window; the group level is dropped so each
    # result aligns with df_factor's own index.
    averages = [
        per_stock_close.rolling(window).mean().reset_index(level=0, drop=True)
        for window in (3, 6, 12, 24)
    ]

    print("计算 BBI...")
    # A NaN in any of the four averages makes the sum, hence BBI, NaN.
    bbi = (averages[0] + averages[1] + averages[2] + averages[3]) / 4

    df_factor["bbi_ratio_factor"] = _safe_divide(bbi, close_prices)
    return df_factor
|
||
|
||
|
||
def limit_factor(df: pd.DataFrame) -> pd.DataFrame:
    """Add limit-up / limit-down flags, 10-row counts and the limit-up streak (in-place).

    Columns written:

    * ``cat_up_limit`` / ``cat_down_limit``: ``close`` equals ``up_limit`` /
      ``down_limit`` (boolean).
    * ``up_limit_count_10d`` / ``down_limit_count_10d``: number of flagged
      rows in the trailing 10 rows of the same stock (``min_periods=1``).
    * ``consecutive_up_limit``: length of the current run of limit-up rows,
      0 when the row is not limit-up.

    Args:
        df: Frame with ``ts_code``, ``close``, ``up_limit`` and ``down_limit``.
            Rows are taken in their existing order within each stock.

    Returns:
        The same ``df`` object with the columns above added.
    """
    per_stock = df.groupby("ts_code", group_keys=False)

    df["cat_up_limit"] = df["close"] == df["up_limit"]
    df["cat_down_limit"] = df["close"] == df["down_limit"]

    count_targets = (
        ("cat_up_limit", "up_limit_count_10d"),
        ("cat_down_limit", "down_limit_count_10d"),
    )
    for flag_col, count_col in count_targets:
        trailing = per_stock[flag_col].rolling(window=10, min_periods=1).sum()
        df[count_col] = trailing.reset_index(level=0, drop=True)

    def _streak_length(flags):
        """Return the 1-based position inside each run of True values, else 0."""
        # A new run starts wherever the flag differs from the previous row.
        run_id = (flags != flags.shift()).cumsum()
        position_in_run = flags.groupby(run_id).cumcount() + 1
        return flags * position_in_run

    df["consecutive_up_limit"] = per_stock["cat_up_limit"].apply(_streak_length)
    return df
|
||
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
|
||
# 假设 df 已经加载并包含 'ts_code', 'trade_date', 'pct_chg' 列
|
||
# 并且已经按照 'ts_code' 和 'trade_date' 进行了排序
|
||
|
||
|
||
def daily_momentum_benchmark(df):
    """Add market-wide daily up/down benchmarks derived from ``pct_chg``.

    For every ``trade_date`` two averages are computed across all rows of that
    date: the mean ``pct_chg`` of rows with ``pct_chg > 0`` and the mean of
    rows with ``pct_chg < 0``. Both are merged back onto every row of the
    date. A date with no rising (or no falling) rows gets 0 for the
    corresponding benchmark.

    Args:
        df (pd.DataFrame): Daily data with ``trade_date`` and ``pct_chg``.

    Returns:
        pd.DataFrame: A new frame (fresh 0..n-1 index, rows in input order)
        with ``daily_positive_benchmark`` and ``daily_negative_benchmark``
        added. If ``pct_chg`` is missing, the input is returned unchanged.
    """
    print("--- 计算日级别动量基准 (使用 pct_chg) ---")

    if "pct_chg" not in df.columns:
        print("错误: DataFrame中没有'pct_chg'列,无法计算日级别动量基准。")
        return df

    # One row per trade date: average move of rising rows and of falling rows.
    daily_benchmarks = (
        df.groupby("trade_date")["pct_chg"]
        .agg(
            daily_positive_benchmark=lambda x: x[x > 0].mean(),
            daily_negative_benchmark=lambda x: x[x < 0].mean(),
        )
        .reset_index()
    )

    df = pd.merge(df, daily_benchmarks, on="trade_date", how="left")

    # Assign the filled values back instead of calling fillna(inplace=True) on
    # a column selection: that is chained assignment, which does not update
    # the frame once pandas Copy-on-Write is active.
    benchmark_cols = ["daily_positive_benchmark", "daily_negative_benchmark"]
    df[benchmark_cols] = df[benchmark_cols].fillna(0)

    print("日级别动量基准计算完成 (使用 pct_chg)。")
    return df
|
||
|
||
|
||
def daily_deviation(df):
    """Add ``daily_deviation``: each row's ``pct_chg`` minus its same-sign market benchmark.

    The benchmarks are produced here by calling ``daily_momentum_benchmark``;
    they do not need to exist beforehand. The rule per row:

    * ``pct_chg > 0`` and positive benchmark ``> 0``:
      ``pct_chg - daily_positive_benchmark``
    * ``pct_chg < 0`` and negative benchmark ``< 0``:
      ``pct_chg - daily_negative_benchmark``
    * otherwise 0

    Args:
        df (pd.DataFrame): Daily data with ``trade_date`` and ``pct_chg``.

    Returns:
        pd.DataFrame: The frame returned by the benchmark step with
        ``daily_deviation`` added and the two benchmark columns removed. If
        the required columns are unavailable, it is returned without the new
        column after printing an error.
    """
    print("--- 计算日级别偏离度 (使用 pct_chg) ---")

    df = daily_momentum_benchmark(df)
    required_cols = ["pct_chg", "daily_positive_benchmark", "daily_negative_benchmark"]
    if any(col not in df.columns for col in required_cols):
        print(
            f"错误: 计算日级别偏离度需要以下列: {required_cols}。请先运行 daily_momentum_benchmark(df)。"
        )
        return df

    move = df["pct_chg"]
    up_ref = df["daily_positive_benchmark"]
    down_ref = df["daily_negative_benchmark"]

    rose_with_ref = (move > 0) & (up_ref > 0)
    fell_with_ref = (move < 0) & (down_ref < 0)
    df["daily_deviation"] = np.select(
        [rose_with_ref, fell_with_ref],
        [move - up_ref, move - down_ref],
        default=0,
    )

    df = df.drop(columns=["daily_positive_benchmark", "daily_negative_benchmark"])
    print("日级别偏离度计算完成 (使用 pct_chg)。")
    return df
|
||
|
||
|
||
def daily_industry_momentum_benchmark(df):
    """Add per-industry daily up/down benchmarks derived from ``pct_chg``.

    For every ``(trade_date, cat_l2_code)`` pair two averages are computed:
    the mean ``pct_chg`` of rows with ``pct_chg > 0`` and the mean of rows
    with ``pct_chg < 0``. Both are merged back onto every row of the pair. A
    pair with no rising (or no falling) rows gets 0 for the corresponding
    benchmark.

    Args:
        df (pd.DataFrame): Daily data with ``ts_code``, ``trade_date``,
            ``pct_chg`` and ``cat_l2_code``.

    Returns:
        pd.DataFrame: A new frame (fresh 0..n-1 index, rows in input order)
        with ``daily_industry_positive_benchmark`` and
        ``daily_industry_negative_benchmark`` added. If a required column is
        missing, the input is returned unchanged.
    """
    print("--- 计算日级别行业动量基准 (使用 pct_chg 和 cat_l2_code) ---")

    required_cols = ["pct_chg", "cat_l2_code", "trade_date", "ts_code"]
    if not all(col in df.columns for col in required_cols):
        print(f"错误: 计算日级别行业动量基准需要以下列: {required_cols}。")
        return df

    # One row per (date, industry): average move of rising and of falling rows.
    industry_daily_benchmarks = (
        df.groupby(["trade_date", "cat_l2_code"])["pct_chg"]
        .agg(
            daily_industry_positive_benchmark=lambda x: x[x > 0].mean(),
            daily_industry_negative_benchmark=lambda x: x[x < 0].mean(),
        )
        .reset_index()
    )

    df = pd.merge(
        df, industry_daily_benchmarks, on=["trade_date", "cat_l2_code"], how="left"
    )

    # Assign the filled values back instead of calling fillna(inplace=True) on
    # a column selection: that is chained assignment, which does not update
    # the frame once pandas Copy-on-Write is active.
    benchmark_cols = [
        "daily_industry_positive_benchmark",
        "daily_industry_negative_benchmark",
    ]
    df[benchmark_cols] = df[benchmark_cols].fillna(0)

    print("日级别行业动量基准计算完成 (使用 pct_chg 和 cat_l2_code)。")
    return df
|
||
|
||
|
||
def daily_industry_deviation(df):
    """Add ``daily_industry_deviation``: ``pct_chg`` minus its same-sign industry benchmark.

    The benchmarks are produced here by calling
    ``daily_industry_momentum_benchmark``; they do not need to exist
    beforehand. The rule per row:

    * ``pct_chg > 0`` and positive industry benchmark ``> 0``:
      ``pct_chg - daily_industry_positive_benchmark``
    * ``pct_chg < 0`` and negative industry benchmark ``< 0``:
      ``pct_chg - daily_industry_negative_benchmark``
    * otherwise 0

    Args:
        df (pd.DataFrame): Daily data with ``ts_code``, ``trade_date``,
            ``pct_chg`` and ``cat_l2_code``.

    Returns:
        pd.DataFrame: The frame returned by the benchmark step with
        ``daily_industry_deviation`` added and the two benchmark columns
        removed. If the required columns are unavailable, it is returned
        without the new column after printing an error.
    """
    print("--- 计算日级别行业偏离度 (使用 pct_chg 和行业基准) ---")

    df = daily_industry_momentum_benchmark(df)
    required_cols = [
        "pct_chg",
        "daily_industry_positive_benchmark",
        "daily_industry_negative_benchmark",
    ]
    if any(col not in df.columns for col in required_cols):
        print(
            f"错误: 计算日级别行业偏离度需要以下列: {required_cols}。请先运行 daily_industry_momentum_benchmark(df)。"
        )
        return df

    move = df["pct_chg"]
    up_ref = df["daily_industry_positive_benchmark"]
    down_ref = df["daily_industry_negative_benchmark"]

    rose_with_ref = (move > 0) & (up_ref > 0)
    fell_with_ref = (move < 0) & (down_ref < 0)
    df["daily_industry_deviation"] = np.select(
        [rose_with_ref, fell_with_ref],
        [move - up_ref, move - down_ref],
        default=0,
    )

    df = df.drop(
        columns=[
            "daily_industry_positive_benchmark",
            "daily_industry_negative_benchmark",
        ]
    )
    print("日级别行业偏离度计算完成 (使用 pct_chg 和行业基准)。")
    return df
|
||
|
||
|
||
def sentiment_panic_greed_index(
    df: pd.DataFrame,
    window_atr: int = 14,
    window_smooth: int = 5,
    factor_name: str = "senti_panic_greed",
):
    """
    Compute a panic/greed sentiment index and store it in ``df[factor_name]``.

    The raw signal is ``(true range / ATR) * sign(pct_chg) + 2 * opening gap``,
    smoothed with a rolling mean of ``window_smooth`` rows.

    WARNING: modifies ``df`` in place (only ``factor_name`` is added).

    NOTE(review): the series is not grouped by 'ts_code', so this assumes
    ``df`` holds a single instrument (e.g. an index) in date order -- confirm
    against callers.

    Args:
        df: Frame with 'open', 'high', 'low', 'close', 'vol' and 'pct_chg'.
        window_atr: ATR look-back period passed to TA-Lib.
        window_smooth: Window of the final rolling mean.
        factor_name: Name of the output column.

    Returns:
        The same ``df`` object. The factor column is all NaN when required
        OHLCV columns are missing or when the computation raises.
    """
    print(f"Calculating {factor_name}...")
    # Guards the ATR division; defined locally so the function does not rely
    # on a module-level name.
    epsilon = 1e-8

    if not all(col in df.columns for col in ["open", "high", "low", "close", "vol"]):
        print(f"Error: DataFrame 缺少必需的 OHLCV 列。将为 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
        # Return the frame on this path too, like every other exit.
        return df

    try:
        prev_close = df["close"].shift(1)

        # TA-Lib's TRANGE/ATR already look at the *previous* close internally,
        # so they must receive the unshifted close; passing the shifted series
        # would compare today's range with the close from two rows back.
        true_range = talib.TRANGE(df["high"], df["low"], df["close"])
        atr = talib.ATR(df["high"], df["low"], df["close"], timeperiod=window_atr)

        # Opening gap relative to the previous close (0 where undefined).
        gap = (df["open"] / prev_close - 1).fillna(0)

        # Range expansion signed by the day's direction, plus a doubled gap term.
        direction = np.sign(df["pct_chg"].fillna(0))
        raw_senti = (true_range / (atr + epsilon)) * direction + gap * 2
        df[factor_name] = raw_senti.rolling(window_smooth, min_periods=1).mean()

    except Exception as e:
        # Best-effort factor: report the problem and leave the column as NaN.
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan

    print(f"Finished {factor_name}.")
    return df
|
||
|
||
|
||
def sentiment_market_breadth_proxy(
    df: pd.DataFrame,
    window_vol: int = 20,
    window_smooth: int = 3,
    factor_name: str = "senti_breadth_proxy",
):
    """
    Compute a market-breadth sentiment proxy from price/volume agreement.

    The raw value is ``pct_chg * vol / rolling_mean(vol, window_vol)``, then
    smoothed with a rolling mean of ``window_smooth`` rows.

    WARNING: modifies ``df`` in place (only ``factor_name`` is added).

    NOTE(review): no grouping by 'ts_code' is applied, so this assumes a
    single time series in date order -- confirm against callers.

    Args:
        df: Frame with 'pct_chg' and 'vol' columns.
        window_vol: Window of the volume average; at least half of it
            (minimum 1 row) must be available.
        window_smooth: Window of the final rolling mean.
        factor_name: Name of the output column.

    Returns:
        The same ``df`` object. The factor column is all NaN when a required
        column is missing or when the computation raises.
    """
    print(f"Calculating {factor_name}...")
    # Guards the division by the average volume; kept local so the function
    # does not depend on a module-level name.
    epsilon = 1e-8

    if not all(col in df.columns for col in ["pct_chg", "vol"]):
        print(
            f"Error: DataFrame 缺少 'pct_chg' 或 'vol' 列。将为 {factor_name} 填充 NaN。"
        )
        df[factor_name] = np.nan
        # Return the frame on this path too, like every other exit.
        return df

    try:
        rolling_avg_vol = (
            df["vol"].rolling(window_vol, min_periods=max(1, window_vol // 2)).mean()
        )
        # Price/volume agreement: the move weighted by relative volume.
        raw_breadth = df["pct_chg"] * (df["vol"] / (rolling_avg_vol + epsilon))
        df[factor_name] = raw_breadth.rolling(window_smooth, min_periods=1).mean()

    except Exception as e:
        # Best-effort factor: report the problem and leave the column as NaN.
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan

    print(f"Finished {factor_name}.")
    return df
|
||
|
||
|
||
def sentiment_reversal_indicator(
    df: pd.DataFrame,
    window_ret: int = 5,
    window_vol: int = 5,
    factor_name: str = "senti_reversal",
):
    """
    Compute a short-term sentiment reversal factor.

    ``factor = -(close / close.shift(window_ret) - 1) * rolling_std(pct_chg)``
    where the standard deviation uses ``window_vol`` rows (at least half of
    them, minimum 1, must be available).

    WARNING: modifies ``df`` in place (only ``factor_name`` is added).

    NOTE(review): no grouping by 'ts_code' is applied, so this assumes a
    single time series in date order -- confirm against callers.

    Args:
        df: Frame with 'pct_chg' and 'close' columns.
        window_ret: Look-back (rows) of the cumulative return.
        window_vol: Window of the realised volatility.
        factor_name: Name of the output column.

    Returns:
        The same ``df`` object. The factor column is all NaN when 'pct_chg'
        is missing or when the computation raises (e.g. 'close' is missing).
    """
    print(f"Calculating {factor_name}...")

    if "pct_chg" not in df.columns:
        print(f"Error: DataFrame 缺少 'pct_chg' 列。将为 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
        # Return the frame on this path too, like every other exit.
        return df

    try:
        # Cumulative return over the last `window_ret` rows.
        period_return = (df["close"] / df["close"].shift(window_ret)) - 1

        # Realised volatility of the daily changes.
        realised_vol = (
            df["pct_chg"].rolling(window_vol, min_periods=max(1, window_vol // 2)).std()
        )

        # Negative sign: large, volatile recent gains score low (reversal view).
        df[factor_name] = -period_return * realised_vol

    except Exception as e:
        # Best-effort factor: report the problem and leave the column as NaN.
        print(f"Error calculating {factor_name}: {e}")
        df[factor_name] = np.nan

    print(f"Finished {factor_name}.")
    return df
|
||
|
||
|
||
def price_minus_deduction_price(df, n=10):
    """
    Factor 1 (quantitative): close minus the moving-average "deduction price".

    The deduction price is the oldest close inside the current n-period
    window, i.e. the close ``n - 1`` rows earlier within the same 'ts_code'
    (for SMA(10) at t the window is P_{t-9}..P_t, so the value about to drop
    out is P_{t-9}). A positive factor means the current close is above the
    price that is about to leave the window.

    Args:
        df (pd.DataFrame): Daily data with 'ts_code' and 'close' columns.
            Row order within each 'ts_code' is used as the time order.
        n (int): Moving-average period; must be greater than 1.

    Returns:
        pd.DataFrame: ``df`` with column ``price_minus_deduction_price_{n}``
        added. Without a 'close' column the frame is returned unchanged; for
        ``n <= 1`` the column is filled with NaN.
    """
    factor_col = f"price_minus_deduction_price_{n}"

    if "close" not in df.columns:
        print("错误: DataFrame中没有'close'列,无法计算抵扣价相关因子。")
        return df

    if n <= 1:
        print("错误: 均线周期 n 必须大于 1。")
        df[factor_col] = np.nan
        return df

    # Oldest close of the current window, looked up per stock.
    oldest_in_window = df.groupby("ts_code", group_keys=False)["close"].shift(n - 1)
    df[factor_col] = df["close"] - oldest_in_window

    print(f"因子 price_minus_deduction_price_{n} 计算完成。")
    return df
|
||
|
||
|
||
def price_deduction_price_diff_ratio_to_sma(df, n=10):
    """
    Factor 2 (quantitative): (close - deduction price) / n-period SMA.

    The deduction price is the close ``n - 1`` rows earlier within the same
    'ts_code' (the oldest value of the current SMA window). The ratio scales
    the distance between the current close and that price by the SMA level.

    Args:
        df (pd.DataFrame): Daily data with 'ts_code' and 'close' columns.
            Row order within each 'ts_code' is used as the time order.
        n (int): Moving-average period; must be greater than 1.

    Returns:
        pd.DataFrame: ``df`` with column
        ``price_deduction_price_diff_ratio_to_sma_{n}`` added. The value is
        NaN while the window is incomplete and wherever the SMA equals 0.
        Without a 'close' column the frame is returned unchanged; for
        ``n <= 1`` the column is filled with NaN.
    """
    factor_col = f"price_deduction_price_diff_ratio_to_sma_{n}"

    if "close" not in df.columns:
        print("错误: DataFrame中没有'close'列,无法计算抵扣价相关因子。")
        return df
    if n <= 1:
        print("错误: 均线周期 n 必须大于 1。")
        df[factor_col] = np.nan
        return df

    grouped_close = df.groupby("ts_code", group_keys=False)["close"]

    # transform keeps the result aligned with df's own index.
    sma = grouped_close.transform(lambda x: x.rolling(window=n).mean())
    deduction_price = grouped_close.shift(n - 1)
    diff = df["close"] - deduction_price

    # Mask zero denominators with NaN before dividing. This is plain pandas
    # arithmetic, so it does not depend on how pandas forwards `out=`/`where=`
    # to NumPy ufuncs, and it can never produce +/-inf.
    df[factor_col] = diff / sma.where(sma != 0)

    print(f"因子 price_deduction_price_diff_ratio_to_sma_{n} 计算完成。")
    return df
|
||
|
||
|
||
def cat_price_vs_sma_vs_deduction_price(df, n=10):
    """
    Factor 3 (categorical): position of close and deduction price vs. the SMA.

    The deduction price is the close ``n - 1`` rows earlier within the same
    'ts_code'; the SMA is the n-period rolling mean of close per 'ts_code'.

    Categories:
        0: none of the rules below matched. This covers missing data (SMA or
           deduction price is NaN) and the tie cases the rules leave out
           (close == SMA with deduction price <= SMA, and close < SMA with
           deduction price == SMA).
        1: close > SMA and deduction price > SMA.
        2: close < SMA and deduction price < SMA.
        3: close > SMA and deduction price <= SMA.
        4: close <= SMA and deduction price > SMA.

    Args:
        df (pd.DataFrame): Daily data with 'ts_code' and 'close' columns.
            Row order within each 'ts_code' is used as the time order.
        n (int): Moving-average period; must be greater than 1.

    Returns:
        pd.DataFrame: ``df`` with column
        ``cat_price_vs_sma_vs_deduction_price_{n}`` added. Without a 'close'
        column the frame is returned unchanged; for ``n <= 1`` the column is
        filled with NaN.
    """
    factor_col = f"cat_price_vs_sma_vs_deduction_price_{n}"

    if "close" not in df.columns:
        print("错误: DataFrame中没有'close'列,无法计算抵扣价相关因子。")
        return df
    if n <= 1:
        print("错误: 均线周期 n 必须大于 1。")
        df[factor_col] = np.nan
        return df

    per_stock_close = df.groupby("ts_code", group_keys=False)["close"]
    sma = per_stock_close.transform(lambda x: x.rolling(window=n).mean())
    deduction_price = per_stock_close.shift(n - 1)
    close = df["close"]

    # Comparisons against NaN are False, so incomplete windows match no rule
    # and fall through to the default category 0.
    rules = {
        1: (close > sma) & (deduction_price > sma),
        2: (close < sma) & (deduction_price < sma),
        3: (close > sma) & (deduction_price <= sma),
        4: (close <= sma) & (deduction_price > sma),
    }
    df[factor_col] = np.select(list(rules.values()), list(rules.keys()), default=0)

    print(f"因子 cat_price_vs_sma_vs_deduction_price_{n} 计算完成。")
    return df
|
||
|
||
|
||
def cat_is_on_top_list(df: pd.DataFrame, top_list: pd.DataFrame) -> pd.DataFrame:
    """
    Keep the 'cat_reason' code only on rows where the stock closed up.

    ``cat_is_on_top_list`` equals ``cat_reason`` when ``pct_chg > 0`` and 0
    otherwise.

    Args:
        df: Frame with 'cat_reason' and 'pct_chg' columns.
        top_list: Not used by this function; kept for interface parity.

    Returns:
        ``df`` with 'cat_is_on_top_list' added, or ``df`` unchanged (after
        printing a message) when 'cat_reason' is absent.
    """
    if "cat_reason" not in df.columns:
        print("计算因子cat_is_on_top_list失败,缺少cat_reason列")
        return df

    target = "cat_is_on_top_list"
    df[target] = df["cat_reason"]
    # 1 on up days, 0 otherwise -- zeroes the code on flat/down days.
    closed_up = (df["pct_chg"] > 0).astype(int)
    df[target] = df[target] * closed_up

    return df
|
||
|
||
|
||
def cat_reason(df: pd.DataFrame, top_list: pd.DataFrame) -> pd.DataFrame:
    """
    Attach a numeric code for the top-list ``reason`` to every row of ``df``.

    Each distinct ``reason`` in ``top_list`` gets a code starting at 1, in
    order of first appearance. Rows of ``df`` with no matching
    (ts_code, trade_date) entry get 0.

    When ``top_list`` holds several rows for the same (ts_code, trade_date),
    only the first one is used, so the result always has exactly one row per
    row of ``df`` (a plain left merge would duplicate such rows).

    Side effects: ``df['trade_date']`` is converted to datetime in place.
    ``top_list`` is not modified.

    Args:
        df: Stock data with 'ts_code' and 'trade_date' columns. An existing
            'cat_reason' column is replaced.
        top_list: Top-list data with 'ts_code', 'trade_date' and 'reason'.

    Returns:
        A new DataFrame: all columns of ``df`` plus an integer 'cat_reason'.
    """
    # Encode reasons by order of first appearance (codes start at 1).
    unique_reasons = top_list["reason"].unique()
    reason_mapping = {reason: i + 1 for i, reason in enumerate(unique_reasons)}

    # Work on a slim copy so the caller's top_list stays untouched.
    top_list_slim = top_list[["ts_code", "trade_date"]].copy()
    top_list_slim["trade_date"] = pd.to_datetime(top_list_slim["trade_date"])
    top_list_slim["cat_reason"] = (
        top_list["reason"].map(reason_mapping).astype("Int64")
    )
    # One row per key, otherwise the left merge below multiplies df rows.
    top_list_slim = top_list_slim.drop_duplicates(
        subset=["ts_code", "trade_date"], keep="first"
    )

    # Merge keys must share a dtype.
    df["trade_date"] = pd.to_datetime(df["trade_date"])

    # Drop a stale 'cat_reason' so the merge does not emit _x/_y suffixes.
    base = df.drop(columns=["cat_reason"], errors="ignore")
    merged_df = base.merge(top_list_slim, on=["ts_code", "trade_date"], how="left")

    merged_df["cat_reason"] = merged_df["cat_reason"].fillna(0).astype("int")
    return merged_df
|
||
|
||
|
||
def ts_volatility_slope_20_5(df: pd.DataFrame) -> pd.DataFrame:
    """
    Slope of a 5-row linear fit through the 20-row rolling std of 'pct_chg'.

    Per 'ts_code': the rolling standard deviation of 'pct_chg' (window 20,
    min_periods 1) is computed first, then the least-squares slope of that
    series against 0..k-1 over the last 5 rows (at least 2 non-NaN values
    required). A window that contains a NaN yields NaN.

    Side effects: 'trade_date' is converted to datetime and ``df`` is sorted
    by ['ts_code', 'trade_date'] in place.

    Args:
        df: Frame with 'ts_code', 'trade_date' and 'pct_chg' columns.

    Returns:
        ``df`` with the 'ts_volatility_slope_20_5' column added.
    """
    print("计算因子 ts_volatility_slope_20_5")
    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    def window_slope(values):
        # Closed-form OLS slope of `values` against 0..k-1. Same number a
        # linear regression returns, without needing scipy in scope.
        count = len(values)
        if count < 2:
            return 0
        x_centered = np.arange(count) - (count - 1) / 2.0
        y_centered = values - values.mean()
        return float(np.dot(x_centered, y_centered) / np.dot(x_centered, x_centered))

    codes = df["ts_code"]
    volatility_20 = df.groupby(codes)["pct_chg"].transform(
        lambda s: s.rolling(window=20, min_periods=1).std()
    )
    # raw=True hands plain ndarrays to the helper (faster than Series).
    df["ts_volatility_slope_20_5"] = volatility_20.groupby(codes).transform(
        lambda s: s.rolling(window=5, min_periods=2).apply(window_slope, raw=True)
    )
    return df
|
||
|
||
|
||
def ts_turnover_rate_acceleration_5_20(df: pd.DataFrame) -> pd.DataFrame:
    """
    Short-term minus long-term average turnover rate.

    Per 'ts_code', the 5-row rolling mean of 'turnover_rate' minus its 20-row
    rolling mean (both with min_periods=1).

    Side effects: 'trade_date' is converted to datetime and ``df`` is sorted
    by ['ts_code', 'trade_date'] in place.

    Args:
        df: Frame with 'ts_code', 'trade_date' and 'turnover_rate' columns.

    Returns:
        ``df`` with 'ts_turnover_rate_acceleration_5_20' added.
    """
    print("计算因子 ts_turnover_rate_acceleration_5_20")

    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    # Scratch column name -> rolling window length.
    scratch_windows = {"short_avg_turnover": 5, "long_avg_turnover": 20}
    for scratch_col, span in scratch_windows.items():
        per_stock_mean = (
            df.groupby("ts_code")["turnover_rate"]
            .rolling(window=span, min_periods=1)
            .mean()
        )
        # Drop the group level so the values align with df's own index.
        df[scratch_col] = per_stock_mean.reset_index(level=0, drop=True)

    df["ts_turnover_rate_acceleration_5_20"] = (
        df["short_avg_turnover"] - df["long_avg_turnover"]
    )
    df.drop(columns=list(scratch_windows), inplace=True)
    return df
|
||
|
||
def ts_vol_sustain_10_30(df: pd.DataFrame) -> pd.DataFrame:
    """
    Share of the last 10 rows whose volume exceeded the 30-row average.

    Per 'ts_code': a row counts as "above" when 'vol' is strictly greater than
    the 30-row rolling mean of 'vol' (min_periods=1, current row included);
    the factor is the 10-row rolling mean of that flag (min_periods=1).

    Implemented with grouped transforms instead of a whole-frame
    ``groupby.apply``, which newer pandas versions deprecate and which can
    drop the grouping column from the result.

    Side effects: 'trade_date' is converted to datetime and ``df`` is sorted
    by ['ts_code', 'trade_date'] in place.

    Args:
        df: Frame with 'ts_code', 'trade_date' and 'vol' columns.

    Returns:
        ``df`` with the 'ts_vol_sustain_10_30' column added.
    """
    print("计算因子 ts_vol_sustain_10_30")

    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    codes = df["ts_code"]
    long_avg_vol = df.groupby(codes)["vol"].transform(
        lambda s: s.rolling(window=30, min_periods=1).mean()
    )
    # 1.0 where today's volume beats its own long-run average, else 0.0.
    vol_above = (df["vol"] > long_avg_vol).astype(float)
    df["ts_vol_sustain_10_30"] = vol_above.groupby(codes).transform(
        lambda s: s.rolling(window=10, min_periods=1).mean()
    )
    return df
|
||
|
||
def cs_turnover_rate_relative_strength_20(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cross-sectional rank of (turnover rate / its own 20-row average).

    Per 'ts_code' the ratio ``turnover_rate / rolling_mean(turnover_rate, 20)``
    (min_periods=1) is computed; within each 'trade_date' the ratios are then
    ranked with ``method='dense', ascending=False`` so that rank 1 is the
    largest ratio. NaN ratios keep a NaN rank.

    Implemented with grouped transforms instead of whole-frame
    ``groupby.apply`` calls, which newer pandas versions deprecate and which
    can drop the grouping column from the result.

    Side effects: 'trade_date' is converted to datetime and ``df`` is sorted
    by ['ts_code', 'trade_date'] in place.

    Args:
        df: Frame with 'ts_code', 'trade_date' and 'turnover_rate' columns.

    Returns:
        ``df`` with 'cs_turnover_rate_relative_strength_20' added.
    """
    print("计算因子 cs_turnover_rate_relative_strength_20")

    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    # Time-series step: today's turnover relative to the stock's own average.
    avg_turnover_20 = df.groupby("ts_code")["turnover_rate"].transform(
        lambda s: s.rolling(window=20, min_periods=1).mean()
    )
    turnover_ratio = df["turnover_rate"] / avg_turnover_20

    # Cross-sectional step: rank the ratios among all stocks of the same day.
    df["cs_turnover_rate_relative_strength_20"] = turnover_ratio.groupby(
        df["trade_date"]
    ).rank(method="dense", ascending=False)
    return df
|
||
|
||
def cs_amount_outlier_10(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cross-sectional z-score of (amount - its own 10-row average).

    Per 'ts_code' the difference ``amount - rolling_mean(amount, 10)``
    (min_periods=1) is computed; within each 'trade_date' the differences are
    standardised with that day's mean and sample standard deviation. When a
    day's standard deviation is exactly 0 every stock of that day gets 0;
    when it is undefined (e.g. a single stock) the result is NaN.

    Implemented with grouped transforms instead of whole-frame
    ``groupby.apply`` calls, which newer pandas versions deprecate and which
    can drop the grouping column from the result.

    Side effects: 'trade_date' is converted to datetime and ``df`` is sorted
    by ['ts_code', 'trade_date'] in place.

    Args:
        df: Frame with 'ts_code', 'trade_date' and 'amount' columns.

    Returns:
        ``df`` with the 'cs_amount_outlier_10' column added.
    """
    print("计算因子 cs_amount_outlier_10")

    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    # Time-series step: today's amount relative to the stock's own average.
    avg_amount_10 = df.groupby("ts_code")["amount"].transform(
        lambda s: s.rolling(window=10, min_periods=1).mean()
    )
    amount_diff = df["amount"] - avg_amount_10

    # Cross-sectional step: standardise within each trading day.
    by_date = amount_diff.groupby(df["trade_date"])
    mean_diff = by_date.transform("mean")
    std_diff = by_date.transform("std")
    zscore = (amount_diff - mean_diff) / std_diff

    # Zero dispersion -> score 0 for the whole day (avoids inf / 0-by-0).
    df["cs_amount_outlier_10"] = zscore.where(std_diff != 0, 0.0)
    return df
|
||
|
||
def ts_ff_to_total_turnover_ratio(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ratio of free-float turnover rate to total turnover rate.

    Uses ``turnover_rate_f / (turnover_rate + 1e-8)`` when a 'turnover_rate_f'
    column is available. Dividing 'turnover_rate' by itself, as was done
    before, yields a constant of about 1 and carries no information.

    NOTE(review): 'turnover_rate_f' is assumed to be the free-float turnover
    column (the name used by the Tushare daily_basic dataset) -- confirm
    against the data loader. When the column is absent the function falls
    back to the previous behaviour and prints a warning.

    Args:
        df: Frame with 'turnover_rate' and, ideally, 'turnover_rate_f'.

    Returns:
        ``df`` with the 'ts_ff_to_total_turnover_ratio' column added.
    """
    print("计算因子 ts_ff_to_total_turnover_ratio")

    if "turnover_rate_f" in df.columns:
        free_float_turnover = df["turnover_rate_f"]
    else:
        print(
            "Warning: 'turnover_rate_f' column not found; "
            "ts_ff_to_total_turnover_ratio falls back to 'turnover_rate' "
            "and will be ~1 for every row."
        )
        free_float_turnover = df["turnover_rate"]

    # The small constant avoids division by zero on zero-turnover rows.
    df["ts_ff_to_total_turnover_ratio"] = free_float_turnover / (
        df["turnover_rate"] + 1e-8
    )
    return df
|
||
|
||
|
||
def ts_price_volume_trend_coherence_5_20(df: pd.DataFrame) -> pd.DataFrame:
    """
    Product of the 5-row "price up" share and the 5-row "high volume" share.

    Per 'ts_code':

    * price share  = rolling mean (5 rows, min_periods=1) of ``close.diff() > 0``;
    * volume share = rolling mean (5 rows, min_periods=1) of
      ``vol > rolling_mean(vol, 20)`` (min_periods=1).

    Every rolling window is evaluated inside its own 'ts_code', so the first
    rows of a stock never see the last rows of the stock sorted before it.

    Side effects: 'trade_date' is converted to datetime and ``df`` is sorted
    by ['ts_code', 'trade_date'] in place.

    Args:
        df: Frame with 'ts_code', 'trade_date', 'close' and 'vol' columns.

    Returns:
        ``df`` with 'ts_price_volume_trend_coherence_5_20' added.
    """
    print("计算因子 ts_price_volume_trend_coherence_5_20")

    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    codes = df["ts_code"]

    def share_of_last_5(flags):
        # Per-stock 5-row rolling mean of a 0/1 flag series.
        return flags.groupby(codes).transform(
            lambda s: s.rolling(window=5, min_periods=1).mean()
        )

    # First row of each stock has no previous close -> counted as "not up".
    price_up = (df.groupby(codes)["close"].diff() > 0).astype(float)
    price_up_ratio = share_of_last_5(price_up)

    vol_avg_20 = df.groupby(codes)["vol"].transform(
        lambda s: s.rolling(window=20, min_periods=1).mean()
    )
    vol_above_avg = share_of_last_5((df["vol"] > vol_avg_20).astype(float))

    df["ts_price_volume_trend_coherence_5_20"] = price_up_ratio * vol_above_avg
    return df
|
||
|
||
def ts_turnover_rate_trend_strength_5(df: pd.DataFrame) -> pd.DataFrame:
    """
    Least-squares slope of 'turnover_rate' over the last 5 rows per stock.

    Per 'ts_code', the slope of 'turnover_rate' against 0..k-1 over a rolling
    window of 5 rows (at least 2 non-NaN values required). A window that
    contains a NaN yields NaN.

    Side effects: 'trade_date' is converted to datetime and ``df`` is sorted
    by ['ts_code', 'trade_date'] in place.

    Args:
        df: Frame with 'ts_code', 'trade_date' and 'turnover_rate' columns.

    Returns:
        ``df`` with the 'ts_turnover_rate_trend_strength_5' column added.
    """
    print("计算因子 ts_turnover_rate_trend_strength_5")

    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    def window_slope(values):
        # Closed-form OLS slope of `values` against 0..k-1. Same number a
        # linear regression returns, without needing scipy in scope.
        count = len(values)
        if count < 2:
            return 0
        x_centered = np.arange(count) - (count - 1) / 2.0
        y_centered = values - values.mean()
        return float(np.dot(x_centered, y_centered) / np.dot(x_centered, x_centered))

    # raw=True hands plain ndarrays to the helper (faster than Series).
    df["ts_turnover_rate_trend_strength_5"] = df.groupby("ts_code")[
        "turnover_rate"
    ].transform(
        lambda s: s.rolling(window=5, min_periods=2).apply(window_slope, raw=True)
    )
    return df
|
||
|
||
|
||
def ts_ff_turnover_rate_surge_10(df: pd.DataFrame) -> pd.DataFrame:
    """
    Today's turnover rate relative to its 10-row rolling average.

    Per 'ts_code': ``turnover_rate / (rolling_mean(turnover_rate, 10) + 1e-8)``
    with min_periods=1 (the current row is part of the average).

    NOTE(review): despite the "ff" (free-float) in the name, the code reads
    the 'turnover_rate' column -- confirm which column is intended.

    Side effects: 'trade_date' is converted to datetime and ``df`` is sorted
    by ['ts_code', 'trade_date'] in place.

    Args:
        df: Frame with 'ts_code', 'trade_date' and 'turnover_rate' columns.

    Returns:
        ``df`` with the 'ts_ff_turnover_rate_surge_10' column added.
    """
    print("计算因子 ts_ff_turnover_rate_surge_10")

    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.sort_values(["ts_code", "trade_date"], inplace=True)

    baseline_col = "avg_ff_turnover_10"
    per_stock_mean = (
        df.groupby("ts_code")["turnover_rate"]
        .rolling(window=10, min_periods=1)
        .mean()
    )
    # Drop the group level so the values align with df's own index.
    df[baseline_col] = per_stock_mean.reset_index(level=0, drop=True)

    # The small constant avoids division by zero.
    denominator = df[baseline_col] + 1e-8
    df["ts_ff_turnover_rate_surge_10"] = df["turnover_rate"] / denominator

    df.drop(columns=[baseline_col], inplace=True)
    return df
|
||
|
||
|
||
# --- Factor 1: 近期积极动量与成交量激增 (简化版催化剂代理) ---
|
||
def cat_senti_mom_vol_spike(df_input: pd.DataFrame,
                            return_period: int = 3,
                            return_threshold: float = 0.05,
                            volume_ratio_threshold: float = 1.5,
                            current_pct_chg_min: float = -0.01,
                            current_pct_chg_max: float = 0.03,
                            factor_name: str = 'cat_senti_mom_vol_spike') -> pd.DataFrame:
    """
    Categorical "recent momentum + volume spike" factor.

    Three boolean conditions are evaluated per row and concatenated into a
    3-character string of '0'/'1' flags, in this order:

    1. momentum: the ``return_period``-row return of 'close' (per 'ts_code')
       is greater than ``return_threshold``;
    2. volume:   'volume_ratio' (or, when that column is missing, the proxy
       ``vol / 5-row average vol`` per 'ts_code') is greater than
       ``volume_ratio_threshold``;
    3. price:    ``current_pct_chg_min < pct_chg < current_pct_chg_max``.

    NOTE(review): the return in (1) is a fraction (0.05 = 5 %); the bounds in
    (3) are compared with 'pct_chg' as-is -- confirm that column uses the same
    unit as the default bounds.

    Modifies and returns the passed frame; no helper columns are left behind.

    Args:
        df_input: Frame with 'ts_code', 'close', 'pct_chg' and either
            'volume_ratio' or 'vol'. A pre-existing ``_return_{N}d`` column is
            reused instead of being recomputed.
        return_period: Look-back (rows) of the momentum return.
        return_threshold: Minimum return for the momentum flag.
        volume_ratio_threshold: Minimum volume ratio for the volume flag.
        current_pct_chg_min: Exclusive lower bound for today's change.
        current_pct_chg_max: Exclusive upper bound for today's change.
        factor_name: Name of the output column.

    Returns:
        The same frame with ``factor_name`` added. On any error the column is
        set to the integer 0 and a message is printed.
    """
    df = df_input
    print(f"Calculating {factor_name}...")
    # Guards the proxy division; defined locally so the function does not
    # rely on a module-level name.
    epsilon = 1e-8

    try:
        # 1. N-row return, reused from the frame when already present.
        return_col = f'_return_{return_period}d'
        if return_col in df.columns:
            period_return = df[return_col]
        else:
            period_return = df.groupby('ts_code')['close'].pct_change(periods=return_period)

        # 2. Volume metric: the provided ratio, or a simple 5-row proxy.
        if 'volume_ratio' in df.columns:
            volume_metric = df['volume_ratio']
        else:
            print(f"Warning: 'volume_ratio' column not found. Calculating a proxy for {factor_name}.")
            avg_vol_5d = df.groupby('ts_code')['vol'].transform(
                lambda s: s.rolling(window=5, min_periods=1).mean()
            )
            volume_metric = df['vol'] / (avg_vol_5d + epsilon)

        cond_momentum = period_return > return_threshold
        cond_volume = volume_metric > volume_ratio_threshold
        cond_current_price = (df['pct_chg'] > current_pct_chg_min) & (df['pct_chg'] < current_pct_chg_max)

        # Encode the three flags as a string such as '101'.
        df[factor_name] = (
            cond_momentum.astype(int).astype(str)
            + cond_volume.astype(int).astype(str)
            + cond_current_price.astype(int).astype(str)
        )

    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    except Exception as e:
        print(f"An unexpected error occurred in {factor_name}: {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0

    print(f"Finished {factor_name}.")
    return df
|
||
|
||
# --- Factor 2: 强主力资金流入信号(未实现) ---
|
||
def calculate_strong_inflow_signal(df_input: pd.DataFrame,
                                   intensity_avg_N: int = 3,
                                   intensity_threshold: float = 0.01,
                                   consecutive_buy_N: int = 2,
                                   accel_positive_M: int = 1,
                                   factor_name: str = 'senti_strong_inflow') -> pd.DataFrame:
    """
    Binary "strong large-order inflow" signal.

    The factor is 1 when all three conditions hold for a row (evaluated per
    'ts_code'), otherwise 0:

    1. the ``intensity_avg_N``-row mean of 'flow_lg_elg_intensity'
       (min_periods=1) is greater than ``intensity_threshold``;
    2. 'flow_lg_elg_intensity' was > 0 on each of the last
       ``consecutive_buy_N`` rows (a full window is required);
    3. 'flow_lg_elg_accel' was > 0 on each of the last ``accel_positive_M``
       rows (a full window is required).

    Modifies and returns the passed frame; no helper columns are left behind.

    Args:
        df_input: Frame with 'ts_code', 'flow_lg_elg_intensity' and
            'flow_lg_elg_accel' columns.
        intensity_avg_N: Window of the average-intensity condition.
        intensity_threshold: Minimum average intensity.
        consecutive_buy_N: Number of consecutive net-buy rows required.
        accel_positive_M: Number of consecutive positive-acceleration rows
            required.
        factor_name: Name of the output column.

    Returns:
        The same frame with ``factor_name`` added. When a required flow column
        is missing, or on any error, the column is set to 0 and a message is
        printed.
    """
    df = df_input
    print(f"Calculating {factor_name}...")

    required_flow_cols = ['flow_lg_elg_intensity', 'flow_lg_elg_accel']
    missing = [col for col in required_flow_cols if col not in df.columns]
    if missing:
        print(f"Error: DataFrame 缺少必需的资金流因子列: {missing} for {factor_name}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
        return df

    try:
        codes = df['ts_code']

        def all_set_in_window(flags, window):
            # True where the last `window` rows of the stock are all 1. The
            # rolling min of a 0/1 series is 1 only if every element is 1;
            # incomplete windows give NaN and therefore False.
            rolling_min = flags.groupby(codes).transform(
                lambda s: s.rolling(window=window, min_periods=window).min()
            )
            return rolling_min == 1

        # 1. Average large-order intensity over the last N rows.
        avg_intensity = df.groupby(codes)['flow_lg_elg_intensity'].transform(
            lambda s: s.rolling(window=intensity_avg_N, min_periods=1).mean()
        )
        cond_avg_intensity = avg_intensity > intensity_threshold

        # 2. Net buying (intensity > 0) on each of the last N rows.
        is_net_buy = (df['flow_lg_elg_intensity'] > 0).astype(int)
        cond_consecutive_buy = all_set_in_window(is_net_buy, consecutive_buy_N)

        # 3. Positive flow acceleration on each of the last M rows.
        accel_is_positive = (df['flow_lg_elg_accel'] > 0).astype(int)
        cond_accel_positive = all_set_in_window(accel_is_positive, accel_positive_M)

        df[factor_name] = (cond_avg_intensity & cond_consecutive_buy & cond_accel_positive).astype(int)

    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    except Exception as e:
        print(f"An unexpected error occurred in {factor_name}: {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0

    print(f"Finished {factor_name}.")
    return df
|
||
|
||
# --- Factor 3: 突破前盘整模式 ---
|
||
def cat_senti_pre_breakout(df_input: pd.DataFrame,
                           atr_short_N: int = 10,
                           atr_long_M: int = 40,
                           vol_atrophy_N: int = 10,
                           vol_atrophy_M: int = 40,
                           price_stab_N: int = 5,
                           price_stab_threshold: float = 0.05,
                           current_pct_chg_min_signal: float = 0.005,
                           current_pct_chg_max_signal: float = 0.07,
                           volume_ratio_signal_threshold: float = 1.2,
                           factor_name: str = 'cat_senti_pre_breakout') -> pd.DataFrame:
    """
    Categorical "consolidation before breakout" factor.

    Four boolean conditions are evaluated per row (all rolling statistics per
    'ts_code') and concatenated into a 4-character string of '0'/'1' flags,
    in this order:

    1. volatility contraction: ``atr_{atr_short_N} < 0.7 * atr_{atr_long_M}``;
    2. volume atrophy: mean 'vol' over ``vol_atrophy_N`` rows is below 0.7x
       the mean over ``vol_atrophy_M`` rows (min_periods=1);
    3. price stability: (rolling max 'high' - rolling min 'low') over
       ``price_stab_N`` rows, divided by 'close', is below
       ``price_stab_threshold``;
    4. today's signal: ``current_pct_chg_min_signal < pct_chg <
       current_pct_chg_max_signal`` and a volume pick-up. The pick-up is
       ``volume_ratio > volume_ratio_signal_threshold`` or, when
       'volume_ratio' is missing, ``vol > volume_ratio_signal_threshold *
       short-term average vol`` (the threshold default of 1.2 matches the
       multiplier that used to be hard-coded in the proxy).

    ATR columns named ``atr_{N}`` are reused when present; otherwise ATR is
    computed with TA-Lib separately for every 'ts_code'. The per-stock loop
    also works when the frame holds a single stock.

    NOTE(review): the 'pct_chg' bounds are compared with the column as-is --
    confirm that column uses the same unit as the default bounds.

    Modifies and returns the passed frame; no helper columns are left behind.

    Args:
        df_input: Frame with 'ts_code', 'high', 'low', 'close', 'vol',
            'pct_chg' and optionally 'volume_ratio' / ``atr_{N}`` columns.
        atr_short_N: Period of the short ATR.
        atr_long_M: Period of the long ATR.
        vol_atrophy_N: Window of the short-term average volume.
        vol_atrophy_M: Window of the long-term average volume.
        price_stab_N: Window of the high/low range check.
        price_stab_threshold: Maximum relative range for "stable".
        current_pct_chg_min_signal: Exclusive lower bound for today's change.
        current_pct_chg_max_signal: Exclusive upper bound for today's change.
        volume_ratio_signal_threshold: Minimum volume ratio for the signal.
        factor_name: Name of the output column.

    Returns:
        The same frame with ``factor_name`` added. On any error the column is
        set to the integer 0 and a message is printed.
    """
    df = df_input
    print(f"Calculating {factor_name}...")
    # Guards the division by 'close'; defined locally so the function does
    # not rely on a module-level name.
    epsilon = 1e-8

    try:
        def per_stock_rolling(column, window, how):
            # Rolling statistic of one column, restarted for every stock.
            return df.groupby('ts_code')[column].transform(
                lambda s: getattr(s.rolling(window=window, min_periods=1), how)()
            )

        def atr_for(period, col_name):
            # Reuse a pre-computed ATR column, else compute it stock by stock.
            if col_name in df.columns:
                return df[col_name]
            print(f"Calculating {col_name} as it's missing...")
            high = df['high'].to_numpy(dtype=float)
            low = df['low'].to_numpy(dtype=float)
            close = df['close'].to_numpy(dtype=float)
            values = np.full(len(df), np.nan)
            # `.indices` maps each stock to the row positions it occupies.
            for positions in df.groupby('ts_code', sort=False).indices.values():
                values[positions] = talib.ATR(
                    high[positions], low[positions], close[positions], timeperiod=period
                )
            return pd.Series(values, index=df.index)

        # 1. Volatility contraction: short ATR clearly below long ATR.
        atr_short = atr_for(atr_short_N, f'atr_{atr_short_N}')
        atr_long = atr_for(atr_long_M, f'atr_{atr_long_M}')
        cond_vol_contraction = atr_short < (0.7 * atr_long)

        # 2. Volume atrophy: short average volume clearly below long average.
        avg_vol_short = per_stock_rolling('vol', vol_atrophy_N, 'mean')
        avg_vol_long = per_stock_rolling('vol', vol_atrophy_M, 'mean')
        cond_vol_atrophy = avg_vol_short < (0.7 * avg_vol_long)

        # 3. Price stability: narrow recent high/low range relative to close.
        rolling_max_h = per_stock_rolling('high', price_stab_N, 'max')
        rolling_min_l = per_stock_rolling('low', price_stab_N, 'min')
        cond_price_stability = (
            (rolling_max_h - rolling_min_l) / (df['close'] + epsilon)
        ) < price_stab_threshold

        # 4. Today's moderate rise on increased volume.
        if 'volume_ratio' in df.columns:
            cond_vol_signal = df['volume_ratio'] > volume_ratio_signal_threshold
        else:
            print(f"Warning: 'volume_ratio' column not found for {factor_name}. Using a proxy.")
            cond_vol_signal = df['vol'] > (volume_ratio_signal_threshold * avg_vol_short)

        cond_price_signal = (
            (df['pct_chg'] > current_pct_chg_min_signal)
            & (df['pct_chg'] < current_pct_chg_max_signal)
        )
        cond_current_day_signal = cond_price_signal & cond_vol_signal

        # Encode the four flags as a string such as '1011'.
        df[factor_name] = (
            cond_vol_contraction.astype(int).astype(str)
            + cond_vol_atrophy.astype(int).astype(str)
            + cond_price_stability.astype(int).astype(str)
            + cond_current_day_signal.astype(int).astype(str)
        )

    except KeyError as e:
        print(f"Error calculating {factor_name}: Missing column {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0
    except Exception as e:
        print(f"An unexpected error occurred in {factor_name}: {e}. Factor column will be all zeros or NaN.")
        df[factor_name] = 0

    print(f"Finished {factor_name}.")
    return df
|
||
|