Files
NewStock/main/utils/factor_processor.py
2025-06-01 15:59:29 +08:00

233 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.preprocessing import StandardScaler
def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05, wasserstein_threshold=0.1, size=0.8,
log=True, val_data=None):
dropped_features = []
if val_data is None:
all_dates = sorted(train_data['trade_date'].unique().tolist()) # 获取所有唯一的 trade_date
split_date = all_dates[int(len(all_dates) * size)] # 划分点为倒数第 validation_days 天
train_data_split = train_data[train_data['trade_date'] < split_date] # 训练集
val_data_split = train_data[train_data['trade_date'] >= split_date] # 验证集
else:
train_data_split = train_data
val_data_split = val_data
# **统计数据漂移**
numeric_columns = train_data_split.select_dtypes(include=['float64', 'int64']).columns
numeric_columns = [col for col in numeric_columns if col in feature_columns]
for feature in numeric_columns:
ks_stat, p_value = ks_2samp(train_data_split[feature], val_data_split[feature])
# wasserstein_dist = wasserstein_distance(train_data_split[feature], val_data_split[feature])
# if p_value < ks_threshold or wasserstein_dist > wasserstein_threshold:
if p_value < ks_threshold:
dropped_features.append(feature)
if log:
print(f"检测到 {len(dropped_features)} 个可能漂移的特征: {dropped_features}")
# **应用阈值进行最终筛选**
filtered_features = [f for f in feature_columns if f not in dropped_features]
return filtered_features, dropped_features
def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01, upper_percentile: float = 0.99,
log=True):
if not (0 <= lower_percentile < upper_percentile <= 1):
raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")
# Calculate lower and upper bounds based on percentiles
lower_bound = label.quantile(lower_percentile)
upper_bound = label.quantile(upper_percentile)
# Filter out values outside the bounds
filtered_label = label[(label >= lower_bound) & (label <= upper_bound)]
# Print the number of removed outliers
if log:
print(f"Removed {len(label) - len(filtered_label)} outliers.")
return filtered_label
def calculate_risk_adjusted_target(df, days=5):
df = df.sort_values(by=['ts_code', 'trade_date'])
df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
df['future_open'] = df.groupby('ts_code')['open'].shift(-1)
df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']
df['future_volatility'] = df.groupby('ts_code')['future_return'].rolling(days, min_periods=1).std().reset_index(
level=0, drop=True)
sharpe_ratio = df['future_return'] * df['future_volatility']
sharpe_ratio.replace([np.inf, -np.inf], np.nan, inplace=True)
return sharpe_ratio
def calculate_score(df, days=5, lambda_param=1.0):
def calculate_max_drawdown(prices):
peak = prices.iloc[0] # 初始化峰值
max_drawdown = 0 # 初始化最大回撤
for price in prices:
if price > peak:
peak = price # 更新峰值
else:
drawdown = (peak - price) / peak # 计算当前回撤
max_drawdown = max(max_drawdown, drawdown) # 更新最大回撤
return max_drawdown
def compute_stock_score(stock_df):
stock_df = stock_df.sort_values(by=['trade_date'])
future_return = stock_df['future_return']
# 使用已有的 pct_chg 字段计算波动率
volatility = stock_df['pct_chg'].rolling(days).std().shift(-days)
max_drawdown = stock_df['close'].rolling(days).apply(calculate_max_drawdown, raw=False).shift(-days)
score = future_return - lambda_param * max_drawdown
return score
# # 确保 DataFrame 按照股票代码和交易日期排序
# df = df.sort_values(by=['ts_code', 'trade_date'])
# 对每个股票分别计算 score
df['score'] = df.groupby('ts_code').apply(compute_stock_score).reset_index(level=0, drop=True)
return df['score']
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
if not numeric_features:
raise ValueError("No numeric features found in the provided data.")
corr_matrix = df[numeric_features].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
remaining_features = [col for col in feature_columns if col not in to_drop
or 'act' in col or 'af' in col]
return remaining_features
def cross_sectional_standardization(df, features):
df_sorted = df.sort_values(by='trade_date') # 按时间排序
df_standardized = df_sorted.copy()
for date in df_sorted['trade_date'].unique():
# 获取当前时间点的数据
current_data = df_standardized[df_standardized['trade_date'] == date]
# 只对指定特征进行标准化
scaler = StandardScaler()
standardized_values = scaler.fit_transform(current_data[features])
# 将标准化结果重新赋值回去
df_standardized.loc[df_standardized['trade_date'] == date, features] = standardized_values
return df_standardized
def neutralize_manual(df, features, industry_col, mkt_cap_col):
""" 手动实现简单回归以提升速度 """
for col in features:
residuals = []
for _, group in df.groupby(industry_col):
if len(group) > 1:
x = np.log(group[mkt_cap_col]) # 市值对数
y = group[col] # 因子值
beta = np.cov(y, x)[0, 1] / np.var(x) # 计算斜率
alpha = np.mean(y) - beta * np.mean(x) # 计算截距
resid = y - (alpha + beta * x) # 计算残差
residuals.extend(resid)
else:
residuals.extend(group[col]) # 样本不足时保留原值
df[col] = residuals
return df
def mad_filter(df, features, n=3):
for col in features:
median = df[col].median()
mad = np.median(np.abs(df[col] - median))
upper = median + n * mad
lower = median - n * mad
df[col] = np.clip(df[col], lower, upper) # 截断极值
return df
def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
for col in features:
# 按日期分组计算上下百分位数
lower_bound = df.groupby('trade_date')[col].transform(
lambda x: x.quantile(lower_percentile)
)
upper_bound = df.groupby('trade_date')[col].transform(
lambda x: x.quantile(upper_percentile)
)
# 截断超出范围的值
df[col] = np.clip(df[col], lower_bound, upper_bound)
return df
from scipy.stats import iqr
def iqr_filter(df, features):
for col in features:
df[col] = df.groupby('trade_date')[col].transform(
lambda x: (x - x.median()) / iqr(x) if iqr(x) != 0 else x
)
return df
def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
df = df.copy()
for col in features:
# 计算 rolling 统计量,需要按日期进行 groupby
rolling_lower = df.groupby('trade_date')[col].transform(
lambda x: x.rolling(window=min(len(x), window)).quantile(lower_quantile))
rolling_upper = df.groupby('trade_date')[col].transform(
lambda x: x.rolling(window=min(len(x), window)).quantile(upper_quantile))
# 对数据进行裁剪
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
return df
def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
df = df.copy()
# 确保按股票和时间排序
df = df.sort_values(['ts_code', 'trade_date'])
grouped = df.groupby('ts_code')
for col in features:
# 对每个股票的时间序列计算滚动分位数
rolling_lower = grouped[col].rolling(window=window, min_periods=window // 2).quantile(lower_quantile)
rolling_upper = grouped[col].rolling(window=window, min_periods=window // 2).quantile(upper_quantile)
# rolling结果带有多重索引需要对齐
rolling_lower = rolling_lower.reset_index(level=0, drop=True)
rolling_upper = rolling_upper.reset_index(level=0, drop=True)
# 应用 clip
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
return df
def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
df = df.copy()
grouped = df.groupby('trade_date')
for col in features:
# 计算每日截面的分位数边界
lower_bound = grouped[col].transform(lambda x: x.quantile(lower_quantile))
upper_bound = grouped[col].transform(lambda x: x.quantile(upper_quantile))
# 应用 clip
df[col] = np.clip(df[col], lower_bound, upper_bound)
return df