import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, iqr
from sklearn.preprocessing import StandardScaler


def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05,
                            wasserstein_threshold=0.1, size=0.8, log=True, val_data=None):
    """Drop features whose distribution drifts between the training and validation periods."""
    dropped_features = []
    if val_data is None:
        # Collect all unique trade_date values and split at the `size` fraction of the date range.
        all_dates = sorted(train_data['trade_date'].unique().tolist())
        split_date = all_dates[int(len(all_dates) * size)]
        train_data_split = train_data[train_data['trade_date'] < split_date]   # training split
        val_data_split = train_data[train_data['trade_date'] >= split_date]    # validation split
    else:
        train_data_split = train_data
        val_data_split = val_data

    # Measure distribution drift for each numeric feature.
    numeric_columns = train_data_split.select_dtypes(include=['float64', 'int64']).columns
    numeric_columns = [col for col in numeric_columns if col in feature_columns]
    for feature in numeric_columns:
        _, p_value = ks_2samp(train_data_split[feature], val_data_split[feature])
        # `wasserstein_threshold` is kept in the signature for the disabled check below
        # (requires `from scipy.stats import wasserstein_distance`):
        # wasserstein_dist = wasserstein_distance(train_data_split[feature], val_data_split[feature])
        # if p_value < ks_threshold or wasserstein_dist > wasserstein_threshold:
        if p_value < ks_threshold:
            dropped_features.append(feature)
    if log:
        print(f"Detected {len(dropped_features)} potentially drifted features: {dropped_features}")

    # Apply the threshold result for the final selection.
    filtered_features = [f for f in feature_columns if f not in dropped_features]
    return filtered_features, dropped_features


def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01,
                                     upper_percentile: float = 0.99, log=True):
    """Drop label values outside the given percentile range."""
    if not (0 <= lower_percentile < upper_percentile <= 1):
        raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")
    # Calculate lower and upper bounds based on percentiles.
    lower_bound = label.quantile(lower_percentile)
    upper_bound = label.quantile(upper_percentile)
    # Filter out values outside the bounds.
    filtered_label = label[(label >= lower_bound) & (label <= upper_bound)]
    # Report the number of removed outliers.
    if log:
        print(f"Removed {len(label) - len(filtered_label)} outliers.")
    return filtered_label


def calculate_risk_adjusted_target(df, days=5):
    """Build a Sharpe-like target: forward return per unit of forward volatility."""
    df = df.sort_values(by=['ts_code', 'trade_date'])
    df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
    df['future_open'] = df.groupby('ts_code')['open'].shift(-1)   # entry at the next day's open
    df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']
    df['future_volatility'] = (df.groupby('ts_code')['future_return']
                                 .rolling(days, min_periods=1).std()
                                 .reset_index(level=0, drop=True))
    # Forward return divided by forward volatility; a zero std yields ±inf, cleaned below.
    sharpe_ratio = df['future_return'] / df['future_volatility']
    sharpe_ratio = sharpe_ratio.replace([np.inf, -np.inf], np.nan)
    return sharpe_ratio


def calculate_score(df, days=5, lambda_param=1.0):
    """Score each stock as forward return penalized by forward maximum drawdown."""

    def calculate_max_drawdown(prices):
        peak = prices.iloc[0]   # initialize the peak
        max_drawdown = 0        # initialize the maximum drawdown
        for price in prices:
            if price > peak:
                peak = price    # update the peak
            else:
                drawdown = (peak - price) / peak               # current drawdown
                max_drawdown = max(max_drawdown, drawdown)     # update the maximum drawdown
        return max_drawdown

    def compute_stock_score(stock_df):
        # Assumes a precomputed `future_return` column (e.g. from calculate_risk_adjusted_target).
        stock_df = stock_df.sort_values(by=['trade_date'])
        future_return = stock_df['future_return']
        # Volatility from the existing pct_chg field (computed but not used in the final score).
        volatility = stock_df['pct_chg'].rolling(days).std().shift(-days)
        max_drawdown = (stock_df['close']
                        .rolling(days)
                        .apply(calculate_max_drawdown, raw=False)
                        .shift(-days))
        score = future_return - lambda_param * max_drawdown
        return score

    # # Ensure the DataFrame is sorted by stock code and trade date
    # df = df.sort_values(by=['ts_code', 'trade_date'])
    # Compute the score separately for each stock; dropping the group level
    # realigns the result with the original row index.
    df['score'] = df.groupby('ts_code').apply(compute_stock_score).reset_index(level=0, drop=True)
    return df['score']
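
# --- Usage sketch (illustrative, not part of the library) --------------------
# A minimal example of chaining the label helpers above on a synthetic panel.
# The toy frame, its values, and the helper name `_demo_label_pipeline` are
# assumptions for demonstration; the column names (ts_code, trade_date, open,
# close) match what the functions above expect.
def _demo_label_pipeline():
    rng = np.random.default_rng(0)
    dates = pd.date_range('2024-01-01', periods=30, freq='D')
    demo = pd.DataFrame({
        'ts_code': np.repeat(['000001.SZ', '600000.SH'], len(dates)),
        'trade_date': np.tile(dates, 2),
        'open': rng.uniform(9, 11, 2 * len(dates)),
        'close': rng.uniform(9, 11, 2 * len(dates)),
    })
    target = calculate_risk_adjusted_target(demo, days=5)        # Sharpe-like label
    target = remove_outliers_label_percentile(target.dropna())   # trim label tails
    return target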
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
    """Drop one feature from every pair whose absolute correlation exceeds `threshold`."""
    numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_features:
        raise ValueError("No numeric features found in the provided data.")
    corr_matrix = df[numeric_features].corr().abs()
    # Keep only the upper triangle so each pair is inspected once.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    # Columns whose names contain 'act' or 'af' are whitelisted and always kept.
    remaining_features = [col for col in feature_columns
                          if col not in to_drop or 'act' in col or 'af' in col]
    return remaining_features


def cross_sectional_standardization(df, features):
    """Z-score the given features within each trading day's cross-section."""
    df_sorted = df.sort_values(by='trade_date')   # sort by time
    df_standardized = df_sorted.copy()
    for date in df_sorted['trade_date'].unique():
        # Take the cross-section for the current date.
        current_data = df_standardized[df_standardized['trade_date'] == date]
        # Standardize only the requested features.
        scaler = StandardScaler()
        standardized_values = scaler.fit_transform(current_data[features])
        # Write the standardized values back.
        df_standardized.loc[df_standardized['trade_date'] == date, features] = standardized_values
    return df_standardized
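
# --- Design note --------------------------------------------------------------
# The per-date loop above refits a StandardScaler for every trading day, which
# costs one pass over the frame per date. A vectorized sketch of the same
# per-date z-score via groupby().transform is shown below (ddof=0 matches
# StandardScaler's population std). The name `cross_sectional_standardization_fast`
# is hypothetical, and unlike StandardScaler a zero-variance day produces
# inf/NaN here rather than zeros, so treat this as an alternative sketch, not a
# drop-in replacement.
def cross_sectional_standardization_fast(df, features):
    df = df.copy()
    grouped = df.groupby('trade_date')[features]
    # (x - daily mean) / daily std, computed column-wise within each date.
    df[features] = (df[features] - grouped.transform('mean')) / grouped.transform(lambda s: s.std(ddof=0))
    return df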
def neutralize_manual(df, features, industry_col, mkt_cap_col):
    """Manually implemented simple regression for speed: residualize each
    feature against log market cap within each industry."""
    for col in features:
        residual_parts = []
        for _, group in df.groupby(industry_col):
            if len(group) > 1:
                x = np.log(group[mkt_cap_col])   # log market cap
                y = group[col]                   # factor values
                # ddof=1 matches np.cov's default so the slope estimate is consistent.
                beta = np.cov(y, x)[0, 1] / np.var(x, ddof=1)   # slope
                alpha = np.mean(y) - beta * np.mean(x)          # intercept
                residual_parts.append(y - (alpha + beta * x))   # residuals
            else:
                residual_parts.append(group[col])   # too few samples: keep original values
        # Concatenate the per-group residuals and realign to the original row
        # order; assigning a plain list would silently scramble rows whenever
        # the frame is not already sorted by industry.
        df[col] = pd.concat(residual_parts).reindex(df.index)
    return df


def mad_filter(df, features, n=3):
    """Clip each feature to median ± n * MAD, computed over the whole column."""
    for col in features:
        median = df[col].median()
        mad = np.median(np.abs(df[col] - median))
        upper = median + n * mad
        lower = median - n * mad
        df[col] = np.clip(df[col], lower, upper)   # clip extreme values
    return df


def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
    """Winsorize each feature at per-date percentile bounds."""
    for col in features:
        # Per-date lower/upper percentile bounds.
        lower_bound = df.groupby('trade_date')[col].transform(
            lambda x: x.quantile(lower_percentile))
        upper_bound = df.groupby('trade_date')[col].transform(
            lambda x: x.quantile(upper_percentile))
        # Clip values outside the bounds.
        df[col] = np.clip(df[col], lower_bound, upper_bound)
    return df


def iqr_filter(df, features):
    """Robust-scale each feature within each trading day: (x - median) / IQR."""
    for col in features:
        df[col] = df.groupby('trade_date')[col].transform(
            lambda x: (x - x.median()) / iqr(x) if iqr(x) != 0 else x
        )
    return df


def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
    """Clip each feature to rolling quantile bounds computed within each trading day."""
    df = df.copy()
    for col in features:
        # Rolling quantiles are computed per date group; min_periods=1 avoids
        # NaN bounds at the start of each group, which np.clip would propagate.
        rolling_lower = df.groupby('trade_date')[col].transform(
            lambda x: x.rolling(window=min(len(x), window), min_periods=1).quantile(lower_quantile))
        rolling_upper = df.groupby('trade_date')[col].transform(
            lambda x: x.rolling(window=min(len(x), window), min_periods=1).quantile(upper_quantile))
        # Clip the data to the rolling bounds.
        df[col] = np.clip(df[col], rolling_lower, rolling_upper)
    return df


def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
    """Clip each feature to rolling quantile bounds along each stock's own time series."""
    df = df.copy()
    # Ensure ordering by stock and time so the rolling windows are well defined.
    df = df.sort_values(['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code')
    for col in features:
        # Rolling quantiles over each stock's own history.
        rolling_lower = grouped[col].rolling(window=window, min_periods=window // 2).quantile(lower_quantile)
        rolling_upper = grouped[col].rolling(window=window, min_periods=window // 2).quantile(upper_quantile)
        # The rolling result carries a (ts_code, index) MultiIndex; drop the group level to realign.
        rolling_lower = rolling_lower.reset_index(level=0, drop=True)
        rolling_upper = rolling_upper.reset_index(level=0, drop=True)
        # Use Series.clip rather than np.clip: pandas leaves values untouched
        # where a bound is NaN (the warm-up rows) instead of turning them into NaN.
        df[col] = df[col].clip(lower=rolling_lower, upper=rolling_upper)
    return df


def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
    """Winsorize each feature at daily cross-sectional quantile bounds."""
    df = df.copy()
    grouped = df.groupby('trade_date')
    for col in features:
        # Daily cross-sectional quantile bounds.
        lower_bound = grouped[col].transform(lambda x: x.quantile(lower_quantile))
        upper_bound = grouped[col].transform(lambda x: x.quantile(upper_quantile))
        # Clip to the bounds.
        df[col] = np.clip(df[col], lower_bound, upper_bound)
    return df
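
# --- Usage sketch (illustrative, not part of the library) ---------------------
# A minimal end-to-end run of the cleaning helpers on a synthetic panel:
# winsorize per day, neutralize against industry and size, then z-score per
# day. The toy panel and all of its values are assumptions for demonstration;
# the pipeline order shown is one common convention, not mandated by the code.
if __name__ == '__main__':
    rng = np.random.default_rng(42)
    n_dates, n_stocks = 10, 50
    dates = pd.date_range('2024-01-01', periods=n_dates, freq='D')
    panel = pd.DataFrame({
        'ts_code': np.tile([f'{i:06d}.SZ' for i in range(n_stocks)], n_dates),
        'trade_date': np.repeat(dates, n_stocks),
        'industry': np.tile(rng.choice(['bank', 'tech', 'energy'], n_stocks), n_dates),
        'mkt_cap': rng.lognormal(10, 1, n_dates * n_stocks),
        'factor_a': rng.normal(0, 1, n_dates * n_stocks),
        'factor_b': rng.normal(0, 1, n_dates * n_stocks),
    })
    feats = ['factor_a', 'factor_b']
    panel = cross_sectional_quantile_filter(panel, feats)            # 1. winsorize per day
    panel = neutralize_manual(panel, feats, 'industry', 'mkt_cap')   # 2. industry/size neutralize
    panel = cross_sectional_standardization(panel, feats)            # 3. per-day z-score
    print(panel[feats].describe())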