New environment

2025-06-01 15:59:29 +08:00
parent eb52c3673c
commit ffe9c6ae3c
47 changed files with 28572 additions and 27694 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large


@@ -1,233 +1,233 @@
import numpy as np
import pandas as pd
from scipy.stats import iqr, ks_2samp
from sklearn.preprocessing import StandardScaler
def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05, wasserstein_threshold=0.1, size=0.8,
                            log=True, val_data=None):
    dropped_features = []
    if val_data is None:
        all_dates = sorted(train_data['trade_date'].unique().tolist())  # all unique trade_dates
        split_date = all_dates[int(len(all_dates) * size)]  # split at the `size` fraction of the sorted dates
        train_data_split = train_data[train_data['trade_date'] < split_date]  # training slice
        val_data_split = train_data[train_data['trade_date'] >= split_date]  # validation slice
    else:
        train_data_split = train_data
        val_data_split = val_data
    # Measure distribution drift between the two slices
    numeric_columns = train_data_split.select_dtypes(include=['float64', 'int64']).columns
    numeric_columns = [col for col in numeric_columns if col in feature_columns]
    for feature in numeric_columns:
        ks_stat, p_value = ks_2samp(train_data_split[feature], val_data_split[feature])
        # wasserstein_dist = wasserstein_distance(train_data_split[feature], val_data_split[feature])
        # if p_value < ks_threshold or wasserstein_dist > wasserstein_threshold:
        if p_value < ks_threshold:
            dropped_features.append(feature)
    if log:
        print(f"Detected {len(dropped_features)} potentially drifting features: {dropped_features}")
    # Apply the threshold to produce the final feature list
    filtered_features = [f for f in feature_columns if f not in dropped_features]
    return filtered_features, dropped_features
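
A minimal usage sketch on synthetic data (all names below are illustrative, not from this repo): one feature is stationary, the other drifts over time, and the KS test should flag only the latter.

rng = np.random.default_rng(0)
dates = np.repeat(pd.date_range('2024-01-01', periods=100).strftime('%Y%m%d'), 50)
toy = pd.DataFrame({
    'trade_date': dates,
    'stable_feat': rng.normal(0, 1, len(dates)),
    'drift_feat': rng.normal(0, 1, len(dates)) + np.linspace(0, 3, len(dates)),
})
kept, dropped = remove_shifted_features(toy, ['stable_feat', 'drift_feat'])
# expected: 'drift_feat' lands in dropped (its KS p-value collapses), 'stable_feat' is kept
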
def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01, upper_percentile: float = 0.99,
                                     log=True):
    if not (0 <= lower_percentile < upper_percentile <= 1):
        raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")
    # Calculate lower and upper bounds based on percentiles
    lower_bound = label.quantile(lower_percentile)
    upper_bound = label.quantile(upper_percentile)
    # Filter out values outside the bounds
    filtered_label = label[(label >= lower_bound) & (label <= upper_bound)]
    # Print the number of removed outliers
    if log:
        print(f"Removed {len(label) - len(filtered_label)} outliers.")
    return filtered_label
def calculate_risk_adjusted_target(df, days=5):
    df = df.sort_values(by=['ts_code', 'trade_date'])
    df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
    df['future_open'] = df.groupby('ts_code')['open'].shift(-1)
    df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']
    df['future_volatility'] = df.groupby('ts_code')['future_return'].rolling(days, min_periods=1).std().reset_index(
        level=0, drop=True)
    # NOTE: this multiplies return by volatility; a conventional Sharpe-style ratio would divide by it
    sharpe_ratio = df['future_return'] * df['future_volatility']
    sharpe_ratio.replace([np.inf, -np.inf], np.nan, inplace=True)
    return sharpe_ratio
def calculate_score(df, days=5, lambda_param=1.0):
    def calculate_max_drawdown(prices):
        peak = prices.iloc[0]  # initialize the running peak
        max_drawdown = 0  # initialize the maximum drawdown
        for price in prices:
            if price > peak:
                peak = price  # update the peak
            else:
                drawdown = (peak - price) / peak  # current drawdown
                max_drawdown = max(max_drawdown, drawdown)  # keep the largest drawdown
        return max_drawdown
    def compute_stock_score(stock_df):
        stock_df = stock_df.sort_values(by=['trade_date'])
        future_return = stock_df['future_return']
        # volatility from the existing pct_chg field (currently unused in the score)
        volatility = stock_df['pct_chg'].rolling(days).std().shift(-days)
        max_drawdown = stock_df['close'].rolling(days).apply(calculate_max_drawdown, raw=False).shift(-days)
        score = future_return - lambda_param * max_drawdown
        return score
    # # Ensure the DataFrame is sorted by stock code and trade date
    # df = df.sort_values(by=['ts_code', 'trade_date'])
    # Compute the score for each stock separately
    df['score'] = df.groupby('ts_code').apply(compute_stock_score).reset_index(level=0, drop=True)
    return df['score']
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
    numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_features:
        raise ValueError("No numeric features found in the provided data.")
    corr_matrix = df[numeric_features].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    remaining_features = [col for col in feature_columns if col not in to_drop
                          or 'act' in col or 'af' in col]
    return remaining_features
def cross_sectional_standardization(df, features):
    df_sorted = df.sort_values(by='trade_date')  # sort by time
    df_standardized = df_sorted.copy()
    for date in df_sorted['trade_date'].unique():
        # slice the cross-section for the current date
        current_data = df_standardized[df_standardized['trade_date'] == date]
        # standardize only the requested features
        scaler = StandardScaler()
        standardized_values = scaler.fit_transform(current_data[features])
        # write the standardized values back
        df_standardized.loc[df_standardized['trade_date'] == date, features] = standardized_values
    return df_standardized
def neutralize_manual(df, features, industry_col, mkt_cap_col):
    """Hand-rolled single-variable regression for speed: within each industry,
    regress the factor on log market cap and keep the residuals."""
    for col in features:
        residuals = []
        for _, group in df.groupby(industry_col):
            if len(group) > 1:
                x = np.log(group[mkt_cap_col])  # log market cap
                y = group[col]  # factor values
                beta = np.cov(y, x)[0, 1] / np.var(x, ddof=1)  # slope (ddof=1 matches np.cov's default normalization)
                alpha = np.mean(y) - beta * np.mean(x)  # intercept
                residuals.append(y - (alpha + beta * x))  # residuals keep the group's index
            else:
                residuals.append(group[col])  # too few samples: keep the original values
        # concatenate and align by index; assigning a plain list would scramble rows,
        # because groupby iterates groups in sorted key order, not original row order
        df[col] = pd.concat(residuals).reindex(df.index)
    return df
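
A quick sanity check of the neutralization above, on small synthetic data (illustrative only): after residualization, the factor should have zero covariance with log market cap inside every industry group.

rng = np.random.default_rng(1)
demo = pd.DataFrame({
    'industry': ['A'] * 6 + ['B'] * 6,
    'mkt_cap': np.exp(rng.uniform(1, 10, 12)),
    'factor': rng.normal(0, 1, 12),
})
out = neutralize_manual(demo, ['factor'], 'industry', 'mkt_cap')
for _, g in out.groupby('industry'):
    x = np.log(demo.loc[g.index, 'mkt_cap'])
    assert abs(np.cov(g['factor'], x)[0, 1]) < 1e-10  # residuals are orthogonal to log market cap
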
def mad_filter(df, features, n=3):
    for col in features:
        median = df[col].median()
        mad = np.median(np.abs(df[col] - median))
        upper = median + n * mad
        lower = median - n * mad
        df[col] = np.clip(df[col], lower, upper)  # clip extreme values
    return df
def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
    for col in features:
        # per-date lower and upper percentiles
        lower_bound = df.groupby('trade_date')[col].transform(
            lambda x: x.quantile(lower_percentile)
        )
        upper_bound = df.groupby('trade_date')[col].transform(
            lambda x: x.quantile(upper_percentile)
        )
        # clip values outside the bounds
        df[col] = np.clip(df[col], lower_bound, upper_bound)
    return df
def iqr_filter(df, features):
    # robust per-date rescaling: (x - median) / IQR; despite the name, this rescales rather than clips
    for col in features:
        df[col] = df.groupby('trade_date')[col].transform(
            lambda x: (x - x.median()) / iqr(x) if iqr(x) != 0 else x
        )
    return df
def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
    df = df.copy()
    for col in features:
        # rolling quantiles computed within each trade_date group
        rolling_lower = df.groupby('trade_date')[col].transform(
            lambda x: x.rolling(window=min(len(x), window)).quantile(lower_quantile))
        rolling_upper = df.groupby('trade_date')[col].transform(
            lambda x: x.rolling(window=min(len(x), window)).quantile(upper_quantile))
        # clip to the rolling bounds
        df[col] = np.clip(df[col], rolling_lower, rolling_upper)
    return df
def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
    df = df.copy()
    # make sure rows are ordered by stock and time
    df = df.sort_values(['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code')
    for col in features:
        # rolling quantiles over each stock's own time series
        rolling_lower = grouped[col].rolling(window=window, min_periods=window // 2).quantile(lower_quantile)
        rolling_upper = grouped[col].rolling(window=window, min_periods=window // 2).quantile(upper_quantile)
        # the rolling result carries a MultiIndex; drop the group level to align
        rolling_lower = rolling_lower.reset_index(level=0, drop=True)
        rolling_upper = rolling_upper.reset_index(level=0, drop=True)
        # apply the clip
        df[col] = np.clip(df[col], rolling_lower, rolling_upper)
    return df
def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
    df = df.copy()
    grouped = df.groupby('trade_date')
    for col in features:
        # daily cross-sectional quantile bounds
        lower_bound = grouped[col].transform(lambda x: x.quantile(lower_quantile))
        upper_bound = grouped[col].transform(lambda x: x.quantile(upper_quantile))
        # apply the clip
        df[col] = np.clip(df[col], lower_bound, upper_bound)
    return df
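
The filters above compose; here is a sketch of one possible ordering on a toy panel (the choice and order of steps are illustrative, not prescribed by this file):

rng = np.random.default_rng(2)
n_days, n_stocks = 10, 30
panel = pd.DataFrame({
    'trade_date': np.repeat(np.arange(n_days), n_stocks),
    'f1': rng.standard_t(3, n_days * n_stocks),  # heavy-tailed factor
    'f2': rng.normal(0, 1, n_days * n_stocks),
})
panel = cross_sectional_quantile_filter(panel, ['f1', 'f2'])  # clip daily 1%/99% tails
panel = mad_filter(panel, ['f1', 'f2'])                       # cap global outliers at median +- 3*MAD
panel = cross_sectional_standardization(panel, ['f1', 'f2'])  # z-score within each trade_date
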


@@ -1,154 +1,154 @@
import numpy as np
import pandas as pd
def read_and_merge_h5_data(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date'], prefix=None):
    processed_columns = []
    for col in columns:
        if col.startswith('_'):
            processed_columns.append(col[1:])  # strip the leading underscore
        else:
            processed_columns.append(col)
    # read the requested columns from the HDF5 file
    data = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
    # restore the leading underscore on columns that were requested with one
    for col in data.columns:
        if col not in columns:  # only columns not listed as-is need the underscore back
            new_col = f'_{col}'
            data.rename(columns={col: new_col}, inplace=True)
    if prefix is not None:
        for col in data.columns:
            if col not in ['ts_code', 'trade_date']:  # never prefix the merge keys
                new_col = f'{prefix}_{col}'
                data.rename(columns={col: new_col}, inplace=True)
    # if a df was passed in, merge onto it
    if df is not None and not df.empty:
        print(f'{join} merge on {on}')
        if 'trade_date' in on:
            # make sure both DataFrames carry trade_date as datetime
            df['trade_date'] = pd.to_datetime(df['trade_date'], format='%Y%m%d')
            data['trade_date'] = pd.to_datetime(data['trade_date'], format='%Y%m%d')
        # merge on ts_code and trade_date
        merged_df = pd.merge(df, data, on=on, how=join)
    else:
        # otherwise return the freshly read data
        merged_df = data
    return merged_df
def merge_with_industry_data(df, industry_df):
    # make sure the date fields are datetime
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    industry_df['in_date'] = pd.to_datetime(industry_df['in_date'])
    # sort industry_df by in_date and ts_code
    industry_df_sorted = industry_df.sort_values(['in_date', 'ts_code'])
    # sort the original df by trade_date and ts_code
    df_sorted = df.sort_values(['trade_date', 'ts_code'])
    # backward as-of merge: take the latest membership record at or before each trade_date
    merged = pd.merge_asof(
        df_sorted,
        industry_df_sorted,
        by='ts_code',  # match within each ts_code
        left_on='trade_date',
        right_on='in_date',
        direction='backward'
    )
    # earliest in_date record per ts_code
    min_in_date_per_ts = (industry_df_sorted
                          .groupby('ts_code')
                          .first()
                          .reset_index()[['ts_code', 'l2_code']])
    # fill unmatched rows (trade_date earlier than every in_date)
    merged['l2_code'] = merged['l2_code'].fillna(
        merged['ts_code'].map(min_in_date_per_ts.set_index('ts_code')['l2_code'])
    )
    # keep the needed columns and reset the index
    result = merged.reset_index(drop=True)
    return result
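
A usage sketch for the as-of merge above, with made-up codes and dates (illustrative only):

prices = pd.DataFrame({
    'ts_code': ['000001.SZ'] * 3,
    'trade_date': ['2020-01-02', '2021-06-01', '2023-01-03'],
})
industry = pd.DataFrame({
    'ts_code': ['000001.SZ'] * 2,
    'in_date': ['2021-01-01', '2022-01-01'],  # the stock switches industries in 2022
    'l2_code': ['L2_OLD', 'L2_NEW'],
})
merged = merge_with_industry_data(prices, industry)
# the 2020 row predates every in_date, so it is back-filled with the earliest l2_code ('L2_OLD');
# the 2021 row matches the 2021 membership; the 2023 row matches the 2022 membership
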
def calculate_risk_adjusted_return(df, days=1, method='ratio', lambda_=0.5, eps=1e-8):
    """
    Compute a risk-adjusted return for each stock.
    Parameters:
    - df: DataFrame with 'ts_code' and 'close' columns, sorted by date (assumes 'trade_date' is ordered).
    - days: horizon in days for the future return (default 1).
    - method: 'ratio' (return / volatility) or 'difference' (return - lambda * volatility).
    - lambda_: risk-penalty coefficient, only used when method='difference'.
    - eps: small constant guarding against division by zero.
    Returns:
    - df with an added 'risk_adj_return' column holding the risk-adjusted return.
    """
    # make sure rows are ordered by ts_code and trade_date
    df = df.sort_values(by=['ts_code', 'trade_date'])
    # future log return over the horizon
    df['future_return'] = np.log(df.groupby('ts_code')['close'].shift(-days) / df['close'])
    # historical log return (note: computed as log(prev/current), the negative of the usual sign;
    # the sign does not affect the volatility below)
    df['historical_return'] = np.log(df.groupby('ts_code')['close'].shift(1) / df['close'])
    # volatility: rolling standard deviation of the historical returns
    df['volatility'] = df.groupby('ts_code')['historical_return'].rolling(window=days).std().reset_index(level=0,
                                                                                                         drop=True)
    # compute the risk-adjusted return with the chosen method
    if method == 'ratio':
        # return / volatility (guarding against division by zero)
        df['risk_adj_return'] = df['future_return'] / (df['volatility'] + eps)
    elif method == 'difference':
        # return - lambda * volatility
        df['risk_adj_return'] = df['future_return'] - lambda_ * df['volatility']
    else:
        raise ValueError("Invalid method. Use 'ratio' or 'difference'.")
    return df
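
An illustrative call on synthetic prices (names and parameters are examples only):

rng = np.random.default_rng(3)
px = pd.DataFrame({
    'ts_code': ['000001.SZ'] * 50,
    'trade_date': pd.date_range('2024-01-01', periods=50),
    'close': 10 * np.exp(np.cumsum(rng.normal(0, 0.02, 50))),
})
scored = calculate_risk_adjusted_return(px, days=5, method='ratio')
# the last 5 rows get NaN future_return (no close 5 days ahead);
# method='difference' instead computes future_return - lambda_ * volatility:
scored = calculate_risk_adjusted_return(px, days=5, method='difference', lambda_=0.5)
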
# import polars as pl
#
# def read_and_merge_h5_data_polars(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date']):
#     processed_columns = []
#     for col in columns:
#         if col.startswith('_'):
#             processed_columns.append(col[1:])  # strip the leading underscore
#         else:
#             processed_columns.append(col)
#
#     # read the requested columns from the HDF5 file
#     pd_df = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
#
#     # convert the Pandas DataFrame to a Polars DataFrame
#     data = pl.from_pandas(pd_df)
#
#     # restore the leading underscore on columns that were requested with one
#     data = data.rename({col: f'_{col}' for col in data.columns if col not in columns})
#
#     # if a df was passed in, merge onto it
#     if df is not None and not df.is_empty():
#         print(f'{join} merge on {on}')
#
#         # make sure both DataFrames carry trade_date as datetime
#         # df = df.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
#         # data = data.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
#
#         # merge on ts_code and trade_date
#         merged_df = df.join(data, on=on, how=join)
#     else:
#         # otherwise return the freshly read data
#         merged_df = data
#
#     return merged_df