New environment

2025-06-01 15:59:29 +08:00
parent eb52c3673c
commit ffe9c6ae3c
47 changed files with 28572 additions and 27694 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large


@@ -1,233 +1,233 @@
import numpy as np
import pandas as pd
from scipy.stats import iqr, ks_2samp
from sklearn.preprocessing import StandardScaler
def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05, wasserstein_threshold=0.1, size=0.8,
                            log=True, val_data=None):
    dropped_features = []
    if val_data is None:
        all_dates = sorted(train_data['trade_date'].unique().tolist())  # all unique trade_dates
        split_date = all_dates[int(len(all_dates) * size)]  # split at the `size` fraction of the sorted dates
        train_data_split = train_data[train_data['trade_date'] < split_date]  # training slice
        val_data_split = train_data[train_data['trade_date'] >= split_date]  # validation slice
    else:
        train_data_split = train_data
        val_data_split = val_data
    # Measure distribution drift between the two slices
    numeric_columns = train_data_split.select_dtypes(include=['float64', 'int64']).columns
    numeric_columns = [col for col in numeric_columns if col in feature_columns]
    for feature in numeric_columns:
        ks_stat, p_value = ks_2samp(train_data_split[feature], val_data_split[feature])
        # wasserstein_dist = wasserstein_distance(train_data_split[feature], val_data_split[feature])
        # if p_value < ks_threshold or wasserstein_dist > wasserstein_threshold:
        if p_value < ks_threshold:
            dropped_features.append(feature)
    if log:
        print(f"Detected {len(dropped_features)} potentially drifting features: {dropped_features}")
    # Apply the threshold to produce the final feature list
    filtered_features = [f for f in feature_columns if f not in dropped_features]
    return filtered_features, dropped_features
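
A minimal usage sketch on synthetic data (all names below are illustrative, not from this repo): one feature is stationary, the other drifts over time, and the KS test should flag only the latter.

rng = np.random.default_rng(0)
dates = np.repeat(pd.date_range('2024-01-01', periods=100).strftime('%Y%m%d'), 50)
toy = pd.DataFrame({
    'trade_date': dates,
    'stable_feat': rng.normal(0, 1, len(dates)),
    'drift_feat': rng.normal(0, 1, len(dates)) + np.linspace(0, 3, len(dates)),
})
kept, dropped = remove_shifted_features(toy, ['stable_feat', 'drift_feat'])
# expected: 'drift_feat' lands in dropped (its KS p-value collapses), 'stable_feat' is kept
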
def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01, upper_percentile: float = 0.99,
                                     log=True):
    if not (0 <= lower_percentile < upper_percentile <= 1):
        raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")
    # Calculate lower and upper bounds based on percentiles
    lower_bound = label.quantile(lower_percentile)
    upper_bound = label.quantile(upper_percentile)
    # Filter out values outside the bounds
    filtered_label = label[(label >= lower_bound) & (label <= upper_bound)]
    # Print the number of removed outliers
    if log:
        print(f"Removed {len(label) - len(filtered_label)} outliers.")
    return filtered_label
def calculate_risk_adjusted_target(df, days=5):
    df = df.sort_values(by=['ts_code', 'trade_date'])
    df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
    df['future_open'] = df.groupby('ts_code')['open'].shift(-1)
    df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']
    df['future_volatility'] = df.groupby('ts_code')['future_return'].rolling(days, min_periods=1).std().reset_index(
        level=0, drop=True)
    # NOTE: this multiplies return by volatility; a conventional Sharpe-style ratio would divide by it
    sharpe_ratio = df['future_return'] * df['future_volatility']
    sharpe_ratio.replace([np.inf, -np.inf], np.nan, inplace=True)
    return sharpe_ratio
def calculate_score(df, days=5, lambda_param=1.0):
    def calculate_max_drawdown(prices):
        peak = prices.iloc[0]  # initialize the running peak
        max_drawdown = 0  # initialize the maximum drawdown
        for price in prices:
            if price > peak:
                peak = price  # update the peak
            else:
                drawdown = (peak - price) / peak  # current drawdown
                max_drawdown = max(max_drawdown, drawdown)  # keep the largest drawdown
        return max_drawdown
    def compute_stock_score(stock_df):
        stock_df = stock_df.sort_values(by=['trade_date'])
        future_return = stock_df['future_return']
        # volatility from the existing pct_chg field (currently unused in the score)
        volatility = stock_df['pct_chg'].rolling(days).std().shift(-days)
        max_drawdown = stock_df['close'].rolling(days).apply(calculate_max_drawdown, raw=False).shift(-days)
        score = future_return - lambda_param * max_drawdown
        return score
    # # Ensure the DataFrame is sorted by stock code and trade date
    # df = df.sort_values(by=['ts_code', 'trade_date'])
    # Compute the score for each stock separately
    df['score'] = df.groupby('ts_code').apply(compute_stock_score).reset_index(level=0, drop=True)
    return df['score']
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
    numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_features:
        raise ValueError("No numeric features found in the provided data.")
    corr_matrix = df[numeric_features].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    remaining_features = [col for col in feature_columns if col not in to_drop
                          or 'act' in col or 'af' in col]
    return remaining_features
def cross_sectional_standardization(df, features):
    df_sorted = df.sort_values(by='trade_date')  # sort by time
    df_standardized = df_sorted.copy()
    for date in df_sorted['trade_date'].unique():
        # slice the cross-section for the current date
        current_data = df_standardized[df_standardized['trade_date'] == date]
        # standardize only the requested features
        scaler = StandardScaler()
        standardized_values = scaler.fit_transform(current_data[features])
        # write the standardized values back
        df_standardized.loc[df_standardized['trade_date'] == date, features] = standardized_values
    return df_standardized
def neutralize_manual(df, features, industry_col, mkt_cap_col):
    """Hand-rolled single-variable regression for speed: within each industry,
    regress the factor on log market cap and keep the residuals."""
    for col in features:
        residuals = []
        for _, group in df.groupby(industry_col):
            if len(group) > 1:
                x = np.log(group[mkt_cap_col])  # log market cap
                y = group[col]  # factor values
                beta = np.cov(y, x)[0, 1] / np.var(x, ddof=1)  # slope (ddof=1 matches np.cov's default normalization)
                alpha = np.mean(y) - beta * np.mean(x)  # intercept
                residuals.append(y - (alpha + beta * x))  # residuals keep the group's index
            else:
                residuals.append(group[col])  # too few samples: keep the original values
        # concatenate and align by index; assigning a plain list would scramble rows,
        # because groupby iterates groups in sorted key order, not original row order
        df[col] = pd.concat(residuals).reindex(df.index)
    return df
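
A quick sanity check of the neutralization above, on small synthetic data (illustrative only): after residualization, the factor should have zero covariance with log market cap inside every industry group.

rng = np.random.default_rng(1)
demo = pd.DataFrame({
    'industry': ['A'] * 6 + ['B'] * 6,
    'mkt_cap': np.exp(rng.uniform(1, 10, 12)),
    'factor': rng.normal(0, 1, 12),
})
out = neutralize_manual(demo, ['factor'], 'industry', 'mkt_cap')
for _, g in out.groupby('industry'):
    x = np.log(demo.loc[g.index, 'mkt_cap'])
    assert abs(np.cov(g['factor'], x)[0, 1]) < 1e-10  # residuals are orthogonal to log market cap
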
def mad_filter(df, features, n=3):
    for col in features:
        median = df[col].median()
        mad = np.median(np.abs(df[col] - median))
        upper = median + n * mad
        lower = median - n * mad
        df[col] = np.clip(df[col], lower, upper)  # clip extreme values
    return df
def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
    for col in features:
        # per-date lower and upper percentiles
        lower_bound = df.groupby('trade_date')[col].transform(
            lambda x: x.quantile(lower_percentile)
        )
        upper_bound = df.groupby('trade_date')[col].transform(
            lambda x: x.quantile(upper_percentile)
        )
        # clip values outside the bounds
        df[col] = np.clip(df[col], lower_bound, upper_bound)
    return df
def iqr_filter(df, features):
    # robust per-date rescaling: (x - median) / IQR; despite the name, this rescales rather than clips
    for col in features:
        df[col] = df.groupby('trade_date')[col].transform(
            lambda x: (x - x.median()) / iqr(x) if iqr(x) != 0 else x
        )
    return df
def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
    df = df.copy()
    for col in features:
        # rolling quantiles computed within each trade_date group
        rolling_lower = df.groupby('trade_date')[col].transform(
            lambda x: x.rolling(window=min(len(x), window)).quantile(lower_quantile))
        rolling_upper = df.groupby('trade_date')[col].transform(
            lambda x: x.rolling(window=min(len(x), window)).quantile(upper_quantile))
        # clip to the rolling bounds
        df[col] = np.clip(df[col], rolling_lower, rolling_upper)
    return df
def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
    df = df.copy()
    # make sure rows are ordered by stock and time
    df = df.sort_values(['ts_code', 'trade_date'])
    grouped = df.groupby('ts_code')
    for col in features:
        # rolling quantiles over each stock's own time series
        rolling_lower = grouped[col].rolling(window=window, min_periods=window // 2).quantile(lower_quantile)
        rolling_upper = grouped[col].rolling(window=window, min_periods=window // 2).quantile(upper_quantile)
        # the rolling result carries a MultiIndex; drop the group level to align
        rolling_lower = rolling_lower.reset_index(level=0, drop=True)
        rolling_upper = rolling_upper.reset_index(level=0, drop=True)
        # apply the clip
        df[col] = np.clip(df[col], rolling_lower, rolling_upper)
    return df
def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
    df = df.copy()
    grouped = df.groupby('trade_date')
    for col in features:
        # daily cross-sectional quantile bounds
        lower_bound = grouped[col].transform(lambda x: x.quantile(lower_quantile))
        upper_bound = grouped[col].transform(lambda x: x.quantile(upper_quantile))
        # apply the clip
        df[col] = np.clip(df[col], lower_bound, upper_bound)
    return df
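
The filters above compose; here is a sketch of one possible ordering on a toy panel (the choice and order of steps are illustrative, not prescribed by this file):

rng = np.random.default_rng(2)
n_days, n_stocks = 10, 30
panel = pd.DataFrame({
    'trade_date': np.repeat(np.arange(n_days), n_stocks),
    'f1': rng.standard_t(3, n_days * n_stocks),  # heavy-tailed factor
    'f2': rng.normal(0, 1, n_days * n_stocks),
})
panel = cross_sectional_quantile_filter(panel, ['f1', 'f2'])  # clip daily 1%/99% tails
panel = mad_filter(panel, ['f1', 'f2'])                       # cap global outliers at median +- 3*MAD
panel = cross_sectional_standardization(panel, ['f1', 'f2'])  # z-score within each trade_date
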


@@ -1,154 +1,154 @@
import numpy as np
import pandas as pd
def read_and_merge_h5_data(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date'], prefix=None):
    processed_columns = []
    for col in columns:
        if col.startswith('_'):
            processed_columns.append(col[1:])  # strip the leading underscore
        else:
            processed_columns.append(col)
    # read the requested columns from the HDF5 file
    data = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
    # restore the leading underscore on columns that were requested with one
    for col in data.columns:
        if col not in columns:  # only columns not listed as-is need the underscore back
            new_col = f'_{col}'
            data.rename(columns={col: new_col}, inplace=True)
    if prefix is not None:
        for col in data.columns:
            if col not in ['ts_code', 'trade_date']:  # never prefix the merge keys
                new_col = f'{prefix}_{col}'
                data.rename(columns={col: new_col}, inplace=True)
    # if a df was passed in, merge onto it
    if df is not None and not df.empty:
        print(f'{join} merge on {on}')
        if 'trade_date' in on:
            # make sure both DataFrames carry trade_date as datetime
            df['trade_date'] = pd.to_datetime(df['trade_date'], format='%Y%m%d')
            data['trade_date'] = pd.to_datetime(data['trade_date'], format='%Y%m%d')
        # merge on ts_code and trade_date
        merged_df = pd.merge(df, data, on=on, how=join)
    else:
        # otherwise return the freshly read data
        merged_df = data
    return merged_df
def merge_with_industry_data(df, industry_df):
    # make sure the date fields are datetime
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    industry_df['in_date'] = pd.to_datetime(industry_df['in_date'])
    # sort industry_df by in_date and ts_code
    industry_df_sorted = industry_df.sort_values(['in_date', 'ts_code'])
    # sort the original df by trade_date and ts_code
    df_sorted = df.sort_values(['trade_date', 'ts_code'])
    # backward as-of merge: take the latest membership record at or before each trade_date
    merged = pd.merge_asof(
        df_sorted,
        industry_df_sorted,
        by='ts_code',  # match within each ts_code
        left_on='trade_date',
        right_on='in_date',
        direction='backward'
    )
    # earliest in_date record per ts_code
    min_in_date_per_ts = (industry_df_sorted
                          .groupby('ts_code')
                          .first()
                          .reset_index()[['ts_code', 'l2_code']])
    # fill unmatched rows (trade_date earlier than every in_date)
    merged['l2_code'] = merged['l2_code'].fillna(
        merged['ts_code'].map(min_in_date_per_ts.set_index('ts_code')['l2_code'])
    )
    # keep the needed columns and reset the index
    result = merged.reset_index(drop=True)
    return result
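
A usage sketch for the as-of merge above, with made-up codes and dates (illustrative only):

prices = pd.DataFrame({
    'ts_code': ['000001.SZ'] * 3,
    'trade_date': ['2020-01-02', '2021-06-01', '2023-01-03'],
})
industry = pd.DataFrame({
    'ts_code': ['000001.SZ'] * 2,
    'in_date': ['2021-01-01', '2022-01-01'],  # the stock switches industries in 2022
    'l2_code': ['L2_OLD', 'L2_NEW'],
})
merged = merge_with_industry_data(prices, industry)
# the 2020 row predates every in_date, so it is back-filled with the earliest l2_code ('L2_OLD');
# the 2021 row matches the 2021 membership; the 2023 row matches the 2022 membership
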
def calculate_risk_adjusted_return(df, days=1, method='ratio', lambda_=0.5, eps=1e-8):
    """
    Compute a risk-adjusted return for each stock.
    Parameters:
    - df: DataFrame with 'ts_code' and 'close' columns, sorted by date (assumes 'trade_date' is ordered).
    - days: horizon in days for the future return (default 1).
    - method: 'ratio' (return / volatility) or 'difference' (return - lambda * volatility).
    - lambda_: risk-penalty coefficient, only used when method='difference'.
    - eps: small constant guarding against division by zero.
    Returns:
    - df with an added 'risk_adj_return' column holding the risk-adjusted return.
    """
    # make sure rows are ordered by ts_code and trade_date
    df = df.sort_values(by=['ts_code', 'trade_date'])
    # future log return over the horizon
    df['future_return'] = np.log(df.groupby('ts_code')['close'].shift(-days) / df['close'])
    # historical log return (note: computed as log(prev/current), the negative of the usual sign;
    # the sign does not affect the volatility below)
    df['historical_return'] = np.log(df.groupby('ts_code')['close'].shift(1) / df['close'])
    # volatility: rolling standard deviation of the historical returns
    df['volatility'] = df.groupby('ts_code')['historical_return'].rolling(window=days).std().reset_index(level=0,
                                                                                                         drop=True)
    # compute the risk-adjusted return with the chosen method
    if method == 'ratio':
        # return / volatility (guarding against division by zero)
        df['risk_adj_return'] = df['future_return'] / (df['volatility'] + eps)
    elif method == 'difference':
        # return - lambda * volatility
        df['risk_adj_return'] = df['future_return'] - lambda_ * df['volatility']
    else:
        raise ValueError("Invalid method. Use 'ratio' or 'difference'.")
    return df
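
An illustrative call on synthetic prices (names and parameters are examples only):

rng = np.random.default_rng(3)
px = pd.DataFrame({
    'ts_code': ['000001.SZ'] * 50,
    'trade_date': pd.date_range('2024-01-01', periods=50),
    'close': 10 * np.exp(np.cumsum(rng.normal(0, 0.02, 50))),
})
scored = calculate_risk_adjusted_return(px, days=5, method='ratio')
# the last 5 rows get NaN future_return (no close 5 days ahead);
# method='difference' instead computes future_return - lambda_ * volatility:
scored = calculate_risk_adjusted_return(px, days=5, method='difference', lambda_=0.5)
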
# import polars as pl
#
# def read_and_merge_h5_data_polars(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date']):
#     processed_columns = []
#     for col in columns:
#         if col.startswith('_'):
#             processed_columns.append(col[1:])  # strip the leading underscore
#         else:
#             processed_columns.append(col)
#
#     # read the requested columns from the HDF5 file
#     pd_df = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
#
#     # convert the Pandas DataFrame to a Polars DataFrame
#     data = pl.from_pandas(pd_df)
#
#     # restore the leading underscore on columns that were requested with one
#     data = data.rename({col: f'_{col}' for col in data.columns if col not in columns})
#
#     # if a df was passed in, merge onto it
#     if df is not None and not df.is_empty():
#         print(f'{join} merge on {on}')
#
#         # make sure both DataFrames carry trade_date as datetime
#         # df = df.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
#         # data = data.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
#
#         # merge on ts_code and trade_date
#         merged_df = df.join(data, on=on, how=join)
#     else:
#         # otherwise return the freshly read data
#         merged_df = data
#
#     return merged_df