新环境
This commit is contained in:
BIN
main/utils/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
main/utils/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
main/utils/__pycache__/factor.cpython-313.pyc
Normal file
BIN
main/utils/__pycache__/factor.cpython-313.pyc
Normal file
Binary file not shown.
BIN
main/utils/__pycache__/factor_processor.cpython-313.pyc
Normal file
BIN
main/utils/__pycache__/factor_processor.cpython-313.pyc
Normal file
Binary file not shown.
BIN
main/utils/__pycache__/utils.cpython-313.pyc
Normal file
BIN
main/utils/__pycache__/utils.cpython-313.pyc
Normal file
Binary file not shown.
1560
main/utils/factor.py
1560
main/utils/factor.py
File diff suppressed because it is too large
Load Diff
@@ -1,233 +1,233 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from scipy.stats import ks_2samp
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
|
||||
def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05, wasserstein_threshold=0.1, size=0.8,
                            log=True, val_data=None):
    """Drop features whose distribution drifts between a train and validation split.

    A two-sample Kolmogorov-Smirnov test is run per numeric feature; features
    whose p-value falls below ``ks_threshold`` are considered drifted and removed.

    Parameters
    ----------
    train_data : pd.DataFrame
        Data containing a 'trade_date' column plus the candidate features.
    feature_columns : list[str]
        Candidate feature names to screen.
    ks_threshold : float
        p-value cutoff for the KS test (smaller p-value => drifted).
    wasserstein_threshold : float
        Currently unused; kept for backward compatibility with existing callers.
    size : float
        Fraction of the sorted trade dates assigned to the train split when
        ``val_data`` is None.
    log : bool
        When True, print a summary of the dropped features.
    val_data : pd.DataFrame | None
        Optional explicit validation set; when given, ``train_data`` is used as-is.

    Returns
    -------
    tuple[list[str], list[str]]
        (features kept, features dropped as drifted)
    """
    dropped_features = []

    if val_data is None:
        # Split chronologically: earlier dates train, later dates validate.
        all_dates = sorted(train_data['trade_date'].unique().tolist())
        split_date = all_dates[int(len(all_dates) * size)]
        train_data_split = train_data[train_data['trade_date'] < split_date]
        val_data_split = train_data[train_data['trade_date'] >= split_date]
    else:
        train_data_split = train_data
        val_data_split = val_data

    # Only numeric columns can be KS-tested.
    numeric_columns = train_data_split.select_dtypes(include=['float64', 'int64']).columns
    numeric_columns = [col for col in numeric_columns if col in feature_columns]
    for feature in numeric_columns:
        # The KS statistic itself is unused; only the p-value drives the decision.
        _, p_value = ks_2samp(train_data_split[feature], val_data_split[feature])
        if p_value < ks_threshold:
            dropped_features.append(feature)
    if log:
        print(f"检测到 {len(dropped_features)} 个可能漂移的特征: {dropped_features}")

    # Keep the original ordering of feature_columns.
    filtered_features = [f for f in feature_columns if f not in dropped_features]

    return filtered_features, dropped_features
|
||||
|
||||
|
||||
def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01, upper_percentile: float = 0.99,
                                     log=True):
    """Return *label* restricted to its [lower_percentile, upper_percentile] quantile band.

    Raises ValueError when the percentile pair is not strictly ordered within [0, 1].
    """
    if not (0 <= lower_percentile < upper_percentile <= 1):
        raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")

    # Quantile bounds computed from the label itself.
    bounds = label.quantile([lower_percentile, upper_percentile])
    lo, hi = bounds.iloc[0], bounds.iloc[1]

    # Keep values inside the band, inclusive on both ends.
    trimmed = label[label.between(lo, hi)]

    if log:
        print(f"Removed {len(label) - len(trimmed)} outliers.")
    return trimmed
|
||||
|
||||
|
||||
def calculate_risk_adjusted_target(df, days=5):
    """Build a Sharpe-ratio-style training target: forward return / forward volatility.

    The forward return is measured from the next day's open to the close ``days``
    bars ahead; its per-stock rolling standard deviation is used as the risk term.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'ts_code', 'trade_date', 'open' and 'close' columns.
    days : int
        Forward horizon in trading days.

    Returns
    -------
    pd.Series
        Risk-adjusted returns; ±inf from zero volatility is mapped to NaN.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])

    df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
    df['future_open'] = df.groupby('ts_code')['open'].shift(-1)
    df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']

    df['future_volatility'] = df.groupby('ts_code')['future_return'].rolling(days, min_periods=1).std().reset_index(
        level=0, drop=True)
    # BUG FIX: a Sharpe-style ratio divides return by volatility; the previous
    # code multiplied, inverting the risk adjustment. (The pre-existing inf
    # scrubbing below only makes sense for division — zero volatility.)
    sharpe_ratio = df['future_return'] / df['future_volatility']
    sharpe_ratio = sharpe_ratio.replace([np.inf, -np.inf], np.nan)

    return sharpe_ratio
|
||||
|
||||
|
||||
def calculate_score(df, days=5, lambda_param=1.0):
    """Score each row as future return minus a forward-drawdown penalty.

    score = future_return - lambda_param * max_drawdown(close over the next
    ``days`` bars), computed independently per stock.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'ts_code', 'trade_date', 'close' and 'future_return' columns.
    days : int
        Look-ahead window for the drawdown penalty.
    lambda_param : float
        Weight of the drawdown penalty.

    Returns
    -------
    pd.Series of scores aligned with ``df``'s index (also stored in df['score']).
    """

    def calculate_max_drawdown(prices):
        # Largest peak-to-trough decline, as a fraction of the running peak.
        peak = prices.iloc[0]
        max_drawdown = 0
        for price in prices:
            if price > peak:
                peak = price
            else:
                drawdown = (peak - price) / peak
                max_drawdown = max(max_drawdown, drawdown)
        return max_drawdown

    def compute_stock_score(stock_df):
        # Per-stock score; the rolling drawdown is shifted back so each row is
        # penalized by the drawdown realized over its own *future* window.
        # (Removed a dead per-group volatility computation that was never used.)
        stock_df = stock_df.sort_values(by=['trade_date'])
        future_return = stock_df['future_return']
        max_drawdown = stock_df['close'].rolling(days).apply(calculate_max_drawdown, raw=False).shift(-days)
        return future_return - lambda_param * max_drawdown

    df['score'] = df.groupby('ts_code').apply(compute_stock_score).reset_index(level=0, drop=True)

    return df['score']
|
||||
|
||||
|
||||
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
    """Drop features whose absolute pairwise correlation exceeds ``threshold``.

    Columns whose name contains 'act' or 'af' are always retained regardless
    of correlation. Raises ValueError when no numeric feature is present.
    """
    numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_features:
        raise ValueError("No numeric features found in the provided data.")

    abs_corr = df[numeric_features].corr().abs()
    # Strict upper triangle so each unordered pair is inspected exactly once.
    mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_tri = abs_corr.where(mask)

    redundant = {c for c in upper_tri.columns if (upper_tri[c] > threshold).any()}
    return [col for col in feature_columns
            if col not in redundant or 'act' in col or 'af' in col]
|
||||
|
||||
|
||||
def cross_sectional_standardization(df, features):
    """Z-score the given features within each trade_date cross-section.

    A fresh StandardScaler is fitted per date, so every cross-section is
    standardized independently (population-std z-scores, per sklearn).
    """
    ordered = df.sort_values(by='trade_date')
    out = ordered.copy()

    for date in ordered['trade_date'].unique():
        day_mask = out['trade_date'] == date
        # Standardize only the requested feature columns for this date.
        out.loc[day_mask, features] = StandardScaler().fit_transform(out.loc[day_mask, features])

    return out
|
||||
|
||||
|
||||
def neutralize_manual(df, features, industry_col, mkt_cap_col):
    """Neutralize factor columns against industry and log market cap.

    Within each industry group an OLS line of factor value on log(market cap)
    is fitted and the factor column is replaced by its residuals; singleton
    groups keep their original values.

    Fixes over the previous version:
    * residuals are re-assigned by index — the old code assigned a plain list
      positionally, scrambling values whenever groupby order differed from the
      row order;
    * beta uses a consistent ddof for covariance and variance — the old code
      mixed np.cov's ddof=1 with np.var's ddof=0, biasing the slope.

    Mutates and returns ``df``.
    """
    for col in features:
        pieces = []
        for _, group in df.groupby(industry_col):
            y = group[col]
            if len(group) > 1:
                x = np.log(group[mkt_cap_col])  # log market cap regressor
                # OLS slope/intercept with matching ddof=0 moments.
                beta = np.cov(y, x, ddof=0)[0, 1] / np.var(x)
                alpha = y.mean() - beta * x.mean()
                pieces.append(y - (alpha + beta * x))
            else:
                pieces.append(y)  # too few samples to regress; keep as-is

        # Align residuals back to the original rows by index, not by position.
        df[col] = pd.concat(pieces).reindex(df.index)

    return df
|
||||
|
||||
|
||||
def mad_filter(df, features, n=3):
    """Winsorize each feature at median ± n * MAD (median absolute deviation).

    Mutates and returns ``df``.
    """
    for col in features:
        center = df[col].median()
        mad = np.median(np.abs(df[col] - center))
        lo, hi = center - n * mad, center + n * mad
        df[col] = df[col].clip(lo, hi)  # cap extreme values
    return df
|
||||
|
||||
|
||||
def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
    """Clip each feature to its per-trade_date [lower, upper] percentile band.

    Mutates and returns ``df``.
    """
    grouped = df.groupby('trade_date')
    for col in features:
        # Daily percentile bounds, broadcast back onto every row of that day.
        lo = grouped[col].transform(lambda s: s.quantile(lower_percentile))
        hi = grouped[col].transform(lambda s: s.quantile(upper_percentile))
        df[col] = np.clip(df[col], lo, hi)
    return df
|
||||
|
||||
|
||||
from scipy.stats import iqr
|
||||
|
||||
|
||||
def iqr_filter(df, features):
    """Robust-scale each feature within its trade_date cross-section.

    Each value is centered on the daily median and divided by the daily IQR;
    days with zero IQR are passed through unscaled. Mutates and returns ``df``.
    """
    for col in features:
        def _scale(s):
            spread = iqr(s)  # compute once per group
            return (s - s.median()) / spread if spread != 0 else s

        df[col] = df.groupby('trade_date')[col].transform(_scale)
    return df
|
||||
|
||||
|
||||
def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
    """Clip features to rolling quantile bands computed inside each trade_date group.

    Returns a clipped copy; the input frame is left untouched.
    """
    out = df.copy()
    for col in features:
        grp = out.groupby('trade_date')[col]
        # The rolling window never exceeds the size of the date group.
        lo = grp.transform(lambda s: s.rolling(window=min(len(s), window)).quantile(lower_quantile))
        hi = grp.transform(lambda s: s.rolling(window=min(len(s), window)).quantile(upper_quantile))
        out[col] = np.clip(out[col], lo, hi)
    return out
|
||||
|
||||
|
||||
def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
    """Clip each feature to per-stock rolling quantile bands over time.

    Returns a copy sorted by (ts_code, trade_date); the input is untouched.
    """
    out = df.copy().sort_values(['ts_code', 'trade_date'])
    by_stock = out.groupby('ts_code')
    for col in features:
        # Per-stock rolling quantiles; reset_index drops the ts_code level so
        # the bounds realign with out's row index.
        lo = (by_stock[col]
              .rolling(window=window, min_periods=window // 2)
              .quantile(lower_quantile)
              .reset_index(level=0, drop=True))
        hi = (by_stock[col]
              .rolling(window=window, min_periods=window // 2)
              .quantile(upper_quantile)
              .reset_index(level=0, drop=True))
        out[col] = np.clip(out[col], lo, hi)
    return out
|
||||
|
||||
|
||||
def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
|
||||
df = df.copy()
|
||||
grouped = df.groupby('trade_date')
|
||||
for col in features:
|
||||
# 计算每日截面的分位数边界
|
||||
lower_bound = grouped[col].transform(lambda x: x.quantile(lower_quantile))
|
||||
upper_bound = grouped[col].transform(lambda x: x.quantile(upper_quantile))
|
||||
# 应用 clip
|
||||
df[col] = np.clip(df[col], lower_bound, upper_bound)
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from scipy.stats import ks_2samp
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
|
||||
def remove_shifted_features(train_data, feature_columns, ks_threshold=0.05, wasserstein_threshold=0.1, size=0.8,
|
||||
log=True, val_data=None):
|
||||
dropped_features = []
|
||||
|
||||
if val_data is None:
|
||||
all_dates = sorted(train_data['trade_date'].unique().tolist()) # 获取所有唯一的 trade_date
|
||||
split_date = all_dates[int(len(all_dates) * size)] # 划分点为倒数第 validation_days 天
|
||||
train_data_split = train_data[train_data['trade_date'] < split_date] # 训练集
|
||||
val_data_split = train_data[train_data['trade_date'] >= split_date] # 验证集
|
||||
else:
|
||||
train_data_split = train_data
|
||||
val_data_split = val_data
|
||||
|
||||
# **统计数据漂移**
|
||||
numeric_columns = train_data_split.select_dtypes(include=['float64', 'int64']).columns
|
||||
numeric_columns = [col for col in numeric_columns if col in feature_columns]
|
||||
for feature in numeric_columns:
|
||||
ks_stat, p_value = ks_2samp(train_data_split[feature], val_data_split[feature])
|
||||
# wasserstein_dist = wasserstein_distance(train_data_split[feature], val_data_split[feature])
|
||||
|
||||
# if p_value < ks_threshold or wasserstein_dist > wasserstein_threshold:
|
||||
if p_value < ks_threshold:
|
||||
dropped_features.append(feature)
|
||||
if log:
|
||||
print(f"检测到 {len(dropped_features)} 个可能漂移的特征: {dropped_features}")
|
||||
|
||||
# **应用阈值进行最终筛选**
|
||||
filtered_features = [f for f in feature_columns if f not in dropped_features]
|
||||
|
||||
return filtered_features, dropped_features
|
||||
|
||||
|
||||
def remove_outliers_label_percentile(label: pd.Series, lower_percentile: float = 0.01, upper_percentile: float = 0.99,
|
||||
log=True):
|
||||
if not (0 <= lower_percentile < upper_percentile <= 1):
|
||||
raise ValueError("Percentile values must satisfy 0 <= lower_percentile < upper_percentile <= 1.")
|
||||
|
||||
# Calculate lower and upper bounds based on percentiles
|
||||
lower_bound = label.quantile(lower_percentile)
|
||||
upper_bound = label.quantile(upper_percentile)
|
||||
|
||||
# Filter out values outside the bounds
|
||||
filtered_label = label[(label >= lower_bound) & (label <= upper_bound)]
|
||||
|
||||
# Print the number of removed outliers
|
||||
if log:
|
||||
print(f"Removed {len(label) - len(filtered_label)} outliers.")
|
||||
return filtered_label
|
||||
|
||||
|
||||
def calculate_risk_adjusted_target(df, days=5):
|
||||
df = df.sort_values(by=['ts_code', 'trade_date'])
|
||||
|
||||
df['future_close'] = df.groupby('ts_code')['close'].shift(-days)
|
||||
df['future_open'] = df.groupby('ts_code')['open'].shift(-1)
|
||||
df['future_return'] = (df['future_close'] - df['future_open']) / df['future_open']
|
||||
|
||||
df['future_volatility'] = df.groupby('ts_code')['future_return'].rolling(days, min_periods=1).std().reset_index(
|
||||
level=0, drop=True)
|
||||
sharpe_ratio = df['future_return'] * df['future_volatility']
|
||||
sharpe_ratio.replace([np.inf, -np.inf], np.nan, inplace=True)
|
||||
|
||||
return sharpe_ratio
|
||||
|
||||
|
||||
def calculate_score(df, days=5, lambda_param=1.0):
|
||||
def calculate_max_drawdown(prices):
|
||||
peak = prices.iloc[0] # 初始化峰值
|
||||
max_drawdown = 0 # 初始化最大回撤
|
||||
|
||||
for price in prices:
|
||||
if price > peak:
|
||||
peak = price # 更新峰值
|
||||
else:
|
||||
drawdown = (peak - price) / peak # 计算当前回撤
|
||||
max_drawdown = max(max_drawdown, drawdown) # 更新最大回撤
|
||||
|
||||
return max_drawdown
|
||||
|
||||
def compute_stock_score(stock_df):
|
||||
stock_df = stock_df.sort_values(by=['trade_date'])
|
||||
future_return = stock_df['future_return']
|
||||
# 使用已有的 pct_chg 字段计算波动率
|
||||
volatility = stock_df['pct_chg'].rolling(days).std().shift(-days)
|
||||
max_drawdown = stock_df['close'].rolling(days).apply(calculate_max_drawdown, raw=False).shift(-days)
|
||||
score = future_return - lambda_param * max_drawdown
|
||||
return score
|
||||
|
||||
# # 确保 DataFrame 按照股票代码和交易日期排序
|
||||
# df = df.sort_values(by=['ts_code', 'trade_date'])
|
||||
|
||||
# 对每个股票分别计算 score
|
||||
df['score'] = df.groupby('ts_code').apply(compute_stock_score).reset_index(level=0, drop=True)
|
||||
|
||||
return df['score']
|
||||
|
||||
|
||||
def remove_highly_correlated_features(df, feature_columns, threshold=0.9):
|
||||
numeric_features = df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()
|
||||
if not numeric_features:
|
||||
raise ValueError("No numeric features found in the provided data.")
|
||||
|
||||
corr_matrix = df[numeric_features].corr().abs()
|
||||
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
||||
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
|
||||
remaining_features = [col for col in feature_columns if col not in to_drop
|
||||
or 'act' in col or 'af' in col]
|
||||
return remaining_features
|
||||
|
||||
|
||||
def cross_sectional_standardization(df, features):
|
||||
df_sorted = df.sort_values(by='trade_date') # 按时间排序
|
||||
df_standardized = df_sorted.copy()
|
||||
|
||||
for date in df_sorted['trade_date'].unique():
|
||||
# 获取当前时间点的数据
|
||||
current_data = df_standardized[df_standardized['trade_date'] == date]
|
||||
|
||||
# 只对指定特征进行标准化
|
||||
scaler = StandardScaler()
|
||||
standardized_values = scaler.fit_transform(current_data[features])
|
||||
|
||||
# 将标准化结果重新赋值回去
|
||||
df_standardized.loc[df_standardized['trade_date'] == date, features] = standardized_values
|
||||
|
||||
return df_standardized
|
||||
|
||||
|
||||
def neutralize_manual(df, features, industry_col, mkt_cap_col):
|
||||
""" 手动实现简单回归以提升速度 """
|
||||
|
||||
for col in features:
|
||||
residuals = []
|
||||
for _, group in df.groupby(industry_col):
|
||||
if len(group) > 1:
|
||||
x = np.log(group[mkt_cap_col]) # 市值对数
|
||||
y = group[col] # 因子值
|
||||
beta = np.cov(y, x)[0, 1] / np.var(x) # 计算斜率
|
||||
alpha = np.mean(y) - beta * np.mean(x) # 计算截距
|
||||
resid = y - (alpha + beta * x) # 计算残差
|
||||
residuals.extend(resid)
|
||||
else:
|
||||
residuals.extend(group[col]) # 样本不足时保留原值
|
||||
|
||||
df[col] = residuals
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def mad_filter(df, features, n=3):
|
||||
for col in features:
|
||||
median = df[col].median()
|
||||
mad = np.median(np.abs(df[col] - median))
|
||||
upper = median + n * mad
|
||||
lower = median - n * mad
|
||||
df[col] = np.clip(df[col], lower, upper) # 截断极值
|
||||
return df
|
||||
|
||||
|
||||
def percentile_filter(df, features, lower_percentile=0.01, upper_percentile=0.99):
|
||||
for col in features:
|
||||
# 按日期分组计算上下百分位数
|
||||
lower_bound = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.quantile(lower_percentile)
|
||||
)
|
||||
upper_bound = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.quantile(upper_percentile)
|
||||
)
|
||||
# 截断超出范围的值
|
||||
df[col] = np.clip(df[col], lower_bound, upper_bound)
|
||||
return df
|
||||
|
||||
|
||||
from scipy.stats import iqr
|
||||
|
||||
|
||||
def iqr_filter(df, features):
|
||||
for col in features:
|
||||
df[col] = df.groupby('trade_date')[col].transform(
|
||||
lambda x: (x - x.median()) / iqr(x) if iqr(x) != 0 else x
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
def quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
|
||||
df = df.copy()
|
||||
for col in features:
|
||||
# 计算 rolling 统计量,需要按日期进行 groupby
|
||||
rolling_lower = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.rolling(window=min(len(x), window)).quantile(lower_quantile))
|
||||
rolling_upper = df.groupby('trade_date')[col].transform(
|
||||
lambda x: x.rolling(window=min(len(x), window)).quantile(upper_quantile))
|
||||
|
||||
# 对数据进行裁剪
|
||||
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def time_series_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99, window=60):
|
||||
df = df.copy()
|
||||
# 确保按股票和时间排序
|
||||
df = df.sort_values(['ts_code', 'trade_date'])
|
||||
grouped = df.groupby('ts_code')
|
||||
for col in features:
|
||||
# 对每个股票的时间序列计算滚动分位数
|
||||
rolling_lower = grouped[col].rolling(window=window, min_periods=window // 2).quantile(lower_quantile)
|
||||
rolling_upper = grouped[col].rolling(window=window, min_periods=window // 2).quantile(upper_quantile)
|
||||
# rolling结果带有多重索引,需要对齐
|
||||
rolling_lower = rolling_lower.reset_index(level=0, drop=True)
|
||||
rolling_upper = rolling_upper.reset_index(level=0, drop=True)
|
||||
# 应用 clip
|
||||
df[col] = np.clip(df[col], rolling_lower, rolling_upper)
|
||||
return df
|
||||
|
||||
|
||||
def cross_sectional_quantile_filter(df, features, lower_quantile=0.01, upper_quantile=0.99):
    """Clip each feature to its daily cross-sectional quantile band.

    Returns a clipped copy; the input frame is left untouched.
    """
    clipped = df.copy()
    daily = clipped.groupby('trade_date')
    for col in features:
        # Per-date quantile bounds broadcast back onto each row of that date.
        lo = daily[col].transform(lambda s: s.quantile(lower_quantile))
        hi = daily[col].transform(lambda s: s.quantile(upper_quantile))
        clipped[col] = np.clip(clipped[col], lo, hi)
    return clipped
|
||||
@@ -1,154 +1,154 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def read_and_merge_h5_data(h5_filename, key, columns, df=None, join='left', on=None, prefix=None):
    """Read selected columns from an HDF5 store and optionally merge into *df*.

    Column names in ``columns`` may carry a leading underscore: the underscore
    is stripped before the store lookup and re-applied to the loaded column
    afterwards, so callers can request underscored aliases of raw store columns.

    Parameters
    ----------
    h5_filename : str
        Path of the HDF5 file.
    key : str
        Store key to read.
    columns : list[str]
        Requested columns (possibly '_'-prefixed).
    df : pd.DataFrame | None
        Existing frame to merge into (left side of the merge).
    join : str
        Merge strategy passed to pd.merge as ``how``.
    on : list[str] | None
        Merge keys; defaults to ['ts_code', 'trade_date'].
        (BUG FIX: the default is now a None sentinel instead of a mutable
        list literal shared across calls.)
    prefix : str | None
        Optional prefix added to every non-key column of the loaded data.

    Returns
    -------
    pd.DataFrame — the merged frame, or the loaded data when *df* is None/empty.
    """
    if on is None:
        on = ['ts_code', 'trade_date']

    # Strip leading '_' so the store lookup uses the real column name.
    processed_columns = [col[1:] if col.startswith('_') else col for col in columns]

    data = pd.read_hdf(h5_filename, key=key, columns=processed_columns)

    # Re-apply the '_' alias to columns that were requested underscored.
    for col in data.columns:
        if col not in columns:
            data.rename(columns={col: f'_{col}'}, inplace=True)

    if prefix is not None:
        for col in data.columns:
            if col not in ['ts_code', 'trade_date']:  # never rename the merge keys
                data.rename(columns={col: f'{prefix}_{col}'}, inplace=True)

    # Merge only when a non-empty frame was supplied.
    if df is not None and not df.empty:
        print(f'{join} merge on {on}')
        if 'trade_date' in on:
            # Normalize both sides to datetime so the keys compare equal.
            df['trade_date'] = pd.to_datetime(df['trade_date'], format='%Y%m%d')
            data['trade_date'] = pd.to_datetime(data['trade_date'], format='%Y%m%d')
        merged_df = pd.merge(df, data, on=on, how=join)
    else:
        merged_df = data

    return merged_df
|
||||
|
||||
|
||||
def merge_with_industry_data(df, industry_df):
    """Attach each row's industry code (l2_code) as of its trade_date.

    Uses a backward merge_asof on in_date per ts_code; rows whose trade_date
    precedes every in_date of that stock fall back to the stock's earliest
    known l2_code.
    """
    # merge_asof needs datetime keys on both sides.
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    industry_df['in_date'] = pd.to_datetime(industry_df['in_date'])

    # Both sides must be globally sorted on their asof key.
    industry_sorted = industry_df.sort_values(['in_date', 'ts_code'])
    left_sorted = df.sort_values(['trade_date', 'ts_code'])

    merged = pd.merge_asof(
        left_sorted,
        industry_sorted,
        by='ts_code',              # match within each stock
        left_on='trade_date',
        right_on='in_date',
        direction='backward',      # latest membership at or before trade_date
    )

    # Earliest membership per stock, used as the fallback below.
    earliest = (industry_sorted
                .groupby('ts_code')
                .first()
                .reset_index()[['ts_code', 'l2_code']])

    # Rows older than any in_date got NaN above; fill with the earliest code.
    fallback = merged['ts_code'].map(earliest.set_index('ts_code')['l2_code'])
    merged['l2_code'] = merged['l2_code'].fillna(fallback)

    return merged.reset_index(drop=True)
|
||||
|
||||
|
||||
def calculate_risk_adjusted_return(df, days=1, method='ratio', lambda_=0.5, eps=1e-8):
    """Compute a per-stock risk-adjusted forward return.

    Adds four columns to (a sorted copy of) ``df``:
      * 'future_return'     — forward log return over ``days`` bars;
      * 'historical_return' — one-day backward log return (volatility input);
      * 'volatility'        — rolling std of 'historical_return' over ``days``;
      * 'risk_adj_return'   — the risk-adjusted target.

    method='ratio'      -> future_return / (volatility + eps)
    method='difference' -> future_return - lambda_ * volatility

    NOTE(review): with the default days=1 the rolling std is taken over a
    single observation and is always NaN, making risk_adj_return NaN too —
    confirm callers always pass days >= 2.

    Raises
    ------
    ValueError if ``method`` is neither 'ratio' nor 'difference'.
    """
    df = df.sort_values(by=['ts_code', 'trade_date'])

    # Forward log return over the horizon.
    df['future_return'] = np.log(df.groupby('ts_code')['close'].shift(-days) / df['close'])

    # One-day backward log return, the input to the volatility estimate.
    df['historical_return'] = np.log(df.groupby('ts_code')['close'].shift(1) / df['close'])

    # Per-stock rolling volatility; drop the group level to realign the index.
    df['volatility'] = (df.groupby('ts_code')['historical_return']
                        .rolling(window=days).std()
                        .reset_index(level=0, drop=True))

    if method == 'ratio':
        # eps keeps the division finite when volatility is zero.
        df['risk_adj_return'] = df['future_return'] / (df['volatility'] + eps)
    elif method == 'difference':
        df['risk_adj_return'] = df['future_return'] - lambda_ * df['volatility']
    else:
        raise ValueError("Invalid method. Use 'ratio' or 'difference'.")

    return df
|
||||
|
||||
# import polars as pl
|
||||
#
|
||||
# def read_and_merge_h5_data_polars(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date']):
|
||||
# processed_columns = []
|
||||
# for col in columns:
|
||||
# if col.startswith('_'):
|
||||
# processed_columns.append(col[1:]) # 去掉下划线
|
||||
# else:
|
||||
# processed_columns.append(col)
|
||||
#
|
||||
# # 从 HDF5 文件读取数据,选择需要的列
|
||||
# pd_df = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
|
||||
#
|
||||
# # 将 Pandas DataFrame 转换为 Polars DataFrame
|
||||
# data = pl.from_pandas(pd_df)
|
||||
#
|
||||
# # 修改列名,如果列名以前有 _,加上 _
|
||||
# data = data.rename({col: f'_{col}' for col in data.columns if col not in columns})
|
||||
#
|
||||
# # 如果传入的 df 不为空,则进行合并
|
||||
# if df is not None and not df.is_empty():
|
||||
# print(f'{join} merge on {on}')
|
||||
#
|
||||
# # 确保两个 DataFrame 都有 ts_code 和 trade_date 列
|
||||
# # df = df.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
|
||||
# # data = data.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
|
||||
#
|
||||
# # 根据 ts_code 和 trade_date 合并
|
||||
# merged_df = df.join(data, on=on, how=join)
|
||||
# else:
|
||||
# # 如果 df 为空,则直接返回读取的数据
|
||||
# merged_df = data
|
||||
#
|
||||
# return merged_df
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def read_and_merge_h5_data(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date'], prefix=None):
|
||||
processed_columns = []
|
||||
for col in columns:
|
||||
if col.startswith('_'):
|
||||
processed_columns.append(col[1:]) # 去掉下划线
|
||||
else:
|
||||
processed_columns.append(col)
|
||||
|
||||
# 从 HDF5 文件读取数据,选择需要的列
|
||||
data = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
|
||||
|
||||
# 修改列名,如果列名以前有 _,加上 _
|
||||
for col in data.columns:
|
||||
if col not in columns: # 只有不在 columns 中的列才需要加下划线
|
||||
new_col = f'_{col}'
|
||||
data.rename(columns={col: new_col}, inplace=True)
|
||||
|
||||
if prefix is not None:
|
||||
for col in data.columns:
|
||||
if col not in ['ts_code', 'trade_date']: # 只有不在 columns 中的列才需要加下划线
|
||||
new_col = f'{prefix}_{col}'
|
||||
data.rename(columns={col: new_col}, inplace=True)
|
||||
|
||||
# 如果传入的 df 不为空,则进行合并
|
||||
if df is not None and not df.empty:
|
||||
print(f'{join} merge on {on}')
|
||||
if 'trade_date' in on:
|
||||
# 确保两个 DataFrame 都有 ts_code 和 trade_date 列
|
||||
df['trade_date'] = pd.to_datetime(df['trade_date'], format='%Y%m%d')
|
||||
data['trade_date'] = pd.to_datetime(data['trade_date'], format='%Y%m%d')
|
||||
|
||||
# 根据 ts_code 和 trade_date 合并
|
||||
merged_df = pd.merge(df, data, on=on, how=join)
|
||||
else:
|
||||
# 如果 df 为空,则直接返回读取的数据
|
||||
merged_df = data
|
||||
|
||||
return merged_df
|
||||
|
||||
|
||||
def merge_with_industry_data(df, industry_df):
|
||||
# 确保日期字段是 datetime 类型
|
||||
df['trade_date'] = pd.to_datetime(df['trade_date'])
|
||||
industry_df['in_date'] = pd.to_datetime(industry_df['in_date'])
|
||||
|
||||
# 对 industry_df 按 ts_code 和 in_date 排序
|
||||
industry_df_sorted = industry_df.sort_values(['in_date', 'ts_code'])
|
||||
|
||||
# 对原始 df 按 ts_code 和 trade_date 排序
|
||||
df_sorted = df.sort_values(['trade_date', 'ts_code'])
|
||||
|
||||
# 使用 merge_asof 进行向后合并
|
||||
merged = pd.merge_asof(
|
||||
df_sorted,
|
||||
industry_df_sorted,
|
||||
by='ts_code', # 按 ts_code 分组
|
||||
left_on='trade_date',
|
||||
right_on='in_date',
|
||||
direction='backward'
|
||||
)
|
||||
|
||||
# 获取每个 ts_code 的最早 in_date 记录
|
||||
min_in_date_per_ts = (industry_df_sorted
|
||||
.groupby('ts_code')
|
||||
.first()
|
||||
.reset_index()[['ts_code', 'l2_code']])
|
||||
|
||||
# 填充未匹配到的记录(trade_date 早于所有 in_date 的情况)
|
||||
merged['l2_code'] = merged['l2_code'].fillna(
|
||||
merged['ts_code'].map(min_in_date_per_ts.set_index('ts_code')['l2_code'])
|
||||
)
|
||||
|
||||
# 保留需要的列并重置索引
|
||||
result = merged.reset_index(drop=True)
|
||||
return result
|
||||
|
||||
|
||||
def calculate_risk_adjusted_return(df, days=1, method='ratio', lambda_=0.5, eps=1e-8):
|
||||
"""
|
||||
计算单只股票的风险调整收益。
|
||||
|
||||
参数:
|
||||
- df: DataFrame,包含 'ts_code' 和 'close' 列,按日期排序(假设 'trade_date' 已排序)。
|
||||
- days: 预测未来多少天的收益,默认1天。
|
||||
- method: 'ratio'(收益/波动率) 或 'difference'(收益 - λ * 波动率)。
|
||||
- lambda_: 风险惩罚系数,仅当 method='difference' 时有效。
|
||||
- eps: 防止除零的小常数。
|
||||
|
||||
返回:
|
||||
- df:添加 'risk_adj_return' 列的 DataFrame,表示风险调整后的收益。
|
||||
"""
|
||||
# 确保数据按 ts_code 和 trade_date 排序
|
||||
df = df.sort_values(by=['ts_code', 'trade_date'])
|
||||
|
||||
# 计算未来的对数收益率
|
||||
df['future_return'] = np.log(df.groupby('ts_code')['close'].shift(-days) / df['close'])
|
||||
|
||||
# 计算历史收益(对数收益率)
|
||||
df['historical_return'] = np.log(df.groupby('ts_code')['close'].shift(1) / df['close'])
|
||||
|
||||
# 计算波动率(历史收益的标准差)
|
||||
df['volatility'] = df.groupby('ts_code')['historical_return'].rolling(window=days).std().reset_index(level=0,
|
||||
drop=True)
|
||||
|
||||
# 根据选择的 method 计算风险调整收益
|
||||
if method == 'ratio':
|
||||
# 收益/波动率(防止除零)
|
||||
df['risk_adj_return'] = df['future_return'] / (df['volatility'] + eps)
|
||||
elif method == 'difference':
|
||||
# 收益 - λ * 波动率
|
||||
df['risk_adj_return'] = df['future_return'] - lambda_ * df['volatility']
|
||||
else:
|
||||
raise ValueError("Invalid method. Use 'ratio' or 'difference'.")
|
||||
|
||||
return df
|
||||
|
||||
# import polars as pl
|
||||
#
|
||||
# def read_and_merge_h5_data_polars(h5_filename, key, columns, df=None, join='left', on=['ts_code', 'trade_date']):
|
||||
# processed_columns = []
|
||||
# for col in columns:
|
||||
# if col.startswith('_'):
|
||||
# processed_columns.append(col[1:]) # 去掉下划线
|
||||
# else:
|
||||
# processed_columns.append(col)
|
||||
#
|
||||
# # 从 HDF5 文件读取数据,选择需要的列
|
||||
# pd_df = pd.read_hdf(h5_filename, key=key, columns=processed_columns)
|
||||
#
|
||||
# # 将 Pandas DataFrame 转换为 Polars DataFrame
|
||||
# data = pl.from_pandas(pd_df)
|
||||
#
|
||||
# # 修改列名,如果列名以前有 _,加上 _
|
||||
# data = data.rename({col: f'_{col}' for col in data.columns if col not in columns})
|
||||
#
|
||||
# # 如果传入的 df 不为空,则进行合并
|
||||
# if df is not None and not df.is_empty():
|
||||
# print(f'{join} merge on {on}')
|
||||
#
|
||||
# # 确保两个 DataFrame 都有 ts_code 和 trade_date 列
|
||||
# # df = df.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
|
||||
# # data = data.with_columns(pl.col('trade_date').str.strptime(pl.Datetime, format='%Y%m%d'))
|
||||
#
|
||||
# # 根据 ts_code 和 trade_date 合并
|
||||
# merged_df = df.join(data, on=on, how=join)
|
||||
# else:
|
||||
# # 如果 df 为空,则直接返回读取的数据
|
||||
# merged_df = data
|
||||
#
|
||||
# return merged_df
|
||||
|
||||
Reference in New Issue
Block a user