# Source file: NewStock/main/factor/polars_complex_factors.py
# (repository file-view header: 649 lines, 23 KiB, Python;
#  history entry dated 2025-10-13 21:42:35 +08:00)
"""
复杂组合因子 - 使用Polars实现
包含复杂的组合因子和高级因子计算
"""
import polars as pl
import numpy as np
from typing import Dict, List, Optional, Any
from operator_framework import StockWiseOperator, DateWiseOperator, OperatorConfig
# 时间序列因子
class LargeFlowMomentumCorrelationOperator(StockWiseOperator):
    """Rolling correlation between large-order net flow and price momentum.

    For each stock the operator builds two series:

    * the ``n``-row rolling sum of the large + extra-large order net volume
      multiplied by ``close`` (a net traded value), and
    * the ``n``-row percentage change of ``close``,

    and then takes their rolling correlation over ``m`` rows. The result is
    written to the column ``lg_flow_mom_corr_{n}_{m}``.
    """

    def __init__(self, n: int = 20, m: int = 60):
        """Configure the operator.

        Args:
            n: Window (in rows) for both the rolling flow sum and the
                price momentum.
            m: Window (in rows) for the rolling correlation.
        """
        config = OperatorConfig(
            name=f"lg_flow_mom_corr_{n}_{m}",
            description=f"{n}日大单资金流与{m}日价格动量相关性",
            required_columns=['buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol',
                              'close', 'vol'],
            output_columns=[f'lg_flow_mom_corr_{n}_{m}'],
            parameters={'n': n, 'm': m}
        )
        super().__init__(config)
        self.n = n
        self.m = m

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the flow/momentum correlation column to ``stock_df``.

        Args:
            stock_df: Frame containing the columns listed in
                ``required_columns``.
                # assumes rows belong to one stock in time order — TODO confirm
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            ``stock_df`` with ``lg_flow_mom_corr_{n}_{m}`` added.
        """
        # Net large-order value: (large + extra-large buys - sells) * close.
        net_lg_flow_val = (
            (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') -
             pl.col('sell_lg_vol') - pl.col('sell_elg_vol')) * pl.col('close')
        )
        # Polars rolling functions take ``window_size``; ``window`` is not a
        # valid keyword and raises TypeError.
        rolling_net_lg_flow = net_lg_flow_val.rolling_sum(window_size=self.n)
        # Price momentum over the same horizon.
        price_mom = pl.col('close').pct_change(self.n)
        # Rolling correlation is a top-level Polars function, not an Expr
        # method.
        correlation = pl.rolling_corr(
            rolling_net_lg_flow, price_mom, window_size=self.m
        )
        return stock_df.with_columns(
            correlation.alias(f'lg_flow_mom_corr_{self.n}_{self.m}')
        )
class LargeBuyConsolidationOperator(StockWiseOperator):
    """Rolling mean of the large-order net flow ratio.

    The ratio is (large + extra-large buy volume - large + extra-large sell
    volume) divided by total ``vol``; its ``n``-row rolling mean is written
    to ``lg_buy_consolidation_{n}``.

    NOTE(review): ``vol_quantile`` is stored on the instance and in the
    config parameters but is not used by ``apply_stock``; the name suggests a
    low-volatility filter was intended — confirm with the factor's author.
    """

    def __init__(self, n: int = 20, vol_quantile: float = 0.2):
        """Configure the operator.

        Args:
            n: Rolling window length in rows.
            vol_quantile: Recorded in the config; currently unused in the
                computation.
        """
        config = OperatorConfig(
            name=f"lg_buy_consolidation_{n}",
            description=f"{n}日大单买入盘整期",
            required_columns=['close', 'buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol',
                              'sell_elg_vol', 'vol'],
            output_columns=[f'lg_buy_consolidation_{n}'],
            parameters={'n': n, 'vol_quantile': vol_quantile}
        )
        super().__init__(config)
        self.n = n
        self.vol_quantile = vol_quantile

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append ``lg_buy_consolidation_{n}`` to ``stock_df``.

        Args:
            stock_df: Frame containing the columns listed in
                ``required_columns``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            ``stock_df`` with the rolling mean flow ratio added.
        """
        epsilon = 1e-8  # keeps the denominator away from zero when vol == 0
        # Net large-order volume as a share of total volume.
        net_lg_flow_ratio = (
            (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') -
             pl.col('sell_lg_vol') - pl.col('sell_elg_vol')) /
            (pl.col('vol') + epsilon)
        )
        # Polars expects ``window_size``; the previous ``window=`` keyword
        # raised TypeError. The unused rolling-std expression (which had the
        # same problem) has been dropped.
        rolling_mean_ratio = net_lg_flow_ratio.rolling_mean(window_size=self.n)
        return stock_df.with_columns(
            rolling_mean_ratio.alias(f'lg_buy_consolidation_{self.n}')
        )
class IntradayLargeFlowCorrelationOperator(StockWiseOperator):
    """Placeholder for the intraday-trend / large-order-flow correlation.

    The factor itself is not implemented: ``apply_stock`` only appends an
    all-null float column named ``intraday_lg_flow_corr_{n}`` so that the
    output column exists.
    """

    def __init__(self, n: int = 20):
        """Register the operator configuration for window length ``n``."""
        factor_name = f"intraday_lg_flow_corr_{n}"
        needed = ['high', 'low', 'close', 'buy_lg_vol', 'buy_elg_vol',
                  'sell_lg_vol', 'sell_elg_vol']
        super().__init__(OperatorConfig(
            name=factor_name,
            description=f"{n}日日内趋势与大单流相关性",
            required_columns=needed,
            output_columns=[f'intraday_lg_flow_corr_{n}'],
            parameters={'n': n}
        ))
        self.n = n

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``stock_df`` with a null float column for this factor."""
        # No computation is performed; the column is a typed null literal.
        out_col = f'intraday_lg_flow_corr_{self.n}'
        null_float = pl.lit(None).cast(float)
        return stock_df.with_columns(null_float.alias(out_col))
class ProfitPressureOperator(StockWiseOperator):
    """Profit-pressure index.

    Output ``profit_pressure`` equals
    ``winner_rate * 0.5 * (margin_85 + margin_95)`` where each margin is
    ``close / (cost_XXpct + 1e-8) - 1``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="profit_pressure",
            description="获利压力指数",
            required_columns=['close', 'cost_85pct', 'cost_95pct', 'winner_rate'],
            output_columns=['profit_pressure'],
            parameters={}
        ))

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``stock_df`` with the ``profit_pressure`` column added."""
        tiny = 1e-8  # guards the division against a zero cost level
        close = pl.col('close')
        # Relative distance of the close above each cost level.
        margin_over_85 = (close / (pl.col('cost_85pct') + tiny)) - 1
        margin_over_95 = (close / (pl.col('cost_95pct') + tiny)) - 1
        combined = margin_over_85 + margin_over_95
        # Scale by winner_rate and 0.5 (same evaluation order as before).
        index_expr = pl.col('winner_rate') * 0.5 * combined
        return stock_df.with_columns(index_expr.alias('profit_pressure'))
class UnderwaterResistanceOperator(StockWiseOperator):
    """Resistance from positions held at a loss.

    Output ``underwater_resistance`` equals
    ``(1 - winner_rate) * max(0, cost_15pct - close) / (close + 1e-8)``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="underwater_resistance",
            description="套牢盘阻力",
            required_columns=['close', 'winner_rate', 'cost_15pct'],
            output_columns=['underwater_resistance'],
            parameters={}
        ))

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``stock_df`` with the ``underwater_resistance`` column added."""
        tiny = 1e-8
        close = pl.col('close')
        # Share of holders that are not in profit.
        trapped_share = 1.0 - pl.col('winner_rate')
        # How far cost_15pct sits above the close, floored at zero and
        # expressed relative to the close.
        gap_above_price = pl.max_horizontal(0, pl.col('cost_15pct') - close)
        relative_gap = gap_above_price / (close + tiny)
        return stock_df.with_columns(
            (trapped_share * relative_gap).alias('underwater_resistance')
        )
class ProfitDecayOperator(StockWiseOperator):
    """Ratio of the n-row return to the n-row change in ``winner_rate``.

    Output column: ``profit_decay_{n}``.
    """

    def __init__(self, n: int = 20):
        """Register the configuration for look-back length ``n``."""
        factor_name = f"profit_decay_{n}"
        super().__init__(OperatorConfig(
            name=factor_name,
            description=f"{n}日盈利预期衰减",
            required_columns=['close', 'winner_rate'],
            output_columns=[f'profit_decay_{n}'],
            parameters={'n': n}
        ))
        self.n = n

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``stock_df`` with ``profit_decay_{n}`` appended."""
        lag = self.n
        # Percentage change of close over ``lag`` rows.
        price_return = pl.col('close').pct_change(lag)
        # Absolute change of winner_rate over the same span.
        winner_delta = pl.col('winner_rate').diff(lag)
        # A small constant is added to the denominator before dividing.
        ratio = price_return / (winner_delta + 1e-8)
        return stock_df.with_columns(ratio.alias(f'profit_decay_{lag}'))
class PullbackStrongOperator(StockWiseOperator):
    """Pullback depth relative to recent gain.

    Computes ``(rolling_max(high, n) - close) / rolling_max(high, n)`` and
    divides it by the ``m``-row gain of ``close`` (plus ``1e-8``). The result
    is written to ``pullback_strong_{n}_{m}``.

    NOTE(review): ``gain_thresh`` is stored and exposed in the config
    parameters but is not used in ``apply_stock`` — confirm whether a
    strong-stock filter was intended.
    """

    def __init__(self, n: int = 20, m: int = 20, gain_thresh: float = 0.2):
        """Configure the operator.

        Args:
            n: Window (rows) for the rolling maximum of ``high``.
            m: Look-back (rows) for the recent gain of ``close``.
            gain_thresh: Recorded in the config; currently unused in the
                computation.
        """
        config = OperatorConfig(
            name=f"pullback_strong_{n}_{m}",
            description=f"{n}{m}期强势股回调深度",
            required_columns=['high', 'close'],
            output_columns=[f'pullback_strong_{n}_{m}'],
            parameters={'n': n, 'm': m, 'gain_thresh': gain_thresh}
        )
        super().__init__(config)
        self.n = n
        self.m = m
        self.gain_thresh = gain_thresh

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append ``pullback_strong_{n}_{m}`` to ``stock_df``.

        Args:
            stock_df: Frame containing ``high`` and ``close``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            ``stock_df`` with the pullback factor column added.
        """
        # Polars rolling functions take ``window_size``; the previous
        # ``window=`` keyword raised TypeError.
        high_n = pl.col('high').rolling_max(window_size=self.n)
        # Drop from the rolling high, as a fraction of that high.
        pullback_depth = (high_n - pl.col('close')) / high_n
        # Gain of close over the last ``m`` rows.
        recent_gain = (pl.col('close') / pl.col('close').shift(self.m)) - 1
        # Small constant keeps the denominator from being exactly zero.
        pullback_factor = pullback_depth / (recent_gain + 1e-8)
        return stock_df.with_columns(
            pullback_factor.alias(f'pullback_strong_{self.n}_{self.m}')
        )
class HurstExponentFlowOperator(StockWiseOperator):
    """Placeholder for a Hurst exponent of a money-flow column.

    No Hurst computation is performed: ``apply_stock`` appends an all-null
    float column named ``hurst_{flow_col}_{n}``.
    """

    def __init__(self, n: int = 60, flow_col: str = 'net_mf_vol'):
        """Register the configuration for window ``n`` and column ``flow_col``."""
        factor_name = f"hurst_{flow_col}_{n}"
        super().__init__(OperatorConfig(
            name=factor_name,
            description=f"{n}{flow_col}Hurst指数",
            required_columns=[flow_col],
            output_columns=[f'hurst_{flow_col}_{n}'],
            parameters={'n': n, 'flow_col': flow_col}
        ))
        self.n = n
        self.flow_col = flow_col

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``stock_df`` with a null float column for this factor."""
        # The real calculation is not implemented; emit a typed null literal.
        out_col = f'hurst_{self.flow_col}_{self.n}'
        null_float = pl.lit(None).cast(float)
        return stock_df.with_columns(null_float.alias(out_col))
class VolWeightedHistoricalPositionOperator(StockWiseOperator):
    """Historical price position weighted by relative volume.

    The position ``(close - his_low) / (his_high - his_low)`` is clipped to
    ``[0, 1]`` and multiplied by ``vol / rolling_mean(vol, n)``. The result
    is written to ``vol_wgt_hist_pos_{n}``.
    """

    def __init__(self, n: int = 20):
        """Configure the operator.

        Args:
            n: Window (rows) for the rolling mean of ``vol``.
        """
        config = OperatorConfig(
            name=f"vol_wgt_hist_pos_{n}",
            description=f"{n}日成交量加权历史位置",
            required_columns=['close', 'his_high', 'his_low', 'vol'],
            output_columns=[f'vol_wgt_hist_pos_{n}'],
            parameters={'n': n}
        )
        super().__init__(config)
        self.n = n

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append ``vol_wgt_hist_pos_{n}`` to ``stock_df``.

        Args:
            stock_df: Frame containing the columns listed in
                ``required_columns``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            ``stock_df`` with the weighted position column added.
        """
        # Position of close inside the [his_low, his_high] range.
        hist_pos = (pl.col('close') - pl.col('his_low')) / (pl.col('his_high') - pl.col('his_low'))
        hist_pos = hist_pos.clip(0, 1)
        # Polars rolling functions take ``window_size``; the previous
        # ``window=`` keyword raised TypeError.
        rolling_mean_vol = pl.col('vol').rolling_mean(window_size=self.n)
        # Current volume relative to its recent average.
        vol_rel_strength = pl.col('vol') / rolling_mean_vol
        weighted_pos = hist_pos * vol_rel_strength
        return stock_df.with_columns(weighted_pos.alias(f'vol_wgt_hist_pos_{self.n}'))
# 横截面因子
class CrossSectionalRankOperator(DateWiseOperator):
    """Normalised dense rank of an arbitrary column.

    Output ``cs_rank_{column}`` is the dense rank of ``column`` divided by
    the maximum dense rank in the frame handed to ``apply_date``.
    """

    def __init__(self, column: str, ascending: bool = True):
        """Register the configuration.

        Args:
            column: Name of the column to rank.
            ascending: When True the smallest value receives rank 1;
                when False the ranking is reversed.
        """
        out_col = f'cs_rank_{column}'
        super().__init__(OperatorConfig(
            name=f"cs_rank_{column}",
            description=f"{column}横截面排名",
            required_columns=[column],
            output_columns=[out_col],
            parameters={'column': column, 'ascending': ascending}
        ))
        self.column = column
        self.ascending = ascending

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``date_df`` with the normalised rank column appended."""
        dense_rank = pl.col(self.column).rank(
            method='dense', descending=not self.ascending
        )
        # Dividing by the top rank maps the result into (0, 1].
        return date_df.with_columns(
            (dense_rank / dense_rank.max()).alias(f'cs_rank_{self.column}')
        )
class CrossSectionalNetLargeFlowRankOperator(DateWiseOperator):
    """Normalised descending dense rank of the large-order net value.

    The ranked quantity is (large + extra-large buy volume minus large +
    extra-large sell volume) multiplied by ``close``. Output column:
    ``cs_rank_net_lg_flow_val``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_net_lg_flow_val",
            description="横截面大单净额排名",
            required_columns=['buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol', 'close'],
            output_columns=['cs_rank_net_lg_flow_val'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``date_df`` with ``cs_rank_net_lg_flow_val`` appended."""
        # Net large-order volume, converted to value via the close price.
        net_volume = (
            pl.col('buy_lg_vol') + pl.col('buy_elg_vol') -
            pl.col('sell_lg_vol') - pl.col('sell_elg_vol')
        )
        net_value = net_volume * pl.col('close')
        # Largest value gets rank 1; divide by the top rank to normalise.
        dense_rank = net_value.rank(method='dense', descending=True)
        return date_df.with_columns(
            (dense_rank / dense_rank.max()).alias('cs_rank_net_lg_flow_val')
        )
class CrossSectionalFlowDivergenceRankOperator(DateWiseOperator):
    """Normalised descending dense rank of large-vs-small flow divergence.

    Divergence is the large-order net flow ratio minus the small-order net
    flow ratio, both taken relative to ``vol + 1e-8``. Output column:
    ``cs_rank_flow_divergence``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_flow_divergence",
            description="横截面流向背离度排名",
            required_columns=['buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'buy_elg_vol',
                              'sell_lg_vol', 'sell_elg_vol', 'vol'],
            output_columns=['cs_rank_flow_divergence'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``date_df`` with ``cs_rank_flow_divergence`` appended."""
        tiny = 1e-8
        # Large + extra-large net volume as a share of total volume.
        large_net = (
            pl.col('buy_lg_vol') + pl.col('buy_elg_vol') -
            pl.col('sell_lg_vol') - pl.col('sell_elg_vol')
        )
        large_share = large_net / (pl.col('vol') + tiny)
        # Small-order net volume as a share of total volume.
        small_net = pl.col('buy_sm_vol') - pl.col('sell_sm_vol')
        small_share = small_net / (pl.col('vol') + tiny)
        gap = large_share - small_share
        # Largest divergence gets rank 1; normalise by the top rank.
        dense_rank = gap.rank(method='dense', descending=True)
        return date_df.with_columns(
            (dense_rank / dense_rank.max()).alias('cs_rank_flow_divergence')
        )
class CrossSectionalRelativeProfitMarginRankOperator(DateWiseOperator):
    """Normalised descending dense rank of ``(close - weight_avg) / close``.

    Output column: ``cs_rank_rel_profit_margin``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_rel_profit_margin",
            description="横截面相对盈利幅度排名",
            required_columns=['close', 'weight_avg'],
            output_columns=['cs_rank_rel_profit_margin'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``date_df`` with ``cs_rank_rel_profit_margin`` appended."""
        close = pl.col('close')
        # Distance of the close from weight_avg, relative to the close.
        margin = (close - pl.col('weight_avg')) / close
        # Largest margin gets rank 1; normalise by the top rank.
        dense_rank = margin.rank(method='dense', descending=True)
        return date_df.with_columns(
            (dense_rank / dense_rank.max()).alias('cs_rank_rel_profit_margin')
        )
class CrossSectionalCostBreadthRankOperator(DateWiseOperator):
    """Normalised descending dense rank of the cost-distribution width.

    Width is ``(cost_85pct - cost_15pct) / (weight_avg + 1e-8)``. Output
    column: ``cs_rank_cost_breadth``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_cost_breadth",
            description="横截面成本分布宽度排名",
            required_columns=['cost_85pct', 'cost_15pct', 'weight_avg'],
            output_columns=['cs_rank_cost_breadth'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``date_df`` with ``cs_rank_cost_breadth`` appended."""
        tiny = 1e-8
        spread = pl.col('cost_85pct') - pl.col('cost_15pct')
        # Spread between the two cost levels, scaled by weight_avg.
        width = spread / (pl.col('weight_avg') + tiny)
        # Widest distribution gets rank 1; normalise by the top rank.
        dense_rank = width.rank(method='dense', descending=True)
        return date_df.with_columns(
            (dense_rank / dense_rank.max()).alias('cs_rank_cost_breadth')
        )
class CrossSectionalWinnerRateRankOperator(DateWiseOperator):
    """Normalised descending dense rank of ``winner_rate``.

    Output column: ``cs_rank_winner_rate``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_winner_rate",
            description="横截面获利盘比例排名",
            required_columns=['winner_rate'],
            output_columns=['cs_rank_winner_rate'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``date_df`` with ``cs_rank_winner_rate`` appended."""
        # Highest winner_rate gets rank 1; normalise by the top rank.
        dense_rank = pl.col('winner_rate').rank(method='dense', descending=True)
        return date_df.with_columns(
            (dense_rank / dense_rank.max()).alias('cs_rank_winner_rate')
        )
class CrossSectionalVolumeRatioRankOperator(DateWiseOperator):
    """Normalised descending dense rank of ``volume_ratio``.

    Output column: ``cs_rank_volume_ratio``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_volume_ratio",
            description="横截面量比排名",
            required_columns=['volume_ratio'],
            output_columns=['cs_rank_volume_ratio'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``date_df`` with ``cs_rank_volume_ratio`` appended."""
        # Highest volume_ratio gets rank 1; normalise by the top rank.
        dense_rank = pl.col('volume_ratio').rank(method='dense', descending=True)
        return date_df.with_columns(
            (dense_rank / dense_rank.max()).alias('cs_rank_volume_ratio')
        )
# 复杂组合因子
class ComplexFactorDEAPOperator(StockWiseOperator):
    """Composite factor built from a fixed arithmetic tree of other factors.

    The output ``complex_factor_deap_1`` is ``A / B - C`` where every
    division goes through a guarded divide that yields 0 when the
    denominator's absolute value is not greater than ``1e-8``.
    """

    def __init__(self):
        """Register the fixed (parameter-free) operator configuration."""
        inputs = ['pullback_strong_20_20', 'log_close', 'industry_return_5',
                  'vol_adj_roc_20', 'vol_drop_profit_cnt_5', 'nonlinear_mv_volume',
                  'alpha_007', 'lg_buy_consolidation_20', 'net_mf_vol', 'std_return_5',
                  'arbr', 'industry_act_factor5', 'industry_act_factor1', 'low_cost_dev',
                  'mv_weighted_turnover', 'act_factor4', 'vol', 'lg_elg_buy_prop',
                  'intraday_lg_flow_corr_20']
        super().__init__(OperatorConfig(
            name="complex_factor_deap_1",
            description="DEAP复杂组合因子",
            required_columns=inputs,
            output_columns=['complex_factor_deap_1'],
            parameters={}
        ))

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Return ``stock_df`` with ``complex_factor_deap_1`` appended.

        Any exception raised while building or evaluating the expression is
        reported with ``print`` and the column is filled with nulls instead.
        """
        def safe_divide(a, b, default_val=0):
            # Divide only where |b| exceeds the tolerance; otherwise fall
            # back to ``default_val``.
            return pl.when(b.abs() > 1e-8).then(a / b).otherwise(default_val)

        c = pl.col
        try:
            # Component D: pullback * (log_close / industry_return_5)
            #              - (roc + drop_cnt) / (nonlinear_mv_volume - alpha_007)
            price_over_industry = safe_divide(c('log_close'), c('industry_return_5'))
            d_left = c('pullback_strong_20_20') * price_over_industry
            d_denominator = c('nonlinear_mv_volume') - c('alpha_007')
            d_numerator = c('vol_adj_roc_20') + c('vol_drop_profit_cnt_5')
            d_right = safe_divide(d_numerator, d_denominator)
            comp_d = d_left - d_right

            # Component A: D * consolidation + consolidation + pullback
            scaled_d = comp_d * c('lg_buy_consolidation_20')
            with_consolidation = scaled_d + c('lg_buy_consolidation_20')
            comp_a = with_consolidation + c('pullback_strong_20_20')

            # Component F: (net_mf_vol + std_return_5) * (arbr - industry_act_factor5)
            f_sum = c('net_mf_vol') + c('std_return_5')
            f_diff = c('arbr') - c('industry_act_factor5')
            comp_f = f_sum * f_diff

            # Component H: (industry_act_factor1 + low_cost_dev)
            #              + mv_weighted_turnover * act_factor4
            h_sum = c('industry_act_factor1') + c('low_cost_dev')
            h_product = c('mv_weighted_turnover') * c('act_factor4')
            comp_h = h_sum + h_product

            # Component B: (F + vol + H) / lg_elg_buy_prop
            f_plus_vol = comp_f + c('vol')
            b_numerator = f_plus_vol + comp_h
            comp_b = safe_divide(b_numerator, c('lg_elg_buy_prop'))

            # Component C: intraday correlation (nulls as 0) divided twice
            # by lg_elg_buy_prop.
            c_once = safe_divide(
                c('intraday_lg_flow_corr_20').fill_null(0),
                c('lg_elg_buy_prop')
            )
            comp_c = safe_divide(c_once, c('lg_elg_buy_prop'))

            # Final factor: A / B - C
            a_over_b = safe_divide(comp_a, comp_b)
            factor_expr = a_over_b - comp_c
            return stock_df.with_columns(factor_expr.alias('complex_factor_deap_1'))
        except Exception as e:
            # Best-effort fallback: report and emit an all-null float column.
            print(f"Error calculating complex_factor_deap_1: {e}")
            return stock_df.with_columns(pl.lit(None).cast(float).alias('complex_factor_deap_1'))
# 因子集合
# Default operator pipeline used by apply_complex_factors().
# Every operator is instantiated once, at import time, with its default
# parameters (CrossSectionalRankOperator is given the 'close' column).
# Order matters: ComplexFactorDEAPOperator lists 'pullback_strong_20_20',
# 'lg_buy_consolidation_20' and 'intraday_lg_flow_corr_20' among its
# required columns, and those are the default output columns of operators
# that appear earlier in this list.
COMPLEX_OPERATORS = [
    # Per-stock (StockWiseOperator) factors
    LargeFlowMomentumCorrelationOperator(),
    LargeBuyConsolidationOperator(),
    IntradayLargeFlowCorrelationOperator(),
    ProfitPressureOperator(),
    UnderwaterResistanceOperator(),
    ProfitDecayOperator(),
    PullbackStrongOperator(),
    HurstExponentFlowOperator(),
    VolWeightedHistoricalPositionOperator(),
    # Per-date (DateWiseOperator) rank factors
    CrossSectionalRankOperator('close'),
    CrossSectionalNetLargeFlowRankOperator(),
    CrossSectionalFlowDivergenceRankOperator(),
    CrossSectionalRelativeProfitMarginRankOperator(),
    CrossSectionalCostBreadthRankOperator(),
    CrossSectionalWinnerRateRankOperator(),
    CrossSectionalVolumeRatioRankOperator(),
    # Composite factor that consumes columns produced above
    ComplexFactorDEAPOperator(),
]
def apply_complex_factors(df: pl.DataFrame, operators: List = None) -> pl.DataFrame:
    """Run a sequence of operators over ``df`` and return the final frame.

    Args:
        df: Input Polars DataFrame.
        operators: Callables applied in order, each receiving the previous
            result. When None, the module-level ``COMPLEX_OPERATORS`` list
            is used. An empty list leaves ``df`` untouched.

    Returns:
        The frame produced by the last operator (or ``df`` itself when no
        operator runs).
    """
    pipeline = COMPLEX_OPERATORS if operators is None else operators
    current = df
    for step in pipeline:
        # Each operator is invoked as a callable on the running result.
        current = step(current)
    return current
# 主应用函数
def apply_all_factors(df: pl.DataFrame,
                      factor_categories: Optional[List[str]] = None) -> pl.DataFrame:
    """Apply the requested factor categories to ``df`` in a fixed order.

    Categories are always applied in this order, regardless of their order
    in ``factor_categories``: money_flow, chip, volatility, volume,
    technical, sentiment, momentum, complex. Names that match none of these
    are ignored.

    Each sibling factor module is imported only when its category is
    requested, so a missing or broken module for an unrequested category no
    longer prevents the call from succeeding.

    Args:
        df: Input Polars DataFrame.
        factor_categories: Category names to apply. None means all
            categories.

    Returns:
        The DataFrame after every requested category has been applied.

    Raises:
        ImportError: If the module backing a requested category cannot be
            imported.
    """
    if factor_categories is None:
        factor_categories = ['money_flow', 'chip', 'volatility', 'volume',
                             'technical', 'sentiment', 'momentum', 'complex']
    result_df = df

    # Imports are deferred into each branch so that only the modules that
    # are actually needed get loaded.
    if 'money_flow' in factor_categories:
        from polars_money_flow_factors import apply_money_flow_factors
        result_df = apply_money_flow_factors(result_df)
    if 'chip' in factor_categories:
        from polars_chip_factors import apply_chip_distribution_factors
        result_df = apply_chip_distribution_factors(result_df)
    if 'volatility' in factor_categories:
        from polars_volatility_factors import apply_volatility_factors
        result_df = apply_volatility_factors(result_df)
    if 'volume' in factor_categories:
        from polars_volume_factors import apply_volume_factors
        result_df = apply_volume_factors(result_df)
    if 'technical' in factor_categories:
        from polars_technical_factors import apply_technical_factors
        result_df = apply_technical_factors(result_df)
    if 'sentiment' in factor_categories:
        from polars_sentiment_factors import apply_sentiment_factors
        result_df = apply_sentiment_factors(result_df)
    if 'momentum' in factor_categories:
        from polars_momentum_factors import apply_momentum_factors
        result_df = apply_momentum_factors(result_df)
    if 'complex' in factor_categories:
        # Defined in this module; runs last.
        result_df = apply_complex_factors(result_df)
    return result_df