Files
NewStock/main/factor/polars_complex_factors.py

649 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
复杂组合因子 - 使用Polars实现
包含复杂的组合因子和高级因子计算
"""
import polars as pl
import numpy as np
from typing import Dict, List, Optional, Any
from operator_framework import StockWiseOperator, DateWiseOperator, OperatorConfig
# 时间序列因子
class LargeFlowMomentumCorrelationOperator(StockWiseOperator):
    """Rolling correlation between large-order net flow and price momentum.

    The result is written to the column ``lg_flow_mom_corr_{n}_{m}``.
    """

    def __init__(self, n: int = 20, m: int = 60):
        """Build the operator configuration.

        Args:
            n: Window of the rolling sum of the net large-order flow, and also
                the lookback of the price momentum (``pct_change(n)``).
            m: Window of the rolling correlation between those two series.
        """
        factor_name = f"lg_flow_mom_corr_{n}_{m}"
        config = OperatorConfig(
            name=factor_name,
            description=f"{n}日大单资金流与{m}日价格动量相关性",
            required_columns=['buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol',
                              'close', 'vol'],
            output_columns=[factor_name],
            parameters={'n': n, 'm': m}
        )
        super().__init__(config)
        self.n = n
        self.m = m

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the flow/momentum correlation column to ``stock_df``.

        Args:
            stock_df: Frame containing the required columns (presumably the
                time-ordered rows of one stock - confirm against
                ``StockWiseOperator``).
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with the column ``lg_flow_mom_corr_{n}_{m}`` added.
        """
        # Net large + extra-large order volume, multiplied by the close price.
        net_lg_flow_val = (
            (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') -
             pl.col('sell_lg_vol') - pl.col('sell_elg_vol')) * pl.col('close')
        )
        # Polars rolling functions take ``window_size``; there is no ``window`` keyword.
        rolling_net_lg_flow = net_lg_flow_val.rolling_sum(window_size=self.n)
        # n-period price momentum.
        price_mom = pl.col('close').pct_change(self.n)
        # ``rolling_corr`` is a module-level Polars function, not an Expr method.
        correlation = pl.rolling_corr(rolling_net_lg_flow, price_mom, window_size=self.m)
        return stock_df.with_columns(
            correlation.alias(f'lg_flow_mom_corr_{self.n}_{self.m}')
        )
class LargeBuyConsolidationOperator(StockWiseOperator):
    """Rolling mean of the large-order net-flow ratio.

    The result is written to the column ``lg_buy_consolidation_{n}``.

    NOTE(review): ``vol_quantile`` is stored and published in the config
    parameters, but ``apply_stock`` does not use it and applies no
    price-volatility filter - confirm whether that is intended.
    """

    def __init__(self, n: int = 20, vol_quantile: float = 0.2):
        """Build the operator configuration.

        Args:
            n: Window of the rolling mean.
            vol_quantile: Recorded in the config parameters; currently unused
                by the calculation.
        """
        factor_name = f"lg_buy_consolidation_{n}"
        config = OperatorConfig(
            name=factor_name,
            description=f"{n}日大单买入盘整期",
            required_columns=['close', 'buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol',
                              'sell_elg_vol', 'vol'],
            output_columns=[factor_name],
            parameters={'n': n, 'vol_quantile': vol_quantile}
        )
        super().__init__(config)
        self.n = n
        self.vol_quantile = vol_quantile

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the rolling mean of the large-order net-flow ratio.

        Args:
            stock_df: Frame containing the required columns.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with the column ``lg_buy_consolidation_{n}`` added.
        """
        epsilon = 1e-8  # keeps the denominator non-zero when ``vol`` is 0
        # Net large + extra-large order volume as a share of total volume.
        net_lg_flow_ratio = (
            (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') -
             pl.col('sell_lg_vol') - pl.col('sell_elg_vol')) /
            (pl.col('vol') + epsilon)
        )
        # Polars rolling functions take ``window_size``; there is no ``window`` keyword.
        rolling_mean_ratio = net_lg_flow_ratio.rolling_mean(window_size=self.n)
        return stock_df.with_columns(
            rolling_mean_ratio.alias(f'lg_buy_consolidation_{self.n}')
        )
class IntradayLargeFlowCorrelationOperator(StockWiseOperator):
    """Placeholder for the intraday-trend / large-order-flow correlation factor.

    No correlation is computed: the output column
    ``intraday_lg_flow_corr_{n}`` is filled with nulls. The original author
    notes that a real implementation needs richer intraday data.
    """

    def __init__(self, n: int = 20):
        """Build the operator configuration.

        Args:
            n: Nominal window length; only used to name the output column.
        """
        factor_name = f"intraday_lg_flow_corr_{n}"
        inputs = ['high', 'low', 'close', 'buy_lg_vol', 'buy_elg_vol',
                  'sell_lg_vol', 'sell_elg_vol']
        super().__init__(OperatorConfig(
            name=factor_name,
            description=f"{n}日日内趋势与大单流相关性",
            required_columns=inputs,
            output_columns=[factor_name],
            parameters={'n': n}
        ))
        self.n = n

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append an all-null float column named ``intraday_lg_flow_corr_{n}``.

        Args:
            stock_df: Input frame.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with the placeholder column added.
        """
        null_factor = pl.lit(None).cast(float).alias(f'intraday_lg_flow_corr_{self.n}')
        return stock_df.with_columns(null_factor)
class ProfitPressureOperator(StockWiseOperator):
    """Profit-pressure index.

    Computed as ``winner_rate * 0.5 * (margin_85 + margin_95)`` where each
    margin is ``close / (cost + 1e-8) - 1`` for ``cost_85pct`` and
    ``cost_95pct``. The result is written to ``profit_pressure``.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="profit_pressure",
            description="获利压力指数",
            required_columns=['close', 'cost_85pct', 'cost_95pct', 'winner_rate'],
            output_columns=['profit_pressure'],
            parameters={}
        ))

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``profit_pressure`` column.

        Args:
            stock_df: Frame containing the required columns.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with ``profit_pressure`` added.
        """
        eps = 1e-8  # keeps the cost denominators non-zero
        close = pl.col('close')
        # Relative gain of the close over the two cost levels.
        margin_85 = (close / (pl.col('cost_85pct') + eps)) - 1
        margin_95 = (close / (pl.col('cost_95pct') + eps)) - 1
        # Average of the two margins, scaled by winner_rate.
        index_expr = pl.col('winner_rate') * 0.5 * (margin_85 + margin_95)
        return stock_df.with_columns(index_expr.alias('profit_pressure'))
class UnderwaterResistanceOperator(StockWiseOperator):
    """Resistance from holders sitting at a loss.

    Computed as ``(1 - winner_rate) * max(0, cost_15pct - close) / (close + 1e-8)``
    and written to ``underwater_resistance``.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="underwater_resistance",
            description="套牢盘阻力",
            required_columns=['close', 'winner_rate', 'cost_15pct'],
            output_columns=['underwater_resistance'],
            parameters={}
        ))

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``underwater_resistance`` column.

        Args:
            stock_df: Frame containing the required columns.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with ``underwater_resistance`` added.
        """
        eps = 1e-8  # keeps the close denominator non-zero
        close = pl.col('close')
        # Share of holders that is not in profit.
        losing_share = 1.0 - pl.col('winner_rate')
        # Relative gap up to cost_15pct, floored at zero when close is above it.
        gap_to_cost = pl.max_horizontal(0, pl.col('cost_15pct') - close) / (close + eps)
        return stock_df.with_columns(
            (losing_share * gap_to_cost).alias('underwater_resistance')
        )
class ProfitDecayOperator(StockWiseOperator):
    """Ratio of the n-period return to the n-period change in ``winner_rate``.

    The result is written to the column ``profit_decay_{n}``.
    """

    def __init__(self, n: int = 20):
        """Build the operator configuration.

        Args:
            n: Lookback used for both the return and the ``winner_rate`` difference.
        """
        factor_name = f"profit_decay_{n}"
        super().__init__(OperatorConfig(
            name=factor_name,
            description=f"{n}日盈利预期衰减",
            required_columns=['close', 'winner_rate'],
            output_columns=[factor_name],
            parameters={'n': n}
        ))
        self.n = n

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``profit_decay_{n}`` column.

        Args:
            stock_df: Frame containing ``close`` and ``winner_rate``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with ``profit_decay_{n}`` added.
        """
        lookback = self.n
        period_return = pl.col('close').pct_change(lookback)
        winner_delta = pl.col('winner_rate').diff(lookback)
        # The 1e-8 offset avoids dividing by an exactly-zero change.
        ratio = period_return / (winner_delta + 1e-8)
        return stock_df.with_columns(ratio.alias(f'profit_decay_{lookback}'))
class PullbackStrongOperator(StockWiseOperator):
    """Pullback depth from the rolling high, relative to the recent gain.

    The result is written to the column ``pullback_strong_{n}_{m}``.

    NOTE(review): ``gain_thresh`` is stored and published in the config
    parameters, but ``apply_stock`` does not use it - confirm whether a
    "strong stock" filter was intended.
    """

    def __init__(self, n: int = 20, m: int = 20, gain_thresh: float = 0.2):
        """Build the operator configuration.

        Args:
            n: Window of the rolling maximum of ``high``.
            m: Lookback (in rows) of the recent gain of ``close``.
            gain_thresh: Recorded in the config parameters; currently unused
                by the calculation.
        """
        factor_name = f"pullback_strong_{n}_{m}"
        config = OperatorConfig(
            name=factor_name,
            description=f"{n}{m}期强势股回调深度",
            required_columns=['high', 'close'],
            output_columns=[factor_name],
            parameters={'n': n, 'm': m, 'gain_thresh': gain_thresh}
        )
        super().__init__(config)
        self.n = n
        self.m = m
        self.gain_thresh = gain_thresh

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``pullback_strong_{n}_{m}`` column.

        Args:
            stock_df: Frame containing ``high`` and ``close``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with ``pullback_strong_{n}_{m}`` added.
        """
        # Polars rolling functions take ``window_size``; there is no ``window`` keyword.
        high_n = pl.col('high').rolling_max(window_size=self.n)
        # Relative distance of the close below the rolling high.
        pullback_depth = (high_n - pl.col('close')) / high_n
        # Gain of the close over the last m rows.
        recent_gain = (pl.col('close') / pl.col('close').shift(self.m)) - 1
        # The 1e-8 offset avoids dividing by an exactly-zero gain.
        pullback_factor = pullback_depth / (recent_gain + 1e-8)
        return stock_df.with_columns(
            pullback_factor.alias(f'pullback_strong_{self.n}_{self.m}')
        )
class HurstExponentFlowOperator(StockWiseOperator):
    """Placeholder for a Hurst exponent of a money-flow column.

    No Hurst exponent is computed: the output column ``hurst_{flow_col}_{n}``
    is filled with nulls. The original author notes that a dedicated Hurst
    implementation is still required.
    """

    def __init__(self, n: int = 60, flow_col: str = 'net_mf_vol'):
        """Build the operator configuration.

        Args:
            n: Nominal window length; only used to name the output column.
            flow_col: Name of the flow column declared as required input.
        """
        factor_name = f"hurst_{flow_col}_{n}"
        super().__init__(OperatorConfig(
            name=factor_name,
            description=f"{n}{flow_col}Hurst指数",
            required_columns=[flow_col],
            output_columns=[factor_name],
            parameters={'n': n, 'flow_col': flow_col}
        ))
        self.n = n
        self.flow_col = flow_col

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append an all-null float column named ``hurst_{flow_col}_{n}``.

        Args:
            stock_df: Input frame.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with the placeholder column added.
        """
        null_factor = pl.lit(None).cast(float).alias(f'hurst_{self.flow_col}_{self.n}')
        return stock_df.with_columns(null_factor)
class VolWeightedHistoricalPositionOperator(StockWiseOperator):
    """Historical price position weighted by relative volume.

    The position of ``close`` between ``his_low`` and ``his_high`` (clipped to
    [0, 1]) is multiplied by ``vol`` divided by its n-period rolling mean. The
    result is written to the column ``vol_wgt_hist_pos_{n}``.
    """

    def __init__(self, n: int = 20):
        """Build the operator configuration.

        Args:
            n: Window of the rolling mean of ``vol``.
        """
        factor_name = f"vol_wgt_hist_pos_{n}"
        config = OperatorConfig(
            name=factor_name,
            description=f"{n}日成交量加权历史位置",
            required_columns=['close', 'his_high', 'his_low', 'vol'],
            output_columns=[factor_name],
            parameters={'n': n}
        )
        super().__init__(config)
        self.n = n

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``vol_wgt_hist_pos_{n}`` column.

        Args:
            stock_df: Frame containing the required columns.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with ``vol_wgt_hist_pos_{n}`` added.
        """
        # Position of the close inside the historical range, clipped to [0, 1].
        # NOTE(review): no guard for his_high == his_low (zero denominator).
        hist_pos = (pl.col('close') - pl.col('his_low')) / (pl.col('his_high') - pl.col('his_low'))
        hist_pos = hist_pos.clip(0, 1)
        # Polars rolling functions take ``window_size``; there is no ``window`` keyword.
        rolling_mean_vol = pl.col('vol').rolling_mean(window_size=self.n)
        # Current volume relative to its rolling average.
        vol_rel_strength = pl.col('vol') / rolling_mean_vol
        weighted_pos = hist_pos * vol_rel_strength
        return stock_df.with_columns(weighted_pos.alias(f'vol_wgt_hist_pos_{self.n}'))
# 横截面因子
class CrossSectionalRankOperator(DateWiseOperator):
    """Normalised dense rank of an arbitrary column.

    The dense rank of ``column`` is divided by the largest dense rank in the
    frame and written to ``cs_rank_{column}``.
    """

    def __init__(self, column: str, ascending: bool = True):
        """Build the operator configuration.

        Args:
            column: Name of the column to rank.
            ascending: When True the smallest value gets rank 1; when False
                the largest value does.
        """
        super().__init__(OperatorConfig(
            name=f"cs_rank_{column}",
            description=f"{column}横截面排名",
            required_columns=[column],
            output_columns=[f'cs_rank_{column}'],
            parameters={'column': column, 'ascending': ascending}
        ))
        self.column = column
        self.ascending = ascending

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``cs_rank_{column}`` column.

        Args:
            date_df: Frame to rank within (presumably one date's
                cross-section - confirm against ``DateWiseOperator``).
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``date_df`` with ``cs_rank_{column}`` added.
        """
        dense_rank = pl.col(self.column).rank(
            method='dense', descending=not self.ascending
        )
        # Scale by the largest rank so the maximum output value is 1.
        scaled = dense_rank / dense_rank.max()
        return date_df.with_columns(scaled.alias(f'cs_rank_{self.column}'))
class CrossSectionalNetLargeFlowRankOperator(DateWiseOperator):
    """Normalised descending dense rank of the net large-order flow value.

    The flow value is the net large + extra-large order volume multiplied by
    ``close``. The result is written to ``cs_rank_net_lg_flow_val``.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        inputs = ['buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol', 'close']
        super().__init__(OperatorConfig(
            name="cs_rank_net_lg_flow_val",
            description="横截面大单净额排名",
            required_columns=inputs,
            output_columns=['cs_rank_net_lg_flow_val'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``cs_rank_net_lg_flow_val`` column.

        Args:
            date_df: Frame containing the required columns.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``date_df`` with ``cs_rank_net_lg_flow_val`` added.
        """
        net_volume = (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') -
                      pl.col('sell_lg_vol') - pl.col('sell_elg_vol'))
        flow_value = net_volume * pl.col('close')
        # descending=True: the largest flow value receives rank 1.
        dense_rank = flow_value.rank(method='dense', descending=True)
        scaled = dense_rank / dense_rank.max()
        return date_df.with_columns(scaled.alias('cs_rank_net_lg_flow_val'))
class CrossSectionalFlowDivergenceRankOperator(DateWiseOperator):
    """Normalised descending dense rank of large-vs-small order flow divergence.

    Divergence is the large-order net-flow ratio minus the small-order
    net-flow ratio, both taken relative to ``vol``. The result is written to
    ``cs_rank_flow_divergence``.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        inputs = ['buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'buy_elg_vol',
                  'sell_lg_vol', 'sell_elg_vol', 'vol']
        super().__init__(OperatorConfig(
            name="cs_rank_flow_divergence",
            description="横截面流向背离度排名",
            required_columns=inputs,
            output_columns=['cs_rank_flow_divergence'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``cs_rank_flow_divergence`` column.

        Args:
            date_df: Frame containing the required columns.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``date_df`` with ``cs_rank_flow_divergence`` added.
        """
        eps = 1e-8  # keeps the volume denominators non-zero
        large_net = (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') -
                     pl.col('sell_lg_vol') - pl.col('sell_elg_vol'))
        small_net = pl.col('buy_sm_vol') - pl.col('sell_sm_vol')
        large_ratio = large_net / (pl.col('vol') + eps)
        small_ratio = small_net / (pl.col('vol') + eps)
        gap = large_ratio - small_ratio
        # descending=True: the largest divergence receives rank 1.
        dense_rank = gap.rank(method='dense', descending=True)
        scaled = dense_rank / dense_rank.max()
        return date_df.with_columns(scaled.alias('cs_rank_flow_divergence'))
class CrossSectionalRelativeProfitMarginRankOperator(DateWiseOperator):
    """Normalised descending dense rank of the relative profit margin.

    The margin is ``(close - weight_avg) / close``. The result is written to
    ``cs_rank_rel_profit_margin``.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_rel_profit_margin",
            description="横截面相对盈利幅度排名",
            required_columns=['close', 'weight_avg'],
            output_columns=['cs_rank_rel_profit_margin'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``cs_rank_rel_profit_margin`` column.

        Args:
            date_df: Frame containing ``close`` and ``weight_avg``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``date_df`` with ``cs_rank_rel_profit_margin`` added.
        """
        close = pl.col('close')
        margin = (close - pl.col('weight_avg')) / close
        # descending=True: the largest margin receives rank 1.
        dense_rank = margin.rank(method='dense', descending=True)
        scaled = dense_rank / dense_rank.max()
        return date_df.with_columns(scaled.alias('cs_rank_rel_profit_margin'))
class CrossSectionalCostBreadthRankOperator(DateWiseOperator):
    """Normalised descending dense rank of the cost-distribution breadth.

    Breadth is ``(cost_85pct - cost_15pct) / (weight_avg + 1e-8)``. The result
    is written to ``cs_rank_cost_breadth``.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_cost_breadth",
            description="横截面成本分布宽度排名",
            required_columns=['cost_85pct', 'cost_15pct', 'weight_avg'],
            output_columns=['cs_rank_cost_breadth'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``cs_rank_cost_breadth`` column.

        Args:
            date_df: Frame containing the required columns.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``date_df`` with ``cs_rank_cost_breadth`` added.
        """
        eps = 1e-8  # keeps the weight_avg denominator non-zero
        spread = pl.col('cost_85pct') - pl.col('cost_15pct')
        breadth = spread / (pl.col('weight_avg') + eps)
        # descending=True: the widest distribution receives rank 1.
        dense_rank = breadth.rank(method='dense', descending=True)
        scaled = dense_rank / dense_rank.max()
        return date_df.with_columns(scaled.alias('cs_rank_cost_breadth'))
class CrossSectionalWinnerRateRankOperator(DateWiseOperator):
    """Normalised descending dense rank of ``winner_rate``.

    The result is written to ``cs_rank_winner_rate``.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_winner_rate",
            description="横截面获利盘比例排名",
            required_columns=['winner_rate'],
            output_columns=['cs_rank_winner_rate'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``cs_rank_winner_rate`` column.

        Args:
            date_df: Frame containing ``winner_rate``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``date_df`` with ``cs_rank_winner_rate`` added.
        """
        # descending=True: the largest winner_rate receives rank 1.
        dense_rank = pl.col('winner_rate').rank(method='dense', descending=True)
        scaled = dense_rank / dense_rank.max()
        return date_df.with_columns(scaled.alias('cs_rank_winner_rate'))
class CrossSectionalVolumeRatioRankOperator(DateWiseOperator):
    """Normalised descending dense rank of ``volume_ratio``.

    The result is written to ``cs_rank_volume_ratio``.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        super().__init__(OperatorConfig(
            name="cs_rank_volume_ratio",
            description="横截面量比排名",
            required_columns=['volume_ratio'],
            output_columns=['cs_rank_volume_ratio'],
            parameters={}
        ))

    def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``cs_rank_volume_ratio`` column.

        Args:
            date_df: Frame containing ``volume_ratio``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``date_df`` with ``cs_rank_volume_ratio`` added.
        """
        # descending=True: the largest volume_ratio receives rank 1.
        dense_rank = pl.col('volume_ratio').rank(method='dense', descending=True)
        scaled = dense_rank / dense_rank.max()
        return date_df.with_columns(scaled.alias('cs_rank_volume_ratio'))
# 复杂组合因子
class ComplexFactorDEAPOperator(StockWiseOperator):
    """Composite factor built from a fixed arithmetic tree of other factors.

    The formula is ``safe_div(A, B) - C`` with the sub-terms described in
    ``apply_stock``; the result is written to ``complex_factor_deap_1``.
    Several inputs (for example ``pullback_strong_20_20`` and
    ``lg_buy_consolidation_20``) are outputs of other operators and must
    already be present in the frame.
    """

    def __init__(self):
        """Build the (parameter-free) operator configuration."""
        inputs = ['pullback_strong_20_20', 'log_close', 'industry_return_5',
                  'vol_adj_roc_20', 'vol_drop_profit_cnt_5', 'nonlinear_mv_volume',
                  'alpha_007', 'lg_buy_consolidation_20', 'net_mf_vol', 'std_return_5',
                  'arbr', 'industry_act_factor5', 'industry_act_factor1', 'low_cost_dev',
                  'mv_weighted_turnover', 'act_factor4', 'vol', 'lg_elg_buy_prop',
                  'intraday_lg_flow_corr_20']
        super().__init__(OperatorConfig(
            name="complex_factor_deap_1",
            description="DEAP复杂组合因子",
            required_columns=inputs,
            output_columns=['complex_factor_deap_1'],
            parameters={}
        ))

    def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame:
        """Append the ``complex_factor_deap_1`` column.

        If building or evaluating the expression raises, the error is printed
        and the column is filled with nulls instead.

        Args:
            stock_df: Frame containing every required column.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``stock_df`` with ``complex_factor_deap_1`` added.
        """
        def guarded_ratio(num, den, fallback=0):
            # Divide only where |den| > 1e-8; otherwise use the fallback value.
            return pl.when(den.abs() > 1e-8).then(num / den).otherwise(fallback)

        try:
            pullback = pl.col('pullback_strong_20_20')
            consolidation = pl.col('lg_buy_consolidation_20')
            buy_prop = pl.col('lg_elg_buy_prop')

            # Component D: pullback * (log_close / industry_return_5)
            #              - (vol_adj_roc_20 + vol_drop_profit_cnt_5)
            #                / (nonlinear_mv_volume - alpha_007)
            d_left = pullback * guarded_ratio(pl.col('log_close'), pl.col('industry_return_5'))
            d_den = pl.col('nonlinear_mv_volume') - pl.col('alpha_007')
            d_num = pl.col('vol_adj_roc_20') + pl.col('vol_drop_profit_cnt_5')
            comp_d = d_left - guarded_ratio(d_num, d_den)

            # Component A: D * consolidation + consolidation + pullback
            comp_a = comp_d * consolidation + consolidation + pullback

            # Component F: (net_mf_vol + std_return_5) * (arbr - industry_act_factor5)
            f_sum = pl.col('net_mf_vol') + pl.col('std_return_5')
            f_diff = pl.col('arbr') - pl.col('industry_act_factor5')
            comp_f = f_sum * f_diff

            # Component H: industry_act_factor1 + low_cost_dev
            #              + mv_weighted_turnover * act_factor4
            h_sum = pl.col('industry_act_factor1') + pl.col('low_cost_dev')
            h_prod = pl.col('mv_weighted_turnover') * pl.col('act_factor4')
            comp_h = h_sum + h_prod

            # Component B: (F + vol + H) / lg_elg_buy_prop
            comp_b = guarded_ratio(comp_f + pl.col('vol') + comp_h, buy_prop)

            # Component C: intraday correlation (nulls -> 0) divided twice by lg_elg_buy_prop
            c_inner = guarded_ratio(
                pl.col('intraday_lg_flow_corr_20').fill_null(0),
                buy_prop
            )
            comp_c = guarded_ratio(c_inner, buy_prop)

            # Final factor: A / B - C
            factor_expr = guarded_ratio(comp_a, comp_b) - comp_c
            return stock_df.with_columns(factor_expr.alias('complex_factor_deap_1'))
        except Exception as e:
            # Best-effort fallback: report the failure and emit an all-null column.
            print(f"Error calculating complex_factor_deap_1: {e}")
            return stock_df.with_columns(pl.lit(None).cast(float).alias('complex_factor_deap_1'))
# 因子集合
# Default operator pipeline, instantiated once at import time with each
# operator's default parameters.
# Order matters because apply_complex_factors runs the list sequentially:
# ComplexFactorDEAPOperator comes last since its required columns include
# pullback_strong_20_20, lg_buy_consolidation_20 and intraday_lg_flow_corr_20,
# which the default-parameter operators earlier in this list produce. Its
# remaining required columns are not produced here and must already exist in
# the input frame.
COMPLEX_OPERATORS = [
    LargeFlowMomentumCorrelationOperator(),
    LargeBuyConsolidationOperator(),
    IntradayLargeFlowCorrelationOperator(),
    ProfitPressureOperator(),
    UnderwaterResistanceOperator(),
    ProfitDecayOperator(),
    PullbackStrongOperator(),
    HurstExponentFlowOperator(),
    VolWeightedHistoricalPositionOperator(),
    CrossSectionalRankOperator('close'),
    CrossSectionalNetLargeFlowRankOperator(),
    CrossSectionalFlowDivergenceRankOperator(),
    CrossSectionalRelativeProfitMarginRankOperator(),
    CrossSectionalCostBreadthRankOperator(),
    CrossSectionalWinnerRateRankOperator(),
    CrossSectionalVolumeRatioRankOperator(),
    ComplexFactorDEAPOperator(),
]
def apply_complex_factors(df: pl.DataFrame, operators: List = None) -> pl.DataFrame:
    """Run a sequence of factor operators over ``df``.

    Each operator is invoked as ``operator(frame)`` and its return value is
    passed to the next one, so later operators can consume columns produced
    by earlier ones.

    Args:
        df: Input Polars DataFrame.
        operators: Operators to apply, in order. ``None`` (the default)
            selects the module-level ``COMPLEX_OPERATORS`` list.

    Returns:
        The frame returned by the last operator, or ``df`` itself when the
        sequence is empty.
    """
    pipeline = COMPLEX_OPERATORS if operators is None else operators
    frame = df
    for step in pipeline:
        frame = step(frame)
    return frame
# 主应用函数
def apply_all_factors(df: pl.DataFrame,
                      factor_categories: Optional[List[str]] = None) -> pl.DataFrame:
    """Apply the factor pipelines of the selected categories to ``df``.

    Categories always run in the fixed order money_flow, chip, volatility,
    volume, technical, sentiment, momentum, complex, regardless of their order
    in ``factor_categories``. Unrecognised category names are ignored.

    Each category's module is imported only when that category is requested,
    so a missing or broken module for an unselected category does not affect
    the call.

    Args:
        df: Input Polars DataFrame.
        factor_categories: Category names to apply. ``None`` (the default)
            applies all eight categories.

    Returns:
        The frame after all selected category pipelines have run; ``df``
        itself when no recognised category is selected.

    Raises:
        ImportError: If the module of a requested category cannot be imported.
    """
    if factor_categories is None:
        factor_categories = ['money_flow', 'chip', 'volatility', 'volume',
                             'technical', 'sentiment', 'momentum', 'complex']
    result_df = df
    # Imports stay function-local (as in the original) and are now deferred
    # until the corresponding category is actually requested.
    if 'money_flow' in factor_categories:
        from polars_money_flow_factors import apply_money_flow_factors
        result_df = apply_money_flow_factors(result_df)
    if 'chip' in factor_categories:
        from polars_chip_factors import apply_chip_distribution_factors
        result_df = apply_chip_distribution_factors(result_df)
    if 'volatility' in factor_categories:
        from polars_volatility_factors import apply_volatility_factors
        result_df = apply_volatility_factors(result_df)
    if 'volume' in factor_categories:
        from polars_volume_factors import apply_volume_factors
        result_df = apply_volume_factors(result_df)
    if 'technical' in factor_categories:
        from polars_technical_factors import apply_technical_factors
        result_df = apply_technical_factors(result_df)
    if 'sentiment' in factor_categories:
        from polars_sentiment_factors import apply_sentiment_factors
        result_df = apply_sentiment_factors(result_df)
    if 'momentum' in factor_categories:
        from polars_momentum_factors import apply_momentum_factors
        result_df = apply_momentum_factors(result_df)
    if 'complex' in factor_categories:
        # Defined in this module; no import needed.
        result_df = apply_complex_factors(result_df)
    return result_df