""" 复杂组合因子 - 使用Polars实现 包含复杂的组合因子和高级因子计算 """ import polars as pl import numpy as np from typing import Dict, List, Optional, Any from operator_framework import StockWiseOperator, DateWiseOperator, OperatorConfig # 时间序列因子 class LargeFlowMomentumCorrelationOperator(StockWiseOperator): """大单资金流与价格动量相关性算子""" def __init__(self, n: int = 20, m: int = 60): config = OperatorConfig( name=f"lg_flow_mom_corr_{n}_{m}", description=f"{n}日大单资金流与{m}日价格动量相关性", required_columns=['buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol', 'close', 'vol'], output_columns=[f'lg_flow_mom_corr_{n}_{m}'], parameters={'n': n, 'm': m} ) super().__init__(config) self.n = n self.m = m def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算大单资金流与价格动量相关性""" # 计算大单净额 net_lg_flow_val = ( (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') - pl.col('sell_lg_vol') - pl.col('sell_elg_vol')) * pl.col('close') ) # 计算滚动净大单流 rolling_net_lg_flow = net_lg_flow_val.rolling_sum(window=self.n) # 计算价格动量 price_mom = pl.col('close').pct_change(self.n) # 计算相关性 correlation = rolling_net_lg_flow.rolling_corr(price_mom, window=self.m) return stock_df.with_columns( correlation.alias(f'lg_flow_mom_corr_{self.n}_{self.m}') ) class LargeBuyConsolidationOperator(StockWiseOperator): """大单买入盘整期算子""" def __init__(self, n: int = 20, vol_quantile: float = 0.2): config = OperatorConfig( name=f"lg_buy_consolidation_{n}", description=f"{n}日大单买入盘整期", required_columns=['close', 'buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol', 'vol'], output_columns=[f'lg_buy_consolidation_{n}'], parameters={'n': n, 'vol_quantile': vol_quantile} ) super().__init__(config) self.n = n self.vol_quantile = vol_quantile def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算大单买入盘整期""" epsilon = 1e-8 # 计算收盘价滚动标准差 rolling_std = pl.col('close').rolling_std(window=self.n) # 计算大单净流比率 net_lg_flow_ratio = ( (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') - pl.col('sell_lg_vol') - pl.col('sell_elg_vol')) / (pl.col('vol') + epsilon) ) # 计算滚动均值 rolling_mean_ratio = net_lg_flow_ratio.rolling_mean(window=self.n) return stock_df.with_columns( rolling_mean_ratio.alias(f'lg_buy_consolidation_{self.n}') ) class IntradayLargeFlowCorrelationOperator(StockWiseOperator): """日内趋势与大单流相关性算子""" def __init__(self, n: int = 20): config = OperatorConfig( name=f"intraday_lg_flow_corr_{n}", description=f"{n}日日内趋势与大单流相关性", required_columns=['high', 'low', 'close', 'buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol'], output_columns=[f'intraday_lg_flow_corr_{n}'], parameters={'n': n} ) super().__init__(config) self.n = n def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算日内趋势与大单流相关性""" # 这是一个复杂的因子,简化处理 # 实际实现需要更复杂的日内数据 placeholder = pl.lit(None).cast(float) return stock_df.with_columns( placeholder.alias(f'intraday_lg_flow_corr_{self.n}') ) class ProfitPressureOperator(StockWiseOperator): """获利压力指数算子""" def __init__(self): config = OperatorConfig( name="profit_pressure", description="获利压力指数", required_columns=['close', 'cost_85pct', 'cost_95pct', 'winner_rate'], output_columns=['profit_pressure'], parameters={} ) super().__init__(config) def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算获利压力指数""" epsilon = 1e-8 # 计算盈利幅度 profit_margin_85 = (pl.col('close') / (pl.col('cost_85pct') + epsilon)) - 1 profit_margin_95 = (pl.col('close') / (pl.col('cost_95pct') + epsilon)) - 1 # 计算压力指数 pressure = pl.col('winner_rate') * 0.5 * (profit_margin_85 + profit_margin_95) return stock_df.with_columns(pressure.alias('profit_pressure')) class UnderwaterResistanceOperator(StockWiseOperator): """套牢盘阻力算子""" def __init__(self): config = OperatorConfig( name="underwater_resistance", description="套牢盘阻力", required_columns=['close', 'winner_rate', 'cost_15pct'], output_columns=['underwater_resistance'], parameters={} ) super().__init__(config) def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算套牢盘阻力""" epsilon = 1e-8 # 计算套牢比例 underwater_ratio = 1.0 - pl.col('winner_rate') # 计算与成本的距离 dist_to_cost_15 = pl.max_horizontal(0, pl.col('cost_15pct') - pl.col('close')) / (pl.col('close') + epsilon) # 计算阻力 resistance = underwater_ratio * dist_to_cost_15 return stock_df.with_columns(resistance.alias('underwater_resistance')) class ProfitDecayOperator(StockWiseOperator): """盈利预期衰减算子""" def __init__(self, n: int = 20): config = OperatorConfig( name=f"profit_decay_{n}", description=f"{n}日盈利预期衰减", required_columns=['close', 'winner_rate'], output_columns=[f'profit_decay_{n}'], parameters={'n': n} ) super().__init__(config) self.n = n def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算盈利预期衰减""" # 计算n日收益率 ret_n = pl.col('close').pct_change(self.n) # 计算winner_rate变化 winner_rate_change = pl.col('winner_rate').diff(self.n) # 计算衰减因子 decay = ret_n / (winner_rate_change + 1e-8) return stock_df.with_columns(decay.alias(f'profit_decay_{self.n}')) class PullbackStrongOperator(StockWiseOperator): """强势股回调深度算子""" def __init__(self, n: int = 20, m: int = 20, gain_thresh: float = 0.2): config = OperatorConfig( name=f"pullback_strong_{n}_{m}", description=f"{n}日{m}期强势股回调深度", required_columns=['high', 'close'], output_columns=[f'pullback_strong_{n}_{m}'], parameters={'n': n, 'm': m, 'gain_thresh': gain_thresh} ) super().__init__(config) self.n = n self.m = m self.gain_thresh = gain_thresh def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算强势股回调深度""" # 计算n日最高价 high_n = pl.col('high').rolling_max(window=self.n) # 计算回调深度 pullback_depth = (high_n - pl.col('close')) / high_n # 计算近期涨幅 recent_gain = (pl.col('close') / pl.col('close').shift(self.m)) - 1 # 计算回调因子 pullback_factor = pullback_depth / (recent_gain + 1e-8) return stock_df.with_columns(pullback_factor.alias(f'pullback_strong_{self.n}_{self.m}')) class HurstExponentFlowOperator(StockWiseOperator): """资金流Hurst指数算子""" def __init__(self, n: int = 60, flow_col: str = 'net_mf_vol'): config = OperatorConfig( name=f"hurst_{flow_col}_{n}", description=f"{n}日{flow_col}Hurst指数", required_columns=[flow_col], output_columns=[f'hurst_{flow_col}_{n}'], parameters={'n': n, 'flow_col': flow_col} ) super().__init__(config) self.n = n self.flow_col = flow_col def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算Hurst指数""" # Hurst指数计算复杂,这里使用占位符 # 实际实现需要专门的Hurst指数计算库 placeholder = pl.lit(None).cast(float) return stock_df.with_columns( placeholder.alias(f'hurst_{self.flow_col}_{self.n}') ) class VolWeightedHistoricalPositionOperator(StockWiseOperator): """成交量加权历史位置算子""" def __init__(self, n: int = 20): config = OperatorConfig( name=f"vol_wgt_hist_pos_{n}", description=f"{n}日成交量加权历史位置", required_columns=['close', 'his_high', 'his_low', 'vol'], output_columns=[f'vol_wgt_hist_pos_{n}'], parameters={'n': n} ) super().__init__(config) self.n = n def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算成交量加权历史位置""" # 计算历史位置 hist_pos = (pl.col('close') - pl.col('his_low')) / (pl.col('his_high') - pl.col('his_low')) hist_pos = hist_pos.clip(0, 1) # 计算成交量相对强度 rolling_mean_vol = pl.col('vol').rolling_mean(window=self.n) vol_rel_strength = pl.col('vol') / rolling_mean_vol # 计算加权位置 weighted_pos = hist_pos * vol_rel_strength return stock_df.with_columns(weighted_pos.alias(f'vol_wgt_hist_pos_{self.n}')) # 横截面因子 class CrossSectionalRankOperator(DateWiseOperator): """横截面排名算子""" def __init__(self, column: str, ascending: bool = True): config = OperatorConfig( name=f"cs_rank_{column}", description=f"{column}横截面排名", required_columns=[column], output_columns=[f'cs_rank_{column}'], parameters={'column': column, 'ascending': ascending} ) super().__init__(config) self.column = column self.ascending = ascending def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算横截面排名""" # 计算排名 rank_col = pl.col(self.column).rank(method='dense', descending=not self.ascending) # 转换为百分比排名 pct_rank = rank_col / rank_col.max() return date_df.with_columns(pct_rank.alias(f'cs_rank_{self.column}')) class CrossSectionalNetLargeFlowRankOperator(DateWiseOperator): """横截面大单净额排名算子""" def __init__(self): config = OperatorConfig( name="cs_rank_net_lg_flow_val", description="横截面大单净额排名", required_columns=['buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol', 'close'], output_columns=['cs_rank_net_lg_flow_val'], parameters={} ) super().__init__(config) def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算横截面大单净额排名""" # 计算大单净额 net_lg_flow_val = ( (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') - pl.col('sell_lg_vol') - pl.col('sell_elg_vol')) * pl.col('close') ) # 计算排名 rank_col = net_lg_flow_val.rank(method='dense', descending=True) pct_rank = rank_col / rank_col.max() return date_df.with_columns(pct_rank.alias('cs_rank_net_lg_flow_val')) class CrossSectionalFlowDivergenceRankOperator(DateWiseOperator): """横截面流向背离度排名算子""" def __init__(self): config = OperatorConfig( name="cs_rank_flow_divergence", description="横截面流向背离度排名", required_columns=['buy_sm_vol', 'sell_sm_vol', 'buy_lg_vol', 'buy_elg_vol', 'sell_lg_vol', 'sell_elg_vol', 'vol'], output_columns=['cs_rank_flow_divergence'], parameters={} ) super().__init__(config) def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算横截面流向背离度排名""" epsilon = 1e-8 # 计算大单比率 lg_ratio = ( (pl.col('buy_lg_vol') + pl.col('buy_elg_vol') - pl.col('sell_lg_vol') - pl.col('sell_elg_vol')) / (pl.col('vol') + epsilon) ) # 计算小单比率 sm_ratio = (pl.col('buy_sm_vol') - pl.col('sell_sm_vol')) / (pl.col('vol') + epsilon) # 计算背离度 divergence = lg_ratio - sm_ratio # 计算排名 rank_col = divergence.rank(method='dense', descending=True) pct_rank = rank_col / rank_col.max() return date_df.with_columns(pct_rank.alias('cs_rank_flow_divergence')) class CrossSectionalRelativeProfitMarginRankOperator(DateWiseOperator): """横截面相对盈利幅度排名算子""" def __init__(self): config = OperatorConfig( name="cs_rank_rel_profit_margin", description="横截面相对盈利幅度排名", required_columns=['close', 'weight_avg'], output_columns=['cs_rank_rel_profit_margin'], parameters={} ) super().__init__(config) def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算横截面相对盈利幅度排名""" # 计算盈利幅度 profit_margin = (pl.col('close') - pl.col('weight_avg')) / pl.col('close') # 计算排名 rank_col = profit_margin.rank(method='dense', descending=True) pct_rank = rank_col / rank_col.max() return date_df.with_columns(pct_rank.alias('cs_rank_rel_profit_margin')) class CrossSectionalCostBreadthRankOperator(DateWiseOperator): """横截面成本分布宽度排名算子""" def __init__(self): config = OperatorConfig( name="cs_rank_cost_breadth", description="横截面成本分布宽度排名", required_columns=['cost_85pct', 'cost_15pct', 'weight_avg'], output_columns=['cs_rank_cost_breadth'], parameters={} ) super().__init__(config) def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算横截面成本分布宽度排名""" epsilon = 1e-8 # 计算成本宽度 cost_breadth = (pl.col('cost_85pct') - pl.col('cost_15pct')) / (pl.col('weight_avg') + epsilon) # 计算排名 rank_col = cost_breadth.rank(method='dense', descending=True) pct_rank = rank_col / rank_col.max() return date_df.with_columns(pct_rank.alias('cs_rank_cost_breadth')) class CrossSectionalWinnerRateRankOperator(DateWiseOperator): """横截面获利盘比例排名算子""" def __init__(self): config = OperatorConfig( name="cs_rank_winner_rate", description="横截面获利盘比例排名", required_columns=['winner_rate'], output_columns=['cs_rank_winner_rate'], parameters={} ) super().__init__(config) def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算横截面获利盘比例排名""" # 计算排名 rank_col = pl.col('winner_rate').rank(method='dense', descending=True) pct_rank = rank_col / rank_col.max() return date_df.with_columns(pct_rank.alias('cs_rank_winner_rate')) class CrossSectionalVolumeRatioRankOperator(DateWiseOperator): """横截面量比排名算子""" def __init__(self): config = OperatorConfig( name="cs_rank_volume_ratio", description="横截面量比排名", required_columns=['volume_ratio'], output_columns=['cs_rank_volume_ratio'], parameters={} ) super().__init__(config) def apply_date(self, date_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算横截面量比排名""" # 计算排名 rank_col = pl.col('volume_ratio').rank(method='dense', descending=True) pct_rank = rank_col / rank_col.max() return date_df.with_columns(pct_rank.alias('cs_rank_volume_ratio')) # 复杂组合因子 class ComplexFactorDEAPOperator(StockWiseOperator): """DEAP复杂因子算子""" def __init__(self): config = OperatorConfig( name="complex_factor_deap_1", description="DEAP复杂组合因子", required_columns=['pullback_strong_20_20', 'log_close', 'industry_return_5', 'vol_adj_roc_20', 'vol_drop_profit_cnt_5', 'nonlinear_mv_volume', 'alpha_007', 'lg_buy_consolidation_20', 'net_mf_vol', 'std_return_5', 'arbr', 'industry_act_factor5', 'industry_act_factor1', 'low_cost_dev', 'mv_weighted_turnover', 'act_factor4', 'vol', 'lg_elg_buy_prop', 'intraday_lg_flow_corr_20'], output_columns=['complex_factor_deap_1'], parameters={} ) super().__init__(config) def apply_stock(self, stock_df: pl.DataFrame, **kwargs) -> pl.DataFrame: """计算DEAP复杂因子""" try: # 安全除法函数 def safe_divide(a, b, default_val=0): return pl.when(b.abs() > 1e-8).then(a / b).otherwise(default_val) # 计算组件D d_term1_div = safe_divide(pl.col('log_close'), pl.col('industry_return_5')) d_term1 = pl.col('pullback_strong_20_20') * d_term1_div d_term2_sub = pl.col('nonlinear_mv_volume') - pl.col('alpha_007') d_term2_add = pl.col('vol_adj_roc_20') + pl.col('vol_drop_profit_cnt_5') d_term2 = safe_divide(d_term2_add, d_term2_sub) temp_d = d_term1 - d_term2 # 计算组件A a_term1 = temp_d * pl.col('lg_buy_consolidation_20') a_term2 = a_term1 + pl.col('lg_buy_consolidation_20') temp_a = a_term2 + pl.col('pullback_strong_20_20') # 计算组件F f_term1 = pl.col('net_mf_vol') + pl.col('std_return_5') f_term2 = pl.col('arbr') - pl.col('industry_act_factor5') temp_f = f_term1 * f_term2 # 计算组件H h_term1 = pl.col('industry_act_factor1') + pl.col('low_cost_dev') h_term2 = pl.col('mv_weighted_turnover') * pl.col('act_factor4') temp_h = h_term1 + h_term2 # 计算组件B b_term1 = temp_f + pl.col('vol') b_term2 = b_term1 + temp_h temp_b = safe_divide(b_term2, pl.col('lg_elg_buy_prop')) # 计算组件C c_term1 = safe_divide( pl.col('intraday_lg_flow_corr_20').fill_null(0), pl.col('lg_elg_buy_prop') ) temp_c = safe_divide(c_term1, pl.col('lg_elg_buy_prop')) # 计算最终因子 final_term1 = safe_divide(temp_a, temp_b) complex_factor = final_term1 - temp_c return stock_df.with_columns(complex_factor.alias('complex_factor_deap_1')) except Exception as e: # 如果计算失败,填充NaN print(f"Error calculating complex_factor_deap_1: {e}") return stock_df.with_columns(pl.lit(None).cast(float).alias('complex_factor_deap_1')) # 因子集合 COMPLEX_OPERATORS = [ LargeFlowMomentumCorrelationOperator(), LargeBuyConsolidationOperator(), IntradayLargeFlowCorrelationOperator(), ProfitPressureOperator(), UnderwaterResistanceOperator(), ProfitDecayOperator(), PullbackStrongOperator(), HurstExponentFlowOperator(), VolWeightedHistoricalPositionOperator(), CrossSectionalRankOperator('close'), CrossSectionalNetLargeFlowRankOperator(), CrossSectionalFlowDivergenceRankOperator(), CrossSectionalRelativeProfitMarginRankOperator(), CrossSectionalCostBreadthRankOperator(), CrossSectionalWinnerRateRankOperator(), CrossSectionalVolumeRatioRankOperator(), ComplexFactorDEAPOperator(), ] def apply_complex_factors(df: pl.DataFrame, operators: List = None) -> pl.DataFrame: """ 应用所有复杂组合因子 Args: df: 输入的Polars DataFrame operators: 要应用的算子列表,如果为None则使用默认列表 Returns: 添加了复杂组合因子的DataFrame """ if operators is None: operators = COMPLEX_OPERATORS result_df = df for operator in operators: result_df = operator(result_df) return result_df # 主应用函数 def apply_all_factors(df: pl.DataFrame, factor_categories: List[str] = None) -> pl.DataFrame: """ 应用所有类别的因子 Args: df: 输入的Polars DataFrame factor_categories: 要应用的因子类别列表,如果为None则应用所有类别 Returns: 添加了所有因子的DataFrame """ if factor_categories is None: factor_categories = ['money_flow', 'chip', 'volatility', 'volume', 'technical', 'sentiment', 'momentum', 'complex'] result_df = df # 导入所有因子模块 from polars_money_flow_factors import apply_money_flow_factors from polars_chip_factors import apply_chip_distribution_factors from polars_volatility_factors import apply_volatility_factors from polars_volume_factors import apply_volume_factors from polars_technical_factors import apply_technical_factors from polars_sentiment_factors import apply_sentiment_factors from polars_momentum_factors import apply_momentum_factors # 应用各类因子 if 'money_flow' in factor_categories: result_df = apply_money_flow_factors(result_df) if 'chip' in factor_categories: result_df = apply_chip_distribution_factors(result_df) if 'volatility' in factor_categories: result_df = apply_volatility_factors(result_df) if 'volume' in factor_categories: result_df = apply_volume_factors(result_df) if 'technical' in factor_categories: result_df = apply_technical_factors(result_df) if 'sentiment' in factor_categories: result_df = apply_sentiment_factors(result_df) if 'momentum' in factor_categories: result_df = apply_momentum_factors(result_df) if 'complex' in factor_categories: result_df = apply_complex_factors(result_df) return result_df