"""因子批量注册脚本。 使用 FactorManager 批量注册因子,用户只需提供 name、desc 和表达式, 自动生成 factor_id 并保存到 factors.jsonl。 使用方法: 1. 在 FACTORS 列表中添加因子定义 2. 运行: uv run python src/scripts/register_factors.py 示例: FACTORS = [ { "name": "mom_5d", "desc": "5日价格动量", "dsl": "cs_rank(close / ts_delay(close, 5) - 1)", "category": "momentum", # 可选扩展字段 }, ... ] """ import json import re from pathlib import Path from typing import Any, Dict, List, Optional from src.factors.metadata import FactorManager from src.factors.metadata.exceptions import DuplicateFactorError, ValidationError from src.config.settings import get_settings # ============================================================================ # 用户配置区域 - 在这里添加要注册的因子 # ============================================================================ FACTORS: List[Dict[str, Any]] =[ # ==================== 第一类:筹码集中度与离散度因子 ==================== { "name": "chip_dispersion_90", "desc": "90%筹码离散度:衡量市场90%持仓筹码的宽度,值越小表示筹码越高度集中(单峰密集),往往是洗盘结束的前兆", "dsl": "(cost_95pct - cost_5pct) / (cost_95pct + cost_5pct)", }, { "name": "chip_dispersion_70", "desc": "70%核心筹码离散度:剔除极端的底部死筹和高位套牢盘,反映中间70%主流资金的成本集中度", "dsl": "(cost_85pct - cost_15pct) / (cost_85pct + cost_15pct)", }, { "name": "cost_skewness", "desc": "筹码偏度:反映筹码分布的不对称性。大于1说明上方套牢盘拖尾严重,小于1说明下方获利盘雄厚", "dsl": "(cost_95pct - cost_50pct) / (cost_50pct - cost_5pct)", }, { "name": "dispersion_change_20", "desc": "筹码集中度近期变化率:过去20天筹码宽度的变化比例,持续下降说明主力正在暗中吸筹", "dsl": "ts_pct_change((cost_95pct - cost_5pct) / cost_50pct, 20)", }, # ==================== 第二类:筹码相对位置与压力/支撑因子 ==================== { "name": "price_to_avg_cost", "desc": "整体浮盈比例:当前价格相对加权平均成本的溢价率。高溢价有均值回归压力,负溢价代表超跌", "dsl": "(close - weight_avg) / weight_avg", }, { "name": "price_to_median_cost", "desc": "中位数成本偏离度:价格相对于50%分位点(绝对半数人持仓价)的偏离,向上突破通常是右侧买点", "dsl": "(close - cost_50pct) / cost_50pct", }, { "name": "mean_median_dev", "desc": "均值中位数背离:均值显著大于中位数说明高位筹码堆积,上涨阻力大", "dsl": "(weight_avg - cost_50pct) / cost_50pct", }, { "name": "trap_pressure", "desc": 
"高位套牢盘压力指数:当前价格距离上方95%高位套牢成本的距离。距离越大,反弹的真空期阻力越小", "dsl": "(cost_95pct - close) / close", }, { "name": "bottom_profit", "desc": "底部支撑底仓利润率:当前价格距离底部5%筹码的利润空间。暴跌时大于0说明底仓极度稳定", "dsl": "(close - cost_5pct) / cost_5pct", }, { "name": "history_position", "desc": "历史区间分位点:当前价格在个股上市以来历史最高点和最低点之间的相对位置", "dsl": "(close - his_low) / (his_high - his_low)", }, # ==================== 第三类:胜率相关的动量与反转因子 ==================== { "name": "winner_rate_surge_5", "desc": "获利盘短期爆发力:胜率在过去5天内的变化值,急剧上升是极强的动量做多信号", "dsl": "ts_delta(winner_rate, 5)", }, { "name": "winner_rate_cs_rank", "desc": "获利盘高位反转信号:全市场胜率截面排名,极端高胜率往往面临多头踩踏的获利了结压力(反转因子)", "dsl": "cs_rank(winner_rate)", }, { "name": "winner_rate_dev_20", "desc": "获利盘均线偏离:当前胜率相对过去20天平均胜率的偏离程度,捕捉筹码情绪的边际超买/超卖", "dsl": "winner_rate - ts_mean(winner_rate, 20)", }, { "name": "winner_rate_volatility", "desc": "获利盘波动率:过去20天胜率的波动率。波动率低且胜率高说明单边上涨极度稳健", "dsl": "ts_std(winner_rate, 20)", }, { "name": "smart_money_accumulation", "desc": "潜在主力吸筹隐蔽指标:胜率的60日时序分位数减去价格的时序分位数。值越大说明‘价平而获利盘增’,底部吸筹明显", "dsl": "ts_rank(winner_rate, 60) - ts_rank(close, 60)", }, # ==================== 第四类:量价与筹码交乘因子 ==================== { "name": "winner_vol_corr_20", "desc": "放量突破筹码密集区:胜率与成交量的20日时序相关性,正相关说明增量资金在主动解套上方筹码", "dsl": "ts_corr(winner_rate, vol, 20)", }, { "name": "cost_base_momentum", "desc": "成本重心上移换手率:过去20天加权平均成本的变化幅度,快速上移说明高位换手极其充分", "dsl": "ts_pct_change(weight_avg, 20)", }, { "name": "bottom_cost_stability", "desc": "底部坚如磐石因子:底部5%成本的60天波动率相对于中位数的比值,波动越小说明死筹越稳固", "dsl": "ts_std(cost_5pct, 60) / cost_50pct", }, { "name": "pivot_reversion", "desc": "盈亏分界线乖离修复:价格偏离50%分位点除以近20日价格标准差,用于寻找超跌后的均值回归买点", "dsl": "(close - cost_50pct) / ts_std(close, 20)", }, { "name": "chip_transition", "desc": "强弱筹码切换度:上方厚度与下方厚度差值的20日变化量。由正变负说明筹码彻底完成了自上而下的转移(洗盘结束)", "dsl": "ts_delta((cost_85pct - cost_50pct) - (cost_50pct - cost_15pct), 20)", }, ] # 因子存储路径(使用项目根路径下的 data 目录) settings = get_settings() OUTPUT_PATH = settings.data_path_resolved / "factors.jsonl" # 
# ============================================================================
# Core implementation
# ============================================================================

# Shape of an auto-generated factor id, e.g. "F_001".
_FACTOR_ID_RE = re.compile(r"^F_(\d+)$")
# Identifier-like factor names: letters, digits and underscores, not starting
# with a digit. Applied with fullmatch() so a trailing newline is rejected
# (with match() and "$", "abc\n" would have been accepted).
_FACTOR_NAME_RE = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")
# Keys a factor definition must carry with a non-empty value.
_REQUIRED_FIELDS = ("name", "desc", "dsl")
# Optional keys copied verbatim into the stored record when present.
_OPTIONAL_FIELDS = ("category", "author", "tags", "notes")


def get_next_factor_id(filepath: Path) -> str:
    """Generate the next factor_id for a JSONL factor file.

    Scans every line of ``filepath`` for a ``factor_id`` of the form
    ``F_<digits>`` and returns the largest number found plus one, zero-padded
    to at least three digits. Blank lines, lines that are not valid JSON,
    JSON values that are not objects, and non-string or non-matching
    ``factor_id`` values are ignored.

    Args:
        filepath: Path of the JSONL file.

    Returns:
        The new factor_id, e.g. ``"F_001"``. ``"F_001"`` is also returned
        when the file does not exist or cannot be read or decoded.
    """
    if not filepath.exists():
        return "F_001"

    max_num = 0
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            # Stream the file instead of loading every line into memory.
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A valid JSON line may still be a list/number/string; only
                # objects can carry a factor_id.
                if not isinstance(data, dict):
                    continue
                factor_id = data.get("factor_id", "")
                if not isinstance(factor_id, str):
                    continue
                match = _FACTOR_ID_RE.match(factor_id)
                if not match:
                    continue
                try:
                    max_num = max(max_num, int(match.group(1)))
                except ValueError:
                    # e.g. a digit run longer than the int-conversion limit
                    continue
    except (OSError, UnicodeError):
        # Best effort, as before: an unreadable file restarts numbering.
        return "F_001"

    return f"F_{max_num + 1:03d}"


def validate_factor(factor: Dict[str, Any]) -> None:
    """Check that a factor definition is usable.

    Args:
        factor: Factor definition dictionary.

    Raises:
        ValueError: If ``name``, ``desc`` or ``dsl`` is missing or empty, or
            if ``name`` is not a string made only of letters, digits and
            underscores that does not start with a digit.
    """
    for field in _REQUIRED_FIELDS:
        if field not in factor or not factor[field]:
            raise ValueError(f"因子缺少必填字段 '{field}'")

    # Validate the name format (letters, digits and underscores only). A
    # non-string name is reported as a format error instead of surfacing as
    # a TypeError from the regex engine.
    name = factor["name"]
    if not isinstance(name, str) or not _FACTOR_NAME_RE.fullmatch(name):
        raise ValueError(
            f"因子名称 '{name}' 格式无效,只允许字母、数字、下划线"
        )


def _factor_label(factor: Any) -> str:
    """Return a printable name for *factor*, even when it is malformed."""
    try:
        name = factor.get("name")
    except AttributeError:
        # Not a mapping at all (e.g. None or a string in the input list).
        return "unknown"
    return str(name) if name else "unknown"


def register_factors(
    factors: List[Dict[str, Any]],
    output_path: Optional[Path] = None,
    skip_duplicates: bool = True,
) -> Dict[str, List[str]]:
    """Register a batch of factors.

    Each factor is validated, checked against the names already known to the
    FactorManager, given the next free factor_id and then added through the
    manager. A failure of one factor never aborts the batch.

    Args:
        factors: List of factor definitions.
        output_path: Output file path; defaults to OUTPUT_PATH.
        skip_duplicates: If True, a factor whose name already exists is
            skipped; if False it is recorded as failed.

    Returns:
        A dict with the keys ``"success"``, ``"failed"`` and ``"skipped"``,
        each holding the list of affected factor names.
    """
    # Normalise to Path so a plain string path also works with
    # get_next_factor_id(), which calls filepath.exists().
    output_path = Path(output_path or OUTPUT_PATH)
    manager = FactorManager(str(output_path))

    results: Dict[str, List[str]] = {
        "success": [],
        "failed": [],
        "skipped": [],
    }

    for factor in factors:
        # Resolved up front so the error handlers below cannot raise again
        # on a malformed entry.
        label = _factor_label(factor)
        try:
            # Validate the factor definition
            validate_factor(factor)
            name = factor["name"]

            # Check whether the name already exists
            existing = manager.get_factors_by_name(name)
            if len(existing) > 0:
                if skip_duplicates:
                    results["skipped"].append(name)
                    print(f"[跳过] 因子 '{name}' 已存在")
                    continue
                raise DuplicateFactorError(name)

            # Generate the factor_id. The file is re-read for every factor
            # so each id reflects what has been written so far.
            factor_id = get_next_factor_id(output_path)

            # Build the full factor record
            factor_record = {
                "factor_id": factor_id,
                "name": name,
                "desc": factor["desc"],
                "dsl": factor["dsl"],
            }

            # Add optional extension fields
            factor_record.update(
                {key: factor[key] for key in _OPTIONAL_FIELDS if key in factor}
            )

            # Register the factor
            manager.add_factor(factor_record)
            results["success"].append(name)
            print(f"[成功] {factor_id}: {name}")

        except (DuplicateFactorError, ValidationError, ValueError) as e:
            # Expected, per-factor rejections
            results["failed"].append(label)
            print(f"[失败] 因子 '{label}': {e}")
        except Exception as e:
            # Anything unexpected is reported but must not stop the batch
            results["failed"].append(label)
            print(f"[错误] 因子 '{label}': {e}")

    return results


def main():
    """Register every factor in FACTORS and print a summary."""
    print("=" * 60)
    print("因子批量注册工具")
    print("=" * 60)
    print(f"目标文件: {OUTPUT_PATH}")
    print(f"待注册因子数: {len(FACTORS)}")
    print("-" * 60)

    if not FACTORS:
        print("[警告] FACTORS 列表为空,请在脚本中配置要注册的因子")
        return

    results = register_factors(FACTORS)

    print("-" * 60)
    print("注册完成:")
    print(f"  成功: {len(results['success'])} 个")
    print(f"  跳过: {len(results['skipped'])} 个")
    print(f"  失败: {len(results['failed'])} 个")

    if results["success"]:
        print(f"\n已注册因子: {', '.join(results['success'])}")
    if results["skipped"]:
        print(f"已跳过因子: {', '.join(results['skipped'])}")
    if results["failed"]:
        print(f"失败因子: {', '.join(results['failed'])}")


if __name__ == "__main__":
    main()