diff --git a/src/scripts/register_factors.py b/src/scripts/register_factors.py new file mode 100644 index 0000000..fb37b70 --- /dev/null +++ b/src/scripts/register_factors.py @@ -0,0 +1,244 @@ +"""因子批量注册脚本。 + +使用 FactorManager 批量注册因子,用户只需提供 name、desc 和表达式, +自动生成 factor_id 并保存到 factors.jsonl。 + +使用方法: + 1. 在 FACTORS 列表中添加因子定义 + 2. 运行: uv run python src/scripts/register_factors.py + +示例: + FACTORS = [ + { + "name": "mom_5d", + "desc": "5日价格动量", + "dsl": "cs_rank(close / ts_delay(close, 5) - 1)", + "category": "momentum", # 可选扩展字段 + }, + ... + ] +""" + +import json +import re +from pathlib import Path +from typing import Any, Dict, List, Optional + +from src.factors.metadata import FactorManager +from src.factors.metadata.exceptions import DuplicateFactorError, ValidationError + + +# ============================================================================ +# 用户配置区域 - 在这里添加要注册的因子 +# ============================================================================ + +FACTORS: List[Dict[str, Any]] = [ + # 示例因子,请根据实际需要修改或添加 + { + "name": "mom_5d", + "desc": "5日价格动量,收盘价相对于5日前收盘价的涨跌幅进行截面排名", + "dsl": "cs_rank(close / ts_delay(close, 5) - 1)", + "category": "momentum", + }, + { + "name": "mom_20d", + "desc": "20日价格动量,收盘价相对于20日前收盘价的涨跌幅进行截面排名", + "dsl": "cs_rank(close / ts_delay(close, 20) - 1)", + "category": "momentum", + }, + { + "name": "volatility_20d", + "desc": "20日价格波动率,收益率的20日滚动标准差", + "dsl": "ts_std(ts_delta(close, 1) / ts_delay(close, 1), 20)", + "category": "volatility", + }, + { + "name": "price_ma_ratio", + "desc": "价格与20日均线的偏离度", + "dsl": "close / ts_mean(close, 20) - 1", + "category": "mean_reversion", + }, + { + "name": "volume_ratio", + "desc": "成交量比率,当日成交量相对于20日均量的比值", + "dsl": "volume / ts_mean(volume, 20)", + "category": "volume", + }, +] + +# 因子存储路径(默认使用实验目录) +OUTPUT_PATH = Path(__file__).parent.parent / "experiment" / "data" / "factors.jsonl" + + +# ============================================================================ +# 核心实现 +# ============================================================================ + + +def get_next_factor_id(filepath: Path) -> str: + """生成下一个 factor_id。 + + 从现有文件中提取最大序号,生成新的 F_XXX 格式 ID。 + + Args: + filepath: JSONL 文件路径 + + Returns: + 新的 factor_id,如 "F_001" + """ + if not filepath.exists(): + return "F_001" + + try: + with open(filepath, "r", encoding="utf-8") as f: + lines = f.readlines() + except Exception: + return "F_001" + + max_num = 0 + pattern = re.compile(r"^F_(\d+)$") + + for line in lines: + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + factor_id = data.get("factor_id", "") + match = pattern.match(factor_id) + if match: + num = int(match.group(1)) + max_num = max(max_num, num) + except (json.JSONDecodeError, ValueError): + continue + + return f"F_{max_num + 1:03d}" + + +def validate_factor(factor: Dict[str, Any]) -> None: + """验证因子定义是否有效。 + + Args: + factor: 因子定义字典 + + Raises: + ValueError: 验证失败时抛出 + """ + required_fields = ["name", "desc", "dsl"] + for field in required_fields: + if field not in factor or not factor[field]: + raise ValueError(f"因子缺少必填字段 '{field}'") + + # 验证 name 格式(只允许字母、数字、下划线) + if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", factor["name"]): + raise ValueError( + f"因子名称 '{factor['name']}' 格式无效,只允许字母、数字、下划线" + ) + + +def register_factors( + factors: List[Dict[str, Any]], + output_path: Optional[Path] = None, + skip_duplicates: bool = True, +) -> Dict[str, List[str]]: + """批量注册因子。 + + Args: + factors: 因子定义列表 + output_path: 输出文件路径,默认使用 OUTPUT_PATH + skip_duplicates: 遇到重复因子是否跳过而不是报错 + + Returns: + 注册结果统计,包含成功列表和失败列表 + """ + output_path = output_path or OUTPUT_PATH + manager = FactorManager(str(output_path)) + + results = { + "success": [], + "failed": [], + "skipped": [], + } + + for factor in factors: + try: + # 验证因子定义 + validate_factor(factor) + + # 检查 name 是否已存在 + existing = manager.get_factors_by_name(factor["name"]) + if len(existing) > 0: + if skip_duplicates: + results["skipped"].append(factor["name"]) + print(f"[跳过] 因子 '{factor['name']}' 已存在") + continue + else: + raise DuplicateFactorError(factor["name"]) + + # 生成 factor_id + factor_id = get_next_factor_id(output_path) + + # 构建完整的因子记录 + factor_record = { + "factor_id": factor_id, + "name": factor["name"], + "desc": factor["desc"], + "dsl": factor["dsl"], + } + + # 添加可选扩展字段 + for key in ["category", "author", "tags", "notes"]: + if key in factor: + factor_record[key] = factor[key] + + # 注册因子 + manager.add_factor(factor_record) + results["success"].append(factor["name"]) + print(f"[成功] {factor_id}: {factor['name']}") + + except DuplicateFactorError as e: + results["failed"].append(factor.get("name", "unknown")) + print(f"[失败] 因子 '{factor.get('name', 'unknown')}': {e}") + + except (ValidationError, ValueError) as e: + results["failed"].append(factor.get("name", "unknown")) + print(f"[失败] 因子 '{factor.get('name', 'unknown')}': {e}") + + except Exception as e: + results["failed"].append(factor.get("name", "unknown")) + print(f"[错误] 因子 '{factor.get('name', 'unknown')}': {e}") + + return results + + +def main(): + """主函数。""" + print("=" * 60) + print("因子批量注册工具") + print("=" * 60) + print(f"目标文件: {OUTPUT_PATH}") + print(f"待注册因子数: {len(FACTORS)}") + print("-" * 60) + + if not FACTORS: + print("[警告] FACTORS 列表为空,请在脚本中配置要注册的因子") + return + + results = register_factors(FACTORS) + + print("-" * 60) + print("注册完成:") + print(f" 成功: {len(results['success'])} 个") + print(f" 跳过: {len(results['skipped'])} 个") + print(f" 失败: {len(results['failed'])} 个") + + if results["success"]: + print(f"\n已注册因子: {', '.join(results['success'])}") + if results["skipped"]: + print(f"已跳过因子: {', '.join(results['skipped'])}") + if results["failed"]: + print(f"失败因子: {', '.join(results['failed'])}") + + +if __name__ == "__main__": + main()