Files
ProStock/src/scripts/register_factors.py

323 lines
12 KiB
Python
Raw Normal View History

"""因子批量注册脚本。
使用 FactorManager 批量注册因子。用户只需提供 name、desc 和表达式(dsl),
脚本会自动生成 factor_id 并保存到 factors.jsonl。
使用方法:
1. 在 FACTORS 列表中添加因子定义
2. 运行: uv run python src/scripts/register_factors.py
示例:
FACTORS = [
{
"name": "mom_5d",
"desc": "5日价格动量",
"dsl": "cs_rank(close / ts_delay(close, 5) - 1)",
"category": "momentum", # 可选扩展字段
},
...
]
"""
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.factors.metadata import FactorManager
from src.factors.metadata.exceptions import DuplicateFactorError, ValidationError
from src.config.settings import get_settings
# ============================================================================
# User configuration area - add the factors to be registered here
# ============================================================================
# Each entry is a dict. validate_factor() requires non-empty "name", "desc"
# and "dsl" keys, and "name" must be an identifier-style string.
# register_factors() additionally copies the optional keys "category",
# "author", "tags" and "notes" when present.
# NOTE(review): the column names used in the DSL strings (cost_5pct,
# weight_avg, winner_rate, his_low, ...) are presumably chip-distribution
# fields provided by the data layer - confirm against the DSL engine.
FACTORS: List[Dict[str, Any]] =[
# ==================== Category 1: chip concentration and dispersion factors ====================
{
"name": "chip_dispersion_90",
"desc": "90%筹码离散度衡量市场90%持仓筹码的宽度,值越小表示筹码越高度集中(单峰密集),往往是洗盘结束的前兆",
"dsl": "(cost_95pct - cost_5pct) / (cost_95pct + cost_5pct)",
},
{
"name": "chip_dispersion_70",
"desc": "70%核心筹码离散度剔除极端的底部死筹和高位套牢盘反映中间70%主流资金的成本集中度",
"dsl": "(cost_85pct - cost_15pct) / (cost_85pct + cost_15pct)",
},
{
"name": "cost_skewness",
"desc": "筹码偏度反映筹码分布的不对称性。大于1说明上方套牢盘拖尾严重小于1说明下方获利盘雄厚",
"dsl": "(cost_95pct - cost_50pct) / (cost_50pct - cost_5pct)",
},
{
"name": "dispersion_change_20",
"desc": "筹码集中度近期变化率过去20天筹码宽度的变化比例持续下降说明主力正在暗中吸筹",
"dsl": "ts_pct_change((cost_95pct - cost_5pct) / cost_50pct, 20)",
},
# ==================== Category 2: relative chip position and pressure/support factors ====================
{
"name": "price_to_avg_cost",
"desc": "整体浮盈比例:当前价格相对加权平均成本的溢价率。高溢价有均值回归压力,负溢价代表超跌",
"dsl": "(close - weight_avg) / weight_avg",
},
{
"name": "price_to_median_cost",
"desc": "中位数成本偏离度价格相对于50%分位点(绝对半数人持仓价)的偏离,向上突破通常是右侧买点",
"dsl": "(close - cost_50pct) / cost_50pct",
},
{
"name": "mean_median_dev",
"desc": "均值中位数背离:均值显著大于中位数说明高位筹码堆积,上涨阻力大",
"dsl": "(weight_avg - cost_50pct) / cost_50pct",
},
{
"name": "trap_pressure",
"desc": "高位套牢盘压力指数当前价格距离上方95%高位套牢成本的距离。距离越大,反弹的真空期阻力越小",
"dsl": "(cost_95pct - close) / close",
},
{
"name": "bottom_profit",
"desc": "底部支撑底仓利润率当前价格距离底部5%筹码的利润空间。暴跌时大于0说明底仓极度稳定",
"dsl": "(close - cost_5pct) / cost_5pct",
},
{
"name": "history_position",
"desc": "历史区间分位点:当前价格在个股上市以来历史最高点和最低点之间的相对位置",
"dsl": "(close - his_low) / (his_high - his_low)",
},
# ==================== Category 3: winner-rate momentum and reversal factors ====================
{
"name": "winner_rate_surge_5",
"desc": "获利盘短期爆发力胜率在过去5天内的变化值急剧上升是极强的动量做多信号",
"dsl": "ts_delta(winner_rate, 5)",
},
{
"name": "winner_rate_cs_rank",
"desc": "获利盘高位反转信号:全市场胜率截面排名,极端高胜率往往面临多头踩踏的获利了结压力(反转因子)",
"dsl": "cs_rank(winner_rate)",
},
{
"name": "winner_rate_dev_20",
"desc": "获利盘均线偏离当前胜率相对过去20天平均胜率的偏离程度捕捉筹码情绪的边际超买/超卖",
"dsl": "winner_rate - ts_mean(winner_rate, 20)",
},
{
"name": "winner_rate_volatility",
"desc": "获利盘波动率过去20天胜率的波动率。波动率低且胜率高说明单边上涨极度稳健",
"dsl": "ts_std(winner_rate, 20)",
},
{
"name": "smart_money_accumulation",
"desc": "潜在主力吸筹隐蔽指标胜率的60日时序分位数减去价格的时序分位数。值越大说明价平而获利盘增底部吸筹明显",
"dsl": "ts_rank(winner_rate, 60) - ts_rank(close, 60)",
},
# ==================== Category 4: price-volume and chip interaction factors ====================
{
"name": "winner_vol_corr_20",
"desc": "放量突破筹码密集区胜率与成交量的20日时序相关性正相关说明增量资金在主动解套上方筹码",
"dsl": "ts_corr(winner_rate, vol, 20)",
},
{
"name": "cost_base_momentum",
"desc": "成本重心上移换手率过去20天加权平均成本的变化幅度快速上移说明高位换手极其充分",
"dsl": "ts_pct_change(weight_avg, 20)",
},
{
"name": "bottom_cost_stability",
"desc": "底部坚如磐石因子底部5%成本的60天波动率相对于中位数的比值波动越小说明死筹越稳固",
"dsl": "ts_std(cost_5pct, 60) / cost_50pct",
},
{
"name": "pivot_reversion",
"desc": "盈亏分界线乖离修复价格偏离50%分位点除以近20日价格标准差用于寻找超跌后的均值回归买点",
"dsl": "(close - cost_50pct) / ts_std(close, 20)",
},
{
"name": "chip_transition",
"desc": "强弱筹码切换度上方厚度与下方厚度差值的20日变化量。由正变负说明筹码彻底完成了自上而下的转移洗盘结束",
"dsl": "ts_delta((cost_85pct - cost_50pct) - (cost_50pct - cost_15pct), 20)",
},
]
# Factor storage path (uses the data directory under the project root).
# Both statements run at import time: get_settings() is called when this
# module is loaded, and OUTPUT_PATH is the default target of register_factors().
settings = get_settings()
OUTPUT_PATH = settings.data_path_resolved / "factors.jsonl"
# ============================================================================
# Core implementation
# ============================================================================
def get_next_factor_id(filepath: Path) -> str:
    """Generate the next sequential factor ID.

    Scans an existing JSONL file for ``factor_id`` values of the form
    ``F_<digits>``, takes the largest number found and returns the next one.

    Args:
        filepath: Path to the JSONL file (a ``str`` is accepted as well).

    Returns:
        A new factor ID such as ``"F_001"``. The number is zero-padded to at
        least three digits; larger numbers simply grow (``"F_1000"``).
        ``"F_001"`` is returned when the file is missing, unreadable, empty,
        or contains no usable ``factor_id``.
    """
    filepath = Path(filepath)
    if not filepath.exists():
        return "F_001"
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except (OSError, UnicodeError):
        # Best effort: an unreadable file is treated like a missing one.
        return "F_001"

    max_num = 0
    pattern = re.compile(r"^F_(\d+)$")
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            data = json.loads(line)
            # Valid JSON is not necessarily an object: skip arrays, strings,
            # numbers, etc. instead of crashing on ``.get``.
            if not isinstance(data, dict):
                continue
            factor_id = data.get("factor_id", "")
            # A null / numeric factor_id cannot be matched by the regex.
            if not isinstance(factor_id, str):
                continue
            match = pattern.match(factor_id)
            if match:
                max_num = max(max_num, int(match.group(1)))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; int() failures
            # are covered too. Malformed lines are ignored.
            continue
    return f"F_{max_num + 1:03d}"
def validate_factor(factor: Dict[str, Any]) -> None:
    """Validate a single factor definition.

    Checks that the required keys ``name``, ``desc`` and ``dsl`` are present
    and non-empty, and that ``name`` is an ASCII identifier-style string
    (letters, digits and underscores, not starting with a digit).

    Args:
        factor: Factor definition dictionary.

    Raises:
        ValueError: If a required field is missing or empty, or if ``name``
            is not a string in the allowed format.
    """
    required_fields = ["name", "desc", "dsl"]
    for field in required_fields:
        if field not in factor or not factor[field]:
            raise ValueError(f"因子缺少必填字段 '{field}'")

    name = factor["name"]
    # fullmatch (rather than match with ``$``) rejects a trailing newline;
    # the isinstance check turns a non-string name into the documented
    # ValueError instead of a TypeError raised by the regex engine.
    if not isinstance(name, str) or not re.fullmatch(
        r"[a-zA-Z_][a-zA-Z0-9_]*", name
    ):
        raise ValueError(
            f"因子名称 '{name}' 格式无效,只允许字母、数字、下划线"
        )
def register_factors(
    factors: List[Dict[str, Any]],
    output_path: Optional[Path] = None,
    skip_duplicates: bool = True,
) -> Dict[str, List[str]]:
    """Register a batch of factors through a FactorManager.

    Each definition is validated, checked against already-registered names,
    given a freshly generated ``factor_id`` and handed to
    ``manager.add_factor``. A failure on one factor is reported on stdout and
    does not stop the remaining factors from being processed.

    Args:
        factors: List of factor definition dictionaries.
        output_path: Target file path; falls back to ``OUTPUT_PATH`` when
            omitted (or falsy).
        skip_duplicates: When True, a factor whose name already exists is
            recorded as skipped; when False it is recorded as failed.

    Returns:
        A dict with the keys ``"success"``, ``"failed"`` and ``"skipped"``,
        each holding the list of affected factor names.
    """
    output_path = output_path or OUTPUT_PATH
    manager = FactorManager(str(output_path))
    results = {"success": [], "failed": [], "skipped": []}
    # Optional extension fields that are copied through when present.
    optional_keys = ("category", "author", "tags", "notes")

    for factor in factors:
        try:
            validate_factor(factor)

            # Duplicate-name handling: either skip quietly or fail loudly.
            if len(manager.get_factors_by_name(factor["name"])) > 0:
                if not skip_duplicates:
                    raise DuplicateFactorError(factor["name"])
                results["skipped"].append(factor["name"])
                print(f"[跳过] 因子 '{factor['name']}' 已存在")
                continue

            factor_id = get_next_factor_id(output_path)

            # Core fields first, then whichever optional fields were given.
            factor_record = {
                "factor_id": factor_id,
                **{key: factor[key] for key in ("name", "desc", "dsl")},
                **{key: factor[key] for key in optional_keys if key in factor},
            }

            manager.add_factor(factor_record)
            results["success"].append(factor["name"])
            print(f"[成功] {factor_id}: {factor['name']}")
        except (DuplicateFactorError, ValidationError, ValueError) as e:
            # Expected, domain-level failures.
            results["failed"].append(factor.get("name", "unknown"))
            print(f"[失败] 因子 '{factor.get('name', 'unknown')}': {e}")
        except Exception as e:
            # Anything unexpected is still recorded so the batch continues.
            results["failed"].append(factor.get("name", "unknown"))
            print(f"[错误] 因子 '{factor.get('name', 'unknown')}': {e}")

    return results
def main():
    """Script entry point: register every factor in FACTORS and print a summary."""
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    # Banner with the target file and the amount of work to do.
    print(heavy_rule)
    print("因子批量注册工具")
    print(heavy_rule)
    print(f"目标文件: {OUTPUT_PATH}")
    print(f"待注册因子数: {len(FACTORS)}")
    print(light_rule)

    if len(FACTORS) == 0:
        print("[警告] FACTORS 列表为空,请在脚本中配置要注册的因子")
        return

    results = register_factors(FACTORS)

    # Per-outcome counts.
    print(light_rule)
    print("注册完成:")
    print(f" 成功: {len(results['success'])}")
    print(f" 跳过: {len(results['skipped'])}")
    print(f" 失败: {len(results['failed'])}")

    # Name lists are only printed for non-empty outcomes.
    if results["success"]:
        print(f"\n已注册因子: {', '.join(results['success'])}")
    if results["skipped"]:
        print(f"已跳过因子: {', '.join(results['skipped'])}")
    if results["failed"]:
        print(f"失败因子: {', '.join(results['failed'])}")


if __name__ == "__main__":
    main()