Files
ProStock/src/scripts/register_factors.py
liaozhaorun c3d1b157e9 feat(factors): 新增筹码集中度相关因子并优化训练框架
- 添加 19 个筹码分布和胜率相关因子(包括chip_dispersion、winner_rate等系列)
- LightGBM模型添加早停和训练指标记录功能
- 统一Label配置到common.py模块
- 新增list_factors.py因子列表脚本
2026-03-29 01:34:58 +08:00

323 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""因子批量注册脚本。
使用 FactorManager 批量注册因子,用户只需提供 name、desc 和表达式,
自动生成 factor_id 并保存到 factors.jsonl。
使用方法:
1. 在 FACTORS 列表中添加因子定义
2. 运行: uv run python src/scripts/register_factors.py
示例:
FACTORS = [
{
"name": "mom_5d",
"desc": "5日价格动量",
"dsl": "cs_rank(close / ts_delay(close, 5) - 1)",
"category": "momentum", # 可选扩展字段
},
...
]
"""
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.factors.metadata import FactorManager
from src.factors.metadata.exceptions import DuplicateFactorError, ValidationError
from src.config.settings import get_settings
# ============================================================================
# User configuration area - add the factors to register here
# ============================================================================
# Each entry must provide non-empty "name", "desc" and "dsl" values (checked
# by validate_factor). register_factors additionally copies the optional keys
# "category", "author", "tags" and "notes" into the stored record when present.
# The English notes above each entry paraphrase that entry's `desc`; the DSL
# operators and column names themselves are defined outside this file.
FACTORS: List[Dict[str, Any]] =[
    # ==================== Category 1: chip concentration and dispersion factors ====================
    # 90% chip dispersion: smaller values are described as more concentrated chips.
    {
        "name": "chip_dispersion_90",
        "desc": "90%筹码离散度衡量市场90%持仓筹码的宽度,值越小表示筹码越高度集中(单峰密集),往往是洗盘结束的前兆",
        "dsl": "(cost_95pct - cost_5pct) / (cost_95pct + cost_5pct)",
    },
    # 70% core chip dispersion: cost concentration of the middle 70% of holdings.
    {
        "name": "chip_dispersion_70",
        "desc": "70%核心筹码离散度剔除极端的底部死筹和高位套牢盘反映中间70%主流资金的成本集中度",
        "dsl": "(cost_85pct - cost_15pct) / (cost_85pct + cost_15pct)",
    },
    # Chip skewness: asymmetry of the cost distribution around the 50% level.
    {
        "name": "cost_skewness",
        "desc": "筹码偏度反映筹码分布的不对称性。大于1说明上方套牢盘拖尾严重小于1说明下方获利盘雄厚",
        "dsl": "(cost_95pct - cost_50pct) / (cost_50pct - cost_5pct)",
    },
    # Recent rate of change of chip concentration over the past 20 days.
    {
        "name": "dispersion_change_20",
        "desc": "筹码集中度近期变化率过去20天筹码宽度的变化比例持续下降说明主力正在暗中吸筹",
        "dsl": "ts_pct_change((cost_95pct - cost_5pct) / cost_50pct, 20)",
    },
    # ==================== Category 2: relative chip position and pressure/support factors ====================
    # Overall floating-profit ratio: premium of price over the weighted average cost.
    {
        "name": "price_to_avg_cost",
        "desc": "整体浮盈比例:当前价格相对加权平均成本的溢价率。高溢价有均值回归压力,负溢价代表超跌",
        "dsl": "(close - weight_avg) / weight_avg",
    },
    # Deviation of price from the median (50% percentile) cost.
    {
        "name": "price_to_median_cost",
        "desc": "中位数成本偏离度价格相对于50%分位点(绝对半数人持仓价)的偏离,向上突破通常是右侧买点",
        "dsl": "(close - cost_50pct) / cost_50pct",
    },
    # Mean/median divergence of holding cost.
    {
        "name": "mean_median_dev",
        "desc": "均值中位数背离:均值显著大于中位数说明高位筹码堆积,上涨阻力大",
        "dsl": "(weight_avg - cost_50pct) / cost_50pct",
    },
    # Overhead trapped-position pressure: distance from price up to the 95% cost level.
    {
        "name": "trap_pressure",
        "desc": "高位套牢盘压力指数当前价格距离上方95%高位套牢成本的距离。距离越大,反弹的真空期阻力越小",
        "dsl": "(cost_95pct - close) / close",
    },
    # Bottom-position profit margin: price relative to the bottom 5% cost level.
    {
        "name": "bottom_profit",
        "desc": "底部支撑底仓利润率当前价格距离底部5%筹码的利润空间。暴跌时大于0说明底仓极度稳定",
        "dsl": "(close - cost_5pct) / cost_5pct",
    },
    # Relative position of price between the historical high and low.
    {
        "name": "history_position",
        "desc": "历史区间分位点:当前价格在个股上市以来历史最高点和最低点之间的相对位置",
        "dsl": "(close - his_low) / (his_high - his_low)",
    },
    # ==================== Category 3: winner-rate momentum and reversal factors ====================
    # Short-term surge of the winner rate: its change over the past 5 days.
    {
        "name": "winner_rate_surge_5",
        "desc": "获利盘短期爆发力胜率在过去5天内的变化值急剧上升是极强的动量做多信号",
        "dsl": "ts_delta(winner_rate, 5)",
    },
    # Cross-sectional rank of the winner rate (described as a reversal factor).
    {
        "name": "winner_rate_cs_rank",
        "desc": "获利盘高位反转信号:全市场胜率截面排名,极端高胜率往往面临多头踩踏的获利了结压力(反转因子)",
        "dsl": "cs_rank(winner_rate)",
    },
    # Deviation of the current winner rate from its 20-day average.
    {
        "name": "winner_rate_dev_20",
        "desc": "获利盘均线偏离当前胜率相对过去20天平均胜率的偏离程度捕捉筹码情绪的边际超买/超卖",
        "dsl": "winner_rate - ts_mean(winner_rate, 20)",
    },
    # Volatility of the winner rate over the past 20 days.
    {
        "name": "winner_rate_volatility",
        "desc": "获利盘波动率过去20天胜率的波动率。波动率低且胜率高说明单边上涨极度稳健",
        "dsl": "ts_std(winner_rate, 20)",
    },
    # Hidden accumulation indicator: 60-day time-series quantile of winner rate minus that of price.
    {
        "name": "smart_money_accumulation",
        "desc": "潜在主力吸筹隐蔽指标胜率的60日时序分位数减去价格的时序分位数。值越大说明价平而获利盘增底部吸筹明显",
        "dsl": "ts_rank(winner_rate, 60) - ts_rank(close, 60)",
    },
    # ==================== Category 4: price-volume and chip interaction factors ====================
    # 20-day time-series correlation between winner rate and trading volume.
    {
        "name": "winner_vol_corr_20",
        "desc": "放量突破筹码密集区胜率与成交量的20日时序相关性正相关说明增量资金在主动解套上方筹码",
        "dsl": "ts_corr(winner_rate, vol, 20)",
    },
    # Upward shift of the cost base: change of the weighted average cost over 20 days.
    {
        "name": "cost_base_momentum",
        "desc": "成本重心上移换手率过去20天加权平均成本的变化幅度快速上移说明高位换手极其充分",
        "dsl": "ts_pct_change(weight_avg, 20)",
    },
    # Bottom stability: 60-day volatility of the bottom 5% cost relative to the median cost.
    {
        "name": "bottom_cost_stability",
        "desc": "底部坚如磐石因子底部5%成本的60天波动率相对于中位数的比值波动越小说明死筹越稳固",
        "dsl": "ts_std(cost_5pct, 60) / cost_50pct",
    },
    # Pivot reversion: price deviation from the 50% level divided by the 20-day price standard deviation.
    {
        "name": "pivot_reversion",
        "desc": "盈亏分界线乖离修复价格偏离50%分位点除以近20日价格标准差用于寻找超跌后的均值回归买点",
        "dsl": "(close - cost_50pct) / ts_std(close, 20)",
    },
    # Chip transition: 20-day change of (upper thickness - lower thickness).
    {
        "name": "chip_transition",
        "desc": "强弱筹码切换度上方厚度与下方厚度差值的20日变化量。由正变负说明筹码彻底完成了自上而下的转移洗盘结束",
        "dsl": "ts_delta((cost_85pct - cost_50pct) - (cost_50pct - cost_15pct), 20)",
    },
]
# Factor storage file: "factors.jsonl" under settings.data_path_resolved.
# Per the original note this is the data directory under the project root;
# the attribute is defined in src.config.settings and is not visible here.
# NOTE(review): get_settings() runs at import time, so importing this module
# already resolves the settings.
settings = get_settings()
OUTPUT_PATH = settings.data_path_resolved / "factors.jsonl"
# ============================================================================
# Core implementation
# ============================================================================
def get_next_factor_id(filepath: Path) -> str:
    """Generate the next ``F_XXX`` factor id for a JSONL factor file.

    Scans every line of *filepath*, extracts the largest numeric suffix among
    ``factor_id`` values of the form ``F_<digits>`` and returns that number
    plus one, zero-padded to at least three digits (e.g. ``"F_001"``; numbers
    above 999 simply use more digits).

    Lines that are blank, are not valid JSON, are not JSON objects, or whose
    ``factor_id`` is missing, not a string or not in ``F_<digits>`` form are
    ignored rather than aborting the scan.

    Args:
        filepath: Path to the JSONL file holding one factor record per line.

    Returns:
        The new factor id. ``"F_001"`` is returned when the file does not
        exist, cannot be read/decoded, or contains no usable id.
    """
    if not filepath.exists():
        return "F_001"

    pattern = re.compile(r"^F_(\d+)$")
    max_num = 0
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except ValueError:
                    # json.JSONDecodeError is a ValueError: skip malformed lines.
                    continue
                # A valid JSON line may still be a list/number/string; only
                # objects can carry a factor_id.
                if not isinstance(data, dict):
                    continue
                factor_id = data.get("factor_id")
                # Guard against numeric/null ids, which the regex cannot match.
                if not isinstance(factor_id, str):
                    continue
                match = pattern.match(factor_id)
                if match:
                    max_num = max(max_num, int(match.group(1)))
    except (OSError, UnicodeDecodeError):
        # Best-effort fallback kept from the original behaviour: an unreadable
        # file is treated like an empty one.
        return "F_001"

    return f"F_{max_num + 1:03d}"
def validate_factor(factor: Dict[str, Any]) -> None:
    """Validate a single factor definition.

    Args:
        factor: Factor definition dictionary. It must contain non-empty string
            values for ``"name"``, ``"desc"`` and ``"dsl"``.

    Raises:
        ValueError: If a required field is missing or empty, if a required
            field is not a string, or if ``name`` is not made up solely of
            ASCII letters, digits and underscores (not starting with a digit).
    """
    for field in ("name", "desc", "dsl"):
        value = factor.get(field)
        if not value:
            raise ValueError(f"因子缺少必填字段 '{field}'")
        # A non-string value would otherwise surface as a TypeError from the
        # regex below instead of the documented ValueError.
        if not isinstance(value, str):
            raise ValueError(f"因子字段 '{field}' 必须是字符串")

    # fullmatch (rather than match with "$") so that a trailing newline in the
    # name is rejected: "$" also matches just before a final "\n".
    if not re.fullmatch(r"[a-zA-Z_][a-zA-Z0-9_]*", factor["name"]):
        raise ValueError(
            f"因子名称 '{factor['name']}' 格式无效,只允许字母、数字、下划线"
        )
def register_factors(
    factors: List[Dict[str, Any]],
    output_path: Optional[Path] = None,
    skip_duplicates: bool = True,
) -> Dict[str, List[str]]:
    """Register a batch of factor definitions through ``FactorManager``.

    Each definition is validated, checked against already registered names,
    given a fresh ``factor_id`` and handed to ``FactorManager.add_factor``.
    A problem with one factor is reported on stdout and recorded as a failure;
    it never stops the remaining factors from being processed.

    Args:
        factors: Factor definitions to register.
        output_path: Target JSONL file; falls back to ``OUTPUT_PATH`` when
            omitted (or falsy).
        skip_duplicates: When True, a factor whose name already exists is
            recorded under ``"skipped"``; when False it is recorded under
            ``"failed"``.

    Returns:
        A dictionary with the keys ``"success"``, ``"failed"`` and
        ``"skipped"``, each mapping to the list of affected factor names.
    """
    target = output_path or OUTPUT_PATH
    manager = FactorManager(str(target))

    succeeded = []
    failed = []
    skipped = []
    # Extension fields copied into the stored record when the caller set them.
    optional_keys = ("category", "author", "tags", "notes")

    for spec in factors:
        try:
            validate_factor(spec)
            name = spec["name"]

            # Is a factor with this name already registered?
            already_registered = len(manager.get_factors_by_name(name)) > 0
            if already_registered and not skip_duplicates:
                raise DuplicateFactorError(name)
            if already_registered:
                skipped.append(name)
                print(f"[跳过] 因子 '{name}' 已存在")
                continue

            # The id is derived from the target file on every iteration.
            # NOTE(review): presumably add_factor persists to that same file
            # before the next lookup - confirm in FactorManager.
            new_id = get_next_factor_id(target)
            record = {
                "factor_id": new_id,
                "name": name,
                "desc": spec["desc"],
                "dsl": spec["dsl"],
            }
            record.update({key: spec[key] for key in optional_keys if key in spec})

            manager.add_factor(record)
            succeeded.append(name)
            print(f"[成功] {new_id}: {name}")
        except (DuplicateFactorError, ValidationError, ValueError) as exc:
            # Expected rejections: duplicates and invalid definitions.
            label = spec.get("name", "unknown")
            failed.append(label)
            print(f"[失败] 因子 '{label}': {exc}")
        except Exception as exc:
            # Anything unexpected is still reported per factor, not raised.
            label = spec.get("name", "unknown")
            failed.append(label)
            print(f"[错误] 因子 '{label}': {exc}")

    return {
        "success": succeeded,
        "failed": failed,
        "skipped": skipped,
    }
def main():
    """Script entry point: register ``FACTORS`` and print a summary."""
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    # Banner with the target file and the number of configured factors.
    print(heavy_rule)
    print("因子批量注册工具")
    print(heavy_rule)
    print(f"目标文件: {OUTPUT_PATH}")
    print(f"待注册因子数: {len(FACTORS)}")
    print(light_rule)

    # Nothing configured: warn and stop before touching the factor file.
    if not FACTORS:
        print("[警告] FACTORS 列表为空,请在脚本中配置要注册的因子")
        return

    outcome = register_factors(FACTORS)
    succeeded = outcome["success"]
    skipped = outcome["skipped"]
    failed = outcome["failed"]

    # Counts first, then the names of every non-empty group.
    print(light_rule)
    print("注册完成:")
    print(f" 成功: {len(succeeded)}")
    print(f" 跳过: {len(skipped)}")
    print(f" 失败: {len(failed)}")

    if succeeded:
        print(f"\n已注册因子: {', '.join(succeeded)}")
    if skipped:
        print(f"已跳过因子: {', '.join(skipped)}")
    if failed:
        print(f"失败因子: {', '.join(failed)}")
# Run the batch registration only when this file is executed as a script
# (e.g. `uv run python src/scripts/register_factors.py`), not when imported.
if __name__ == "__main__":
    main()