- 添加 19 个筹码分布和胜率相关因子(包括chip_dispersion、winner_rate等系列)
- LightGBM模型添加早停和训练指标记录功能
- 统一Label配置到common.py模块
- 新增list_factors.py因子列表脚本
323 lines
12 KiB
Python
323 lines
12 KiB
Python
"""因子批量注册脚本。
|
||
|
||
使用 FactorManager 批量注册因子,用户只需提供 name、desc 和表达式,
|
||
自动生成 factor_id 并保存到 factors.jsonl。
|
||
|
||
使用方法:
|
||
1. 在 FACTORS 列表中添加因子定义
|
||
2. 运行: uv run python src/scripts/register_factors.py
|
||
|
||
示例:
|
||
FACTORS = [
|
||
{
|
||
"name": "mom_5d",
|
||
"desc": "5日价格动量",
|
||
"dsl": "cs_rank(close / ts_delay(close, 5) - 1)",
|
||
"category": "momentum", # 可选扩展字段
|
||
},
|
||
...
|
||
]
|
||
"""
|
||
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
from src.factors.metadata import FactorManager
|
||
from src.factors.metadata.exceptions import DuplicateFactorError, ValidationError
|
||
from src.config.settings import get_settings
|
||
|
||
|
||
# ============================================================================
# User configuration area - add the factors to register here
# ============================================================================

# Each entry is a dict. validate_factor() in this file requires non-empty
# "name", "desc" and "dsl" keys; register_factors() additionally copies the
# optional "category", "author", "tags" and "notes" keys when present.
FACTORS: List[Dict[str, Any]] = [
    # ==================== Group 1: chip concentration and dispersion factors ====================
    {
        "name": "chip_dispersion_90",
        "desc": "90%筹码离散度:衡量市场90%持仓筹码的宽度,值越小表示筹码越高度集中(单峰密集),往往是洗盘结束的前兆",
        "dsl": "(cost_95pct - cost_5pct) / (cost_95pct + cost_5pct)",
    },
    {
        "name": "chip_dispersion_70",
        "desc": "70%核心筹码离散度:剔除极端的底部死筹和高位套牢盘,反映中间70%主流资金的成本集中度",
        "dsl": "(cost_85pct - cost_15pct) / (cost_85pct + cost_15pct)",
    },
    {
        "name": "cost_skewness",
        "desc": "筹码偏度:反映筹码分布的不对称性。大于1说明上方套牢盘拖尾严重,小于1说明下方获利盘雄厚",
        "dsl": "(cost_95pct - cost_50pct) / (cost_50pct - cost_5pct)",
    },
    {
        "name": "dispersion_change_20",
        "desc": "筹码集中度近期变化率:过去20天筹码宽度的变化比例,持续下降说明主力正在暗中吸筹",
        "dsl": "ts_pct_change((cost_95pct - cost_5pct) / cost_50pct, 20)",
    },

    # ==================== Group 2: price position relative to chip costs, pressure/support factors ====================
    {
        "name": "price_to_avg_cost",
        "desc": "整体浮盈比例:当前价格相对加权平均成本的溢价率。高溢价有均值回归压力,负溢价代表超跌",
        "dsl": "(close - weight_avg) / weight_avg",
    },
    {
        "name": "price_to_median_cost",
        "desc": "中位数成本偏离度:价格相对于50%分位点(绝对半数人持仓价)的偏离,向上突破通常是右侧买点",
        "dsl": "(close - cost_50pct) / cost_50pct",
    },
    {
        "name": "mean_median_dev",
        "desc": "均值中位数背离:均值显著大于中位数说明高位筹码堆积,上涨阻力大",
        "dsl": "(weight_avg - cost_50pct) / cost_50pct",
    },
    {
        "name": "trap_pressure",
        "desc": "高位套牢盘压力指数:当前价格距离上方95%高位套牢成本的距离。距离越大,反弹的真空期阻力越小",
        "dsl": "(cost_95pct - close) / close",
    },
    {
        "name": "bottom_profit",
        "desc": "底部支撑底仓利润率:当前价格距离底部5%筹码的利润空间。暴跌时大于0说明底仓极度稳定",
        "dsl": "(close - cost_5pct) / cost_5pct",
    },
    {
        "name": "history_position",
        "desc": "历史区间分位点:当前价格在个股上市以来历史最高点和最低点之间的相对位置",
        "dsl": "(close - his_low) / (his_high - his_low)",
    },

    # ==================== Group 3: winner-rate momentum and reversal factors ====================
    {
        "name": "winner_rate_surge_5",
        "desc": "获利盘短期爆发力:胜率在过去5天内的变化值,急剧上升是极强的动量做多信号",
        "dsl": "ts_delta(winner_rate, 5)",
    },
    {
        "name": "winner_rate_cs_rank",
        "desc": "获利盘高位反转信号:全市场胜率截面排名,极端高胜率往往面临多头踩踏的获利了结压力(反转因子)",
        "dsl": "cs_rank(winner_rate)",
    },
    {
        "name": "winner_rate_dev_20",
        "desc": "获利盘均线偏离:当前胜率相对过去20天平均胜率的偏离程度,捕捉筹码情绪的边际超买/超卖",
        "dsl": "winner_rate - ts_mean(winner_rate, 20)",
    },
    {
        "name": "winner_rate_volatility",
        "desc": "获利盘波动率:过去20天胜率的波动率。波动率低且胜率高说明单边上涨极度稳健",
        "dsl": "ts_std(winner_rate, 20)",
    },
    {
        "name": "smart_money_accumulation",
        "desc": "潜在主力吸筹隐蔽指标:胜率的60日时序分位数减去价格的时序分位数。值越大说明‘价平而获利盘增’,底部吸筹明显",
        "dsl": "ts_rank(winner_rate, 60) - ts_rank(close, 60)",
    },

    # ==================== Group 4: volume/price and chip interaction factors ====================
    {
        "name": "winner_vol_corr_20",
        "desc": "放量突破筹码密集区:胜率与成交量的20日时序相关性,正相关说明增量资金在主动解套上方筹码",
        "dsl": "ts_corr(winner_rate, vol, 20)",
    },
    {
        "name": "cost_base_momentum",
        "desc": "成本重心上移换手率:过去20天加权平均成本的变化幅度,快速上移说明高位换手极其充分",
        "dsl": "ts_pct_change(weight_avg, 20)",
    },
    {
        "name": "bottom_cost_stability",
        "desc": "底部坚如磐石因子:底部5%成本的60天波动率相对于中位数的比值,波动越小说明死筹越稳固",
        "dsl": "ts_std(cost_5pct, 60) / cost_50pct",
    },
    {
        "name": "pivot_reversion",
        "desc": "盈亏分界线乖离修复:价格偏离50%分位点除以近20日价格标准差,用于寻找超跌后的均值回归买点",
        "dsl": "(close - cost_50pct) / ts_std(close, 20)",
    },
    {
        "name": "chip_transition",
        "desc": "强弱筹码切换度:上方厚度与下方厚度差值的20日变化量。由正变负说明筹码彻底完成了自上而下的转移(洗盘结束)",
        "dsl": "ts_delta((cost_85pct - cost_50pct) - (cost_50pct - cost_15pct), 20)",
    },
]
|
||
|
||
# Factor storage path: "factors.jsonl" under the resolved data path from the
# project settings (per the original note, the data directory under the
# project root).
# NOTE: get_settings() is called at import time, so importing this module
# loads the settings as a side effect.
settings = get_settings()
OUTPUT_PATH = settings.data_path_resolved / "factors.jsonl"
|
||
|
||
|
||
# ============================================================================
|
||
# 核心实现
|
||
# ============================================================================
|
||
|
||
|
||
def get_next_factor_id(filepath: Path) -> str:
    """Generate the next factor_id for a JSONL factor file.

    Scans every line of ``filepath`` for a ``factor_id`` of the form
    ``F_<digits>``, takes the largest number found and returns that number
    plus one, zero-padded to at least three digits.

    Lines are ignored (never fatal) when they are blank, are not valid JSON,
    are valid JSON but not an object, or carry a ``factor_id`` that is
    missing, not a string, or not in ``F_<digits>`` form.

    Args:
        filepath: Path of the JSONL file, one factor record per line.

    Returns:
        The new factor_id, e.g. ``"F_001"``. ``"F_001"`` is also returned
        when the file does not exist, cannot be read or decoded, or holds
        no usable IDs.
    """
    if not filepath.exists():
        return "F_001"

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except (OSError, UnicodeError):
        # Best-effort fallback: an unreadable or undecodable file is treated
        # like an empty one instead of aborting the registration run.
        return "F_001"

    max_num = 0
    pattern = re.compile(r"F_(\d+)")

    for line in lines:
        line = line.strip()
        if not line:
            continue

        try:
            data = json.loads(line)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

        # A line may be valid JSON without being an object (list, number,
        # string, null); such a value has no "factor_id" to inspect.
        factor_id = data.get("factor_id") if isinstance(data, dict) else None
        if not isinstance(factor_id, str):
            continue

        # fullmatch: the whole ID must be "F_<digits>", with no trailing
        # newline or other characters tolerated.
        match = pattern.fullmatch(factor_id)
        if match is None:
            continue

        try:
            max_num = max(max_num, int(match.group(1)))
        except ValueError:
            # int() can reject extremely long digit strings; skip those.
            continue

    return f"F_{max_num + 1:03d}"
|
||
|
||
|
||
def validate_factor(factor: Dict[str, Any]) -> None:
    """Validate a single factor definition.

    Checks that the ``name``, ``desc`` and ``dsl`` keys are present with
    truthy values, and that ``name`` is a string made only of ASCII letters,
    digits and underscores that does not start with a digit.

    Args:
        factor: Factor definition dict.

    Raises:
        ValueError: If a required field is missing or empty, or if the name
            is not a string or has an invalid format.
    """
    required_fields = ["name", "desc", "dsl"]
    for field in required_fields:
        if field not in factor or not factor[field]:
            raise ValueError(f"因子缺少必填字段 '{field}'")

    # Validate the name format (letters, digits and underscores only).
    # fullmatch is used because "$" with re.match also matches just before a
    # trailing newline, which would let a name such as "abc\n" through.
    # The isinstance guard turns a non-string name into the documented
    # ValueError instead of a TypeError from the regex engine.
    name = factor["name"]
    if not isinstance(name, str) or not re.fullmatch(
        r"[a-zA-Z_][a-zA-Z0-9_]*", name
    ):
        raise ValueError(
            f"因子名称 '{name}' 格式无效,只允许字母、数字、下划线"
        )
|
||
|
||
|
||
def register_factors(
    factors: List[Dict[str, Any]],
    output_path: Optional[Path] = None,
    skip_duplicates: bool = True,
) -> Dict[str, List[str]]:
    """Register a batch of factor definitions through ``FactorManager``.

    Each definition is validated, checked against existing factors with the
    same name, given a freshly generated ``factor_id`` and handed to
    ``manager.add_factor``. A failure on one factor is reported on stdout
    and recorded; it never stops the rest of the batch.

    Args:
        factors: Factor definitions to register.
        output_path: Target JSONL file; falls back to ``OUTPUT_PATH``.
        skip_duplicates: When true, a factor whose name already exists is
            recorded as skipped; when false it is recorded as failed.

    Returns:
        A dict with the keys ``"success"``, ``"failed"`` and ``"skipped"``,
        each mapping to the list of factor names with that outcome.
    """
    target = output_path or OUTPUT_PATH
    manager = FactorManager(str(target))

    results = {"success": [], "failed": [], "skipped": []}
    optional_keys = ("category", "author", "tags", "notes")

    for spec in factors:
        try:
            validate_factor(spec)
            name = spec["name"]

            # Look for already-registered factors carrying the same name.
            duplicates = manager.get_factors_by_name(name)
            if len(duplicates) > 0:
                if not skip_duplicates:
                    raise DuplicateFactorError(name)
                results["skipped"].append(name)
                print(f"[跳过] 因子 '{name}' 已存在")
                continue

            # The ID is derived from the target file on every iteration.
            # NOTE(review): presumably add_factor persists each record before
            # the next ID is generated - confirm against FactorManager.
            new_id = get_next_factor_id(target)

            record = {
                "factor_id": new_id,
                "name": name,
                "desc": spec["desc"],
                "dsl": spec["dsl"],
            }
            # Carry over whichever optional extension fields were supplied.
            record.update({key: spec[key] for key in optional_keys if key in spec})

            manager.add_factor(record)
            results["success"].append(name)
            print(f"[成功] {new_id}: {name}")

        except (DuplicateFactorError, ValidationError, ValueError) as exc:
            label = spec.get("name", "unknown")
            results["failed"].append(label)
            print(f"[失败] 因子 '{label}': {exc}")

        except Exception as exc:
            # Anything unexpected is reported with a distinct prefix, but it
            # still must not abort the remaining factors.
            label = spec.get("name", "unknown")
            results["failed"].append(label)
            print(f"[错误] 因子 '{label}': {exc}")

    return results
|
||
|
||
|
||
def main():
    """Entry point: register everything in ``FACTORS`` and print a summary."""
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    # Banner with the target file and the number of configured factors.
    print(heavy_rule)
    print("因子批量注册工具")
    print(heavy_rule)
    print(f"目标文件: {OUTPUT_PATH}")
    print(f"待注册因子数: {len(FACTORS)}")
    print(light_rule)

    # Nothing configured: warn and stop before touching the factor store.
    if not FACTORS:
        print("[警告] FACTORS 列表为空,请在脚本中配置要注册的因子")
        return

    outcome = register_factors(FACTORS)
    registered = outcome["success"]
    skipped = outcome["skipped"]
    failed = outcome["failed"]

    # Per-outcome counts.
    print(light_rule)
    print("注册完成:")
    print(f" 成功: {len(registered)} 个")
    print(f" 跳过: {len(skipped)} 个")
    print(f" 失败: {len(failed)} 个")

    # Name lists, printed only for the non-empty outcomes.
    if registered:
        print(f"\n已注册因子: {', '.join(registered)}")
    if skipped:
        print(f"已跳过因子: {', '.join(skipped)}")
    if failed:
        print(f"失败因子: {', '.join(failed)}")


if __name__ == "__main__":
    main()
|