feat(factors): 新增筹码集中度相关因子并优化训练框架

- 添加 20 个筹码分布和胜率相关因子(包括chip_dispersion、winner_rate等系列)
- LightGBM模型添加早停和训练指标记录功能
- 统一Label配置到common.py模块
- 新增list_factors.py因子列表脚本
This commit is contained in:
2026-03-29 01:34:58 +08:00
parent d4e0e2a0b6
commit c3d1b157e9
9 changed files with 373 additions and 246 deletions

View File

@@ -0,0 +1,81 @@
"""列出所有已入库的因子。
以 Python 列表格式输出所有已注册因子的名称,方便复制使用。
保持 factors.jsonl 中的原始顺序(按 factor_id)
使用方法:
uv run python -m src.scripts.list_factors
"""
import json
import re
from pathlib import Path
from src.config.settings import get_settings
def extract_factor_id_number(factor_id: str) -> int:
    """Return the numeric part of a factor id, for use as a sort key.

    Args:
        factor_id: Identifier such as "F_001".

    Returns:
        The digits that follow a leading "F_" as an int (e.g. 1), or 0 when
        the id does not start with "F_" followed by at least one digit.
    """
    # re.match anchors at the start of the string, so "XF_7" does not qualify.
    found = re.match(r"F_(\d+)", factor_id)
    return int(found.group(1)) if found else 0
def list_factors():
    """Print the names of all registered factors as a Python list literal.

    Reads ``factors.jsonl`` under ``settings.data_path_resolved``, keeps every
    record that has both a non-empty ``factor_id`` and a non-empty ``name``,
    sorts the records by the numeric part of ``factor_id`` and prints the
    names one per line between ``[`` and ``]``, followed by a total count.

    Blank lines, lines that are not valid JSON and lines whose JSON value is
    not an object are skipped. A missing file, a read failure or an empty
    result is reported on stdout and the function returns without raising.
    """
    settings = get_settings()
    factors_path = settings.data_path_resolved / "factors.jsonl"
    if not factors_path.exists():
        print(f"[错误] 因子文件不存在: {factors_path}")
        return

    # Collect (factor_id, name) pairs from the JSON Lines file.
    factors = []
    try:
        with open(factors_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Best effort: one malformed line must not hide the rest.
                    continue
                if not isinstance(data, dict):
                    # Valid JSON that is not an object (list, number, ...) has
                    # no fields to read; skip it like a malformed line instead
                    # of aborting the whole listing.
                    continue
                factor_id = data.get("factor_id", "")
                name = data.get("name")
                if name and factor_id:
                    # Coerce to str so a numeric factor_id cannot break the
                    # regex-based sort key used below.
                    factors.append((str(factor_id), name))
    except Exception as e:
        # Script-level boundary: report any read failure and stop.
        print(f"[错误] 读取因子文件失败: {e}")
        return

    if not factors:
        print("[信息] 没有找到任何因子")
        return

    # Sort by the numeric part of factor_id; list.sort is stable, so records
    # with equal keys keep their order from the file.
    factors.sort(key=lambda x: extract_factor_id_number(x[0]))

    # Emit a copy-pasteable Python list: every item but the last gets a comma.
    print("[")
    last_index = len(factors) - 1
    for i, (_, name) in enumerate(factors):
        if i == last_index:
            print(f' "{name}"')
        else:
            print(f' "{name}",')
    print("]")
    print(f"\n[统计] 共计 {len(factors)} 个因子")
# Script entry point: print the factor list only when this module is run
# directly (not when it is imported).
if __name__ == "__main__":
    list_factors()

View File

@@ -26,24 +26,126 @@ from typing import Any, Dict, List, Optional
from src.factors.metadata import FactorManager
from src.factors.metadata.exceptions import DuplicateFactorError, ValidationError
from src.config.settings import get_settings
# ============================================================================
# 用户配置区域 - 在这里添加要注册的因子
# ============================================================================
# Factors to register. Each entry is a dict with a "name", a human-readable
# "desc" and a "dsl" expression string.
# NOTE(review): this span looks like a rendered diff with removed and added
# lines interleaved — the `FACTORS` header appears twice and the first entry
# carries two sets of "name"/"desc"/"dsl" keys plus a "category" key — confirm
# against the real file before relying on it.
FACTORS: List[Dict[str, Any]] = [
    # Example factor; modify or add entries as needed
FACTORS: List[Dict[str, Any]] =[
    # ==================== Category 1: chip concentration and dispersion factors ====================
    {
        "name": "turnover_volatility_ratio",
        "desc": "5日价格动量收盘价相对于5日前收盘价的涨跌幅进行截面排名",
        "dsl": "cs_rank(close / ts_delay(close, 5) - 1)",
        "category": "momentum",
        "name": "chip_dispersion_90",
        "desc": "90%筹码离散度衡量市场90%持仓筹码的宽度,值越小表示筹码越高度集中(单峰密集),往往是洗盘结束的前兆",
        "dsl": "(cost_95pct - cost_5pct) / (cost_95pct + cost_5pct)",
    },
    {
        "name": "chip_dispersion_70",
        "desc": "70%核心筹码离散度剔除极端的底部死筹和高位套牢盘反映中间70%主流资金的成本集中度",
        "dsl": "(cost_85pct - cost_15pct) / (cost_85pct + cost_15pct)",
    },
    {
        "name": "cost_skewness",
        "desc": "筹码偏度反映筹码分布的不对称性。大于1说明上方套牢盘拖尾严重小于1说明下方获利盘雄厚",
        "dsl": "(cost_95pct - cost_50pct) / (cost_50pct - cost_5pct)",
    },
    {
        "name": "dispersion_change_20",
        "desc": "筹码集中度近期变化率过去20天筹码宽度的变化比例持续下降说明主力正在暗中吸筹",
        "dsl": "ts_pct_change((cost_95pct - cost_5pct) / cost_50pct, 20)",
    },
    # ==================== Category 2: chip relative position and pressure/support factors ====================
    {
        "name": "price_to_avg_cost",
        "desc": "整体浮盈比例：当前价格相对加权平均成本的溢价率。高溢价有均值回归压力,负溢价代表超跌",
        "dsl": "(close - weight_avg) / weight_avg",
    },
    {
        "name": "price_to_median_cost",
        "desc": "中位数成本偏离度价格相对于50%分位点(绝对半数人持仓价)的偏离,向上突破通常是右侧买点",
        "dsl": "(close - cost_50pct) / cost_50pct",
    },
    {
        "name": "mean_median_dev",
        "desc": "均值中位数背离：均值显著大于中位数说明高位筹码堆积,上涨阻力大",
        "dsl": "(weight_avg - cost_50pct) / cost_50pct",
    },
    {
        "name": "trap_pressure",
        "desc": "高位套牢盘压力指数当前价格距离上方95%高位套牢成本的距离。距离越大,反弹的真空期阻力越小",
        "dsl": "(cost_95pct - close) / close",
    },
    {
        "name": "bottom_profit",
        "desc": "底部支撑底仓利润率当前价格距离底部5%筹码的利润空间。暴跌时大于0说明底仓极度稳定",
        "dsl": "(close - cost_5pct) / cost_5pct",
    },
    {
        "name": "history_position",
        "desc": "历史区间分位点：当前价格在个股上市以来历史最高点和最低点之间的相对位置",
        "dsl": "(close - his_low) / (his_high - his_low)",
    },
    # ==================== Category 3: winner-rate momentum and reversal factors ====================
    {
        "name": "winner_rate_surge_5",
        "desc": "获利盘短期爆发力胜率在过去5天内的变化值急剧上升是极强的动量做多信号",
        "dsl": "ts_delta(winner_rate, 5)",
    },
    {
        "name": "winner_rate_cs_rank",
        "desc": "获利盘高位反转信号：全市场胜率截面排名,极端高胜率往往面临多头踩踏的获利了结压力(反转因子)",
        "dsl": "cs_rank(winner_rate)",
    },
    {
        "name": "winner_rate_dev_20",
        "desc": "获利盘均线偏离当前胜率相对过去20天平均胜率的偏离程度捕捉筹码情绪的边际超买/超卖",
        "dsl": "winner_rate - ts_mean(winner_rate, 20)",
    },
    {
        "name": "winner_rate_volatility",
        "desc": "获利盘波动率过去20天胜率的波动率。波动率低且胜率高说明单边上涨极度稳健",
        "dsl": "ts_std(winner_rate, 20)",
    },
    {
        "name": "smart_money_accumulation",
        "desc": "潜在主力吸筹隐蔽指标胜率的60日时序分位数减去价格的时序分位数。值越大说明价平而获利盘增底部吸筹明显",
        "dsl": "ts_rank(winner_rate, 60) - ts_rank(close, 60)",
    },
    # ==================== Category 4: volume-price and chip interaction factors ====================
    {
        "name": "winner_vol_corr_20",
        "desc": "放量突破筹码密集区胜率与成交量的20日时序相关性正相关说明增量资金在主动解套上方筹码",
        "dsl": "ts_corr(winner_rate, vol, 20)",
    },
    {
        "name": "cost_base_momentum",
        "desc": "成本重心上移换手率过去20天加权平均成本的变化幅度快速上移说明高位换手极其充分",
        "dsl": "ts_pct_change(weight_avg, 20)",
    },
    {
        "name": "bottom_cost_stability",
        "desc": "底部坚如磐石因子底部5%成本的60天波动率相对于中位数的比值波动越小说明死筹越稳固",
        "dsl": "ts_std(cost_5pct, 60) / cost_50pct",
    },
    {
        "name": "pivot_reversion",
        "desc": "盈亏分界线乖离修复价格偏离50%分位点除以近20日价格标准差用于寻找超跌后的均值回归买点",
        "dsl": "(close - cost_50pct) / ts_std(close, 20)",
    },
    {
        "name": "chip_transition",
        "desc": "强弱筹码切换度上方厚度与下方厚度差值的20日变化量。由正变负说明筹码彻底完成了自上而下的转移洗盘结束",
        "dsl": "ts_delta((cost_85pct - cost_50pct) - (cost_50pct - cost_15pct), 20)",
    },
]
# Factor storage path (defaults to the experiment directory)
# NOTE(review): OUTPUT_PATH is assigned twice in this span and the later
# assignment wins; the first one looks like a removed diff line — confirm
# against the real file.
OUTPUT_PATH = Path(__file__).parent.parent / "experiment" / "data" / "factors.jsonl"
# Factor storage path (uses the resolved data path from the project settings)
settings = get_settings()
OUTPUT_PATH = settings.data_path_resolved / "factors.jsonl"
# ============================================================================