feat(experiment): 添加模型保存功能及因子信息持久化

- 新增 SAVE_MODEL 配置控制是否保存模型
- 新增 get_model_save_path() 生成模型保存路径
- 新增 save_model_with_factors() 保存模型及关联因子信息
- 新增 load_model_factors() 加载因子信息用于模型复现
- 更新训练脚本使用新的模型保存方式
- 清理 data/sync.py 中的废弃代码
This commit is contained in:
2026-03-16 22:50:47 +08:00
parent 5ed06d20d2
commit 16f82d3458
5 changed files with 163 additions and 119 deletions

View File

@@ -5,7 +5,7 @@
"""
from datetime import datetime
from typing import List
from typing import List, Optional
import polars as pl
@@ -255,8 +255,7 @@ SELECTED_FACTORS = [
]
# 因子定义字典（完整因子库），用于存放尚未注册到 metadata 的因子
FACTOR_DEFINITIONS = {
}
FACTOR_DEFINITIONS = {}
def get_label_factor(label_name: str) -> dict:
@@ -417,7 +416,10 @@ STOCK_FILTER_REQUIRED_COLUMNS = ["total_mv"]
# =============================================================================
OUTPUT_DIR = "output"
SAVE_PREDICTIONS = True
PERSIST_MODEL = False
# 模型保存配置
SAVE_MODEL = True # 是否保存模型
MODEL_SAVE_DIR = "models" # 模型保存目录
# Top N 配置:每日推荐股票数量
TOP_N = 5 # 可调整为 10, 20 等
@@ -446,3 +448,101 @@ def get_output_path(model_type: str, test_start: str, test_end: str) -> str:
filename = f"{model_type}_output.csv"
return os.path.join(OUTPUT_DIR, filename)
def get_model_save_path(
    model_type: str, model_name: Optional[str] = None
) -> Optional[str]:
    """Build the file path under which a trained model should be stored.

    When saving is enabled, the directory ``MODEL_SAVE_DIR`` is created on
    demand, so calling this function has a filesystem side effect.

    Args:
        model_type: Model type identifier; used as the file stem when no
            explicit ``model_name`` is given.
        model_name: Optional file stem. Any falsy value (``None`` or an
            empty string) falls back to ``model_type``.

    Returns:
        ``MODEL_SAVE_DIR/<stem>.pkl``, or ``None`` when ``SAVE_MODEL`` is
        falsy.
    """
    import os

    if SAVE_MODEL:
        # Make sure the target directory exists before handing out a path.
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
        stem = model_name or model_type
        return os.path.join(MODEL_SAVE_DIR, f"{stem}.pkl")

    return None
def save_model_with_factors(
    model,
    model_path: str,
    selected_factors: List[str],
    factor_definitions: dict,
) -> None:
    """Save a model together with the factor set it was trained on.

    Two files are written: the model itself (via ``model.save``) and a JSON
    sidecar recording ``selected_factors`` and ``factor_definitions`` plus
    their counts, so that a later load can tell which factors were used.

    The sidecar path is derived from ``model_path``: a trailing ``.pkl`` is
    swapped for ``.factors.json``; for any other file name ``.factors.json``
    is appended. Only the suffix is touched, so the sidecar can never share
    a path with the model file and ``.pkl`` occurring elsewhere in the path
    (e.g. in a directory name) is left alone.

    Args:
        model: Trained model object exposing a ``save(path)`` method.
        model_path: Destination path of the model file.
        selected_factors: Names of the factors selected from metadata.
        factor_definitions: Dict of factors defined through expressions;
            it is passed to ``json.dump`` and must be JSON-serializable.
    """
    import json

    # 1. Persist the model itself.
    model.save(model_path)
    print(f"[模型保存] 模型已保存至: {model_path}")

    # 2. Persist the factor information next to the model.
    # A plain str.replace(".pkl", ...) would leave a path without ".pkl"
    # unchanged, and the JSON dump would then overwrite the model file.
    model_suffix = ".pkl"
    if model_path.endswith(model_suffix):
        stem = model_path[: -len(model_suffix)]
    else:
        stem = model_path
    factors_path = stem + ".factors.json"

    factors_info = {
        "selected_factors": selected_factors,
        "factor_definitions": factor_definitions,
        "total_feature_count": len(selected_factors) + len(factor_definitions),
        "selected_factors_count": len(selected_factors),
        "factor_definitions_count": len(factor_definitions),
    }

    with open(factors_path, "w", encoding="utf-8") as f:
        json.dump(factors_info, f, ensure_ascii=False, indent=2)

    print(f"[模型保存] 因子信息已保存至: {factors_path}")
    print(f"[模型保存] 总计 {factors_info['total_feature_count']} 个因子")
    print(f" - 来自 metadata: {factors_info['selected_factors_count']}")
    print(f" - 来自表达式定义: {factors_info['factor_definitions_count']}")
def load_model_factors(model_path: str) -> Optional[dict]:
    """Load the factor information stored alongside a saved model.

    The sidecar path is derived from ``model_path``: a trailing ``.pkl`` is
    swapped for ``.factors.json``; for any other file name ``.factors.json``
    is appended. Only the suffix is touched, so the model file itself is
    never mistaken for the sidecar.

    Args:
        model_path: Path of the saved model file.

    Returns:
        The parsed content of the sidecar JSON file, or ``None`` (after
        printing a warning) when that file does not exist.
    """
    import json

    # A plain str.replace(".pkl", ...) would leave a path without ".pkl"
    # unchanged and make this function try to parse the model file as JSON.
    model_suffix = ".pkl"
    if model_path.endswith(model_suffix):
        stem = model_path[: -len(model_suffix)]
    else:
        stem = model_path
    factors_path = stem + ".factors.json"

    # Open directly and handle the miss, instead of checking existence
    # first: avoids a race between the check and the open.
    try:
        with open(factors_path, "r", encoding="utf-8") as f:
            factors_info = json.load(f)
    except FileNotFoundError:
        print(f"[警告] 未找到因子信息文件: {factors_path}")
        return None

    print(
        f"[模型加载] 已加载因子信息,总计 {factors_info.get('total_feature_count', 'N/A')} 个因子"
    )
    return factors_info