refactor(factorminer): 禁用 npz 信号缓存并将库 I/O 对接本地 DSL
- 为 Factor 数据类新增 metadata 字段，用于标记未实现算子(unsupported)
- save_library 废弃 save_signals 参数,内部强制忽略,仅持久化 JSON 元数据,不再写入 .npz
- load_library 删除 .npz 恢复逻辑;加载时自动将 # TODO 公式的 unsupported 标记设为 True
- import_from_paper() 直接基于已本地化的 PAPER_FACTORS 构建库,并同步标记 TODO 公式
- 新增 tests/test_factorminer_library_io.py,覆盖序列化、加载及 paper factors 导入
This commit is contained in:
@@ -39,6 +39,7 @@ class Factor:
|
||||
signals: Optional[np.ndarray] = field(default=None, repr=False) # (M, T)
|
||||
research_metrics: dict = field(default_factory=dict)
|
||||
provenance: dict = field(default_factory=dict)
|
||||
metadata: dict = field(default_factory=dict)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if not self.admission_date:
|
||||
@@ -59,6 +60,7 @@ class Factor:
|
||||
"admission_date": self.admission_date,
|
||||
"research_metrics": self.research_metrics,
|
||||
"provenance": self.provenance,
|
||||
"metadata": self.metadata,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -77,6 +79,7 @@ class Factor:
|
||||
admission_date=d.get("admission_date", ""),
|
||||
research_metrics=d.get("research_metrics", {}),
|
||||
provenance=d.get("provenance", {}),
|
||||
metadata=d.get("metadata", {}),
|
||||
)
|
||||
|
||||
|
||||
@@ -172,7 +175,7 @@ class FactorLibrary:
|
||||
# Pearson on ranks == Spearman
|
||||
ra_c = ra - ra.mean()
|
||||
rb_c = rb - rb.mean()
|
||||
denom = np.sqrt((ra_c ** 2).sum() * (rb_c ** 2).sum())
|
||||
denom = np.sqrt((ra_c**2).sum() * (rb_c**2).sum())
|
||||
if denom < 1e-12:
|
||||
continue
|
||||
corr_sum += abs((ra_c * rb_c).sum() / denom)
|
||||
@@ -206,9 +209,7 @@ class FactorLibrary:
|
||||
(admitted, reason) : Tuple[bool, str]
|
||||
"""
|
||||
if candidate_ic < self.ic_threshold:
|
||||
return False, (
|
||||
f"IC {candidate_ic:.4f} below threshold {self.ic_threshold}"
|
||||
)
|
||||
return False, (f"IC {candidate_ic:.4f} below threshold {self.ic_threshold}")
|
||||
|
||||
if self.size == 0:
|
||||
return True, "First factor in library"
|
||||
@@ -221,9 +222,7 @@ class FactorLibrary:
|
||||
f"{self.correlation_threshold} with existing library factor"
|
||||
)
|
||||
|
||||
return True, (
|
||||
f"Admitted: IC={candidate_ic:.4f}, max_corr={max_corr:.4f}"
|
||||
)
|
||||
return True, (f"Admitted: IC={candidate_ic:.4f}, max_corr={max_corr:.4f}")
|
||||
|
||||
def check_replacement(
|
||||
self,
|
||||
@@ -258,8 +257,10 @@ class FactorLibrary:
|
||||
(should_replace, factor_to_replace_id, reason) : Tuple[bool, Optional[int], str]
|
||||
"""
|
||||
if candidate_ic < ic_min:
|
||||
return False, None, (
|
||||
f"IC {candidate_ic:.4f} below replacement floor {ic_min}"
|
||||
return (
|
||||
False,
|
||||
None,
|
||||
(f"IC {candidate_ic:.4f} below replacement floor {ic_min}"),
|
||||
)
|
||||
|
||||
if self.size == 0:
|
||||
@@ -277,21 +278,33 @@ class FactorLibrary:
|
||||
correlated_factors.append((fid, corr, factor.ic_mean))
|
||||
|
||||
if len(correlated_factors) != 1:
|
||||
return False, None, (
|
||||
f"Found {len(correlated_factors)} correlated factors "
|
||||
f"(need exactly 1 for replacement)"
|
||||
return (
|
||||
False,
|
||||
None,
|
||||
(
|
||||
f"Found {len(correlated_factors)} correlated factors "
|
||||
f"(need exactly 1 for replacement)"
|
||||
),
|
||||
)
|
||||
|
||||
fid, corr, existing_ic = correlated_factors[0]
|
||||
if candidate_ic < ic_ratio * existing_ic:
|
||||
return False, None, (
|
||||
f"IC {candidate_ic:.4f} < {ic_ratio} * {existing_ic:.4f} = "
|
||||
f"{ic_ratio * existing_ic:.4f}"
|
||||
return (
|
||||
False,
|
||||
None,
|
||||
(
|
||||
f"IC {candidate_ic:.4f} < {ic_ratio} * {existing_ic:.4f} = "
|
||||
f"{ic_ratio * existing_ic:.4f}"
|
||||
),
|
||||
)
|
||||
|
||||
return True, fid, (
|
||||
f"Replace factor {fid}: candidate IC {candidate_ic:.4f} > "
|
||||
f"{ic_ratio} * {existing_ic:.4f}, corr={corr:.4f}"
|
||||
return (
|
||||
True,
|
||||
fid,
|
||||
(
|
||||
f"Replace factor {fid}: candidate IC {candidate_ic:.4f} > "
|
||||
f"{ic_ratio} * {existing_ic:.4f}, corr={corr:.4f}"
|
||||
),
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
@@ -321,8 +334,11 @@ class FactorLibrary:
|
||||
|
||||
logger.info(
|
||||
"Admitted factor %d '%s' (IC=%.4f, max_corr=%.4f, category=%s)",
|
||||
factor.id, factor.name, factor.ic_mean,
|
||||
factor.max_correlation, factor.category,
|
||||
factor.id,
|
||||
factor.name,
|
||||
factor.ic_mean,
|
||||
factor.max_correlation,
|
||||
factor.category,
|
||||
)
|
||||
return factor.id
|
||||
|
||||
@@ -360,7 +376,10 @@ class FactorLibrary:
|
||||
|
||||
logger.info(
|
||||
"Replaced factor %d with %d '%s' (IC=%.4f)",
|
||||
old_id, new_factor.id, new_factor.name, new_factor.ic_mean,
|
||||
old_id,
|
||||
new_factor.id,
|
||||
new_factor.name,
|
||||
new_factor.ic_mean,
|
||||
)
|
||||
|
||||
def remove_factor(self, factor_id: int) -> None:
|
||||
@@ -381,9 +400,7 @@ class FactorLibrary:
|
||||
# Correlation matrix management
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _max_correlation_with_library(
|
||||
self, candidate_signals: np.ndarray
|
||||
) -> float:
|
||||
def _max_correlation_with_library(self, candidate_signals: np.ndarray) -> float:
|
||||
"""Compute max |rho| between candidate and all library factors."""
|
||||
max_corr = 0.0
|
||||
for factor in self.factors.values():
|
||||
@@ -453,9 +470,7 @@ class FactorLibrary:
|
||||
self.correlation_matrix[idx, other_idx] = 0.0
|
||||
self.correlation_matrix[other_idx, idx] = 0.0
|
||||
continue
|
||||
corr = self._compute_correlation_vectorized(
|
||||
factor.signals, other.signals
|
||||
)
|
||||
corr = self._compute_correlation_vectorized(factor.signals, other.signals)
|
||||
self.correlation_matrix[idx, other_idx] = corr
|
||||
self.correlation_matrix[other_idx, idx] = corr
|
||||
|
||||
@@ -509,10 +524,7 @@ class FactorLibrary:
|
||||
|
||||
def get_factors_by_category(self, category: str) -> List[Factor]:
    """Collect every library factor whose ``category`` equals *category*.

    Factors are returned in the iteration order of ``self.factors.values()``;
    an empty list is returned when nothing matches.
    """
    matching = []
    for factor in self.factors.values():
        if factor.category == category:
            matching.append(factor)
    return matching
|
||||
|
||||
def get_diagnostics(self) -> dict:
|
||||
"""Library diagnostics: avg |rho|, max tail correlations, per-category counts, saturation.
|
||||
@@ -539,8 +551,7 @@ class FactorLibrary:
|
||||
|
||||
diag["category_counts"] = dict(cat_counts)
|
||||
diag["category_avg_ic"] = {
|
||||
cat: cat_ic_sums[cat] / cat_counts[cat]
|
||||
for cat in cat_counts
|
||||
cat: cat_ic_sums[cat] / cat_counts[cat] for cat in cat_counts
|
||||
}
|
||||
|
||||
# Correlation statistics
|
||||
@@ -575,9 +586,7 @@ class FactorLibrary:
|
||||
Returns a lightweight dictionary suitable for inclusion in LLM prompts
|
||||
or memory store entries.
|
||||
"""
|
||||
factors_sorted = sorted(
|
||||
self.factors.values(), key=lambda f: f.id, reverse=True
|
||||
)
|
||||
factors_sorted = sorted(self.factors.values(), key=lambda f: f.id, reverse=True)
|
||||
recent = factors_sorted[:5] # Last 5 admissions
|
||||
|
||||
categories = defaultdict(int)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
157
tests/test_factorminer_library_io.py
Normal file
157
tests/test_factorminer_library_io.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""Tests for library I/O and paper factor imports."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from src.factorminer.core.factor_library import Factor, FactorLibrary
|
||||
from src.factorminer.core.library_io import (
|
||||
import_from_paper,
|
||||
load_library,
|
||||
save_library,
|
||||
)
|
||||
|
||||
|
||||
class TestSaveLoadLibrary:
    """Serialization and deserialization tests for FactorLibrary."""

    def test_save_library_ignores_save_signals(self, tmp_path: Path) -> None:
        """Even with save_signals=True, no .npz file may be produced."""
        library = FactorLibrary()
        factor = Factor(
            id=0,
            name="test_factor",
            formula="close / ts_delay(close, 1) - 1",
            category="Momentum",
            ic_mean=0.05,
            icir=0.5,
            ic_win_rate=0.55,
            max_correlation=0.1,
            batch_number=1,
        )
        # Attach a signal matrix on purpose: it must still not be persisted.
        factor.signals = np.ones((10, 20))
        library.admit_factor(factor)

        base_path = tmp_path / "test_lib"
        save_library(library, str(base_path), save_signals=True)

        assert (base_path.with_suffix(".json")).exists()
        assert not (Path(str(base_path) + "_signals.npz")).exists()
        # The name checked above is only a guess at the old cache filename;
        # also make sure no .npz was written under any other name.
        assert not list(tmp_path.rglob("*.npz"))

    def test_load_library_restores_metadata_and_unsupported(
        self, tmp_path: Path
    ) -> None:
        """Loading the JSON restores metadata and flags '# TODO' formulas."""
        library = FactorLibrary()
        # NOTE(review): both factors are built with id=0 and later looked up
        # as 1 and 2, so this test presumably relies on the library assigning
        # ids on admission -- confirm against FactorLibrary.admit_factor.
        f1 = Factor(
            id=0,
            name="ok_factor",
            formula="cs_rank(close)",
            category="Test",
            ic_mean=0.0,
            icir=0.0,
            ic_win_rate=0.0,
            max_correlation=0.0,
            batch_number=0,
            metadata={"author": "ai"},
        )
        f2 = Factor(
            id=0,
            name="todo_factor",
            formula="# TODO: Neg(CsRank(Decay(close, 10)))",
            category="Test",
            ic_mean=0.0,
            icir=0.0,
            ic_win_rate=0.0,
            max_correlation=0.0,
            batch_number=0,
        )
        library.admit_factor(f1)
        library.admit_factor(f2)

        base_path = tmp_path / "meta_lib"
        save_library(library, str(base_path))

        loaded = load_library(str(base_path))
        assert loaded.size == 2

        # A regular formula keeps its metadata and is not flagged.
        f1_loaded = loaded.get_factor(1)
        assert f1_loaded.metadata.get("author") == "ai"
        assert not f1_loaded.metadata.get("unsupported", False)

        # A '# TODO' formula must come back flagged as unsupported.
        f2_loaded = loaded.get_factor(2)
        assert f2_loaded.metadata.get("unsupported") is True

    def test_factor_round_trip_with_metadata(self) -> None:
        """Factor.to_dict / Factor.from_dict must carry metadata through."""
        factor = Factor(
            id=1,
            name="round_trip",
            formula="ts_mean(close, 20)",
            category="Momentum",
            ic_mean=0.1,
            icir=1.0,
            ic_win_rate=0.6,
            max_correlation=0.2,
            batch_number=2,
            metadata={"unsupported": True, "tags": ["test"]},
        )
        d = factor.to_dict()
        restored = Factor.from_dict(d)
        assert restored.metadata == factor.metadata
|
||||
|
||||
|
||||
class TestImportFromPaper:
    """Tests for importing factors from the built-in paper catalog."""

    def test_import_from_paper_includes_all_translated_factors(self) -> None:
        """The built-in PAPER_FACTORS should all import successfully."""
        library = import_from_paper()
        assert library.size > 0
        # Every factor currently in the catalog should have been admitted.
        for factor in library.list_factors():
            assert factor.id > 0
            assert factor.name
            assert factor.formula
            assert factor.category

    def test_import_from_paper_marks_todo_as_unsupported(self, tmp_path: Path) -> None:
        """A '# TODO' formula must be flagged unsupported in its metadata."""
        custom_path = tmp_path / "custom_factors.json"
        custom_data = [
            {
                "name": "Normal Factor",
                "formula": "cs_rank(close)",
                "category": "Test",
            },
            {
                "name": "Unsupported Factor",
                "formula": "# TODO: Neg(CsRank(Decay(close, 10)))",
                "category": "Test",
            },
        ]
        custom_path.write_text(json.dumps(custom_data), encoding="utf-8")

        library = import_from_paper(str(custom_path))
        assert library.size == 2

        # Look factors up by name so the test does not depend on the order
        # in which list_factors() happens to return them.
        by_name = {factor.name: factor for factor in library.list_factors()}
        normal = by_name["Normal Factor"]
        todo = by_name["Unsupported Factor"]

        # Same contract as the load_library test: a regular formula is simply
        # "not flagged" (absent or falsy); a TODO formula is exactly True.
        assert not normal.metadata.get("unsupported", False)
        assert todo.metadata.get("unsupported") is True

    def test_import_from_paper_path_override(self, tmp_path: Path) -> None:
        """The path argument loads an external JSON list of factors."""
        custom_path = tmp_path / "override.json"
        custom_data = [
            {"name": "custom_1", "formula": "open + close", "category": "Custom"},
        ]
        custom_path.write_text(json.dumps(custom_data), encoding="utf-8")

        library = import_from_paper(str(custom_path))
        assert library.size == 1
        assert library.list_factors()[0].name == "custom_1"
|
||||
Reference in New Issue
Block a user