refactor(factorminer): 禁用 npz 信号缓存并将库 I/O 对接本地 DSL

- 为 Factor 数据类新增 metadata 字段,用于标记未实现算子(unsupported)
- save_library 的 save_signals 参数已废弃:参数仍保留,但传入值会被强制忽略;仅持久化 JSON 元数据,不再写入 .npz 文件
- load_library 删除从 .npz 恢复信号的逻辑;加载时自动将带有 # TODO 标注的公式在 metadata 中的 unsupported 标记设为 True
- import_from_paper() 直接基于已本地化的 PAPER_FACTORS 构建库,并同步标记 TODO 公式
- 新增 tests/test_factorminer_library_io.py,覆盖序列化、加载及 paper factors 导入
This commit is contained in:
2026-04-08 22:10:17 +08:00
parent d71f723602
commit 65500cce27
3 changed files with 617 additions and 467 deletions

View File

@@ -39,6 +39,7 @@ class Factor:
signals: Optional[np.ndarray] = field(default=None, repr=False) # (M, T)
research_metrics: dict = field(default_factory=dict)
provenance: dict = field(default_factory=dict)
metadata: dict = field(default_factory=dict)
def __post_init__(self) -> None:
if not self.admission_date:
@@ -59,6 +60,7 @@ class Factor:
"admission_date": self.admission_date,
"research_metrics": self.research_metrics,
"provenance": self.provenance,
"metadata": self.metadata,
}
@classmethod
@@ -77,6 +79,7 @@ class Factor:
admission_date=d.get("admission_date", ""),
research_metrics=d.get("research_metrics", {}),
provenance=d.get("provenance", {}),
metadata=d.get("metadata", {}),
)
@@ -172,7 +175,7 @@ class FactorLibrary:
# Pearson on ranks == Spearman
ra_c = ra - ra.mean()
rb_c = rb - rb.mean()
denom = np.sqrt((ra_c ** 2).sum() * (rb_c ** 2).sum())
denom = np.sqrt((ra_c**2).sum() * (rb_c**2).sum())
if denom < 1e-12:
continue
corr_sum += abs((ra_c * rb_c).sum() / denom)
@@ -206,9 +209,7 @@ class FactorLibrary:
(admitted, reason) : Tuple[bool, str]
"""
if candidate_ic < self.ic_threshold:
return False, (
f"IC {candidate_ic:.4f} below threshold {self.ic_threshold}"
)
return False, (f"IC {candidate_ic:.4f} below threshold {self.ic_threshold}")
if self.size == 0:
return True, "First factor in library"
@@ -221,9 +222,7 @@ class FactorLibrary:
f"{self.correlation_threshold} with existing library factor"
)
return True, (
f"Admitted: IC={candidate_ic:.4f}, max_corr={max_corr:.4f}"
)
return True, (f"Admitted: IC={candidate_ic:.4f}, max_corr={max_corr:.4f}")
def check_replacement(
self,
@@ -258,8 +257,10 @@ class FactorLibrary:
(should_replace, factor_to_replace_id, reason) : Tuple[bool, Optional[int], str]
"""
if candidate_ic < ic_min:
return False, None, (
f"IC {candidate_ic:.4f} below replacement floor {ic_min}"
return (
False,
None,
(f"IC {candidate_ic:.4f} below replacement floor {ic_min}"),
)
if self.size == 0:
@@ -277,21 +278,33 @@ class FactorLibrary:
correlated_factors.append((fid, corr, factor.ic_mean))
if len(correlated_factors) != 1:
return False, None, (
f"Found {len(correlated_factors)} correlated factors "
f"(need exactly 1 for replacement)"
return (
False,
None,
(
f"Found {len(correlated_factors)} correlated factors "
f"(need exactly 1 for replacement)"
),
)
fid, corr, existing_ic = correlated_factors[0]
if candidate_ic < ic_ratio * existing_ic:
return False, None, (
f"IC {candidate_ic:.4f} < {ic_ratio} * {existing_ic:.4f} = "
f"{ic_ratio * existing_ic:.4f}"
return (
False,
None,
(
f"IC {candidate_ic:.4f} < {ic_ratio} * {existing_ic:.4f} = "
f"{ic_ratio * existing_ic:.4f}"
),
)
return True, fid, (
f"Replace factor {fid}: candidate IC {candidate_ic:.4f} > "
f"{ic_ratio} * {existing_ic:.4f}, corr={corr:.4f}"
return (
True,
fid,
(
f"Replace factor {fid}: candidate IC {candidate_ic:.4f} > "
f"{ic_ratio} * {existing_ic:.4f}, corr={corr:.4f}"
),
)
# ------------------------------------------------------------------
@@ -321,8 +334,11 @@ class FactorLibrary:
logger.info(
"Admitted factor %d '%s' (IC=%.4f, max_corr=%.4f, category=%s)",
factor.id, factor.name, factor.ic_mean,
factor.max_correlation, factor.category,
factor.id,
factor.name,
factor.ic_mean,
factor.max_correlation,
factor.category,
)
return factor.id
@@ -360,7 +376,10 @@ class FactorLibrary:
logger.info(
"Replaced factor %d with %d '%s' (IC=%.4f)",
old_id, new_factor.id, new_factor.name, new_factor.ic_mean,
old_id,
new_factor.id,
new_factor.name,
new_factor.ic_mean,
)
def remove_factor(self, factor_id: int) -> None:
@@ -381,9 +400,7 @@ class FactorLibrary:
# Correlation matrix management
# ------------------------------------------------------------------
def _max_correlation_with_library(
self, candidate_signals: np.ndarray
) -> float:
def _max_correlation_with_library(self, candidate_signals: np.ndarray) -> float:
"""Compute max |rho| between candidate and all library factors."""
max_corr = 0.0
for factor in self.factors.values():
@@ -453,9 +470,7 @@ class FactorLibrary:
self.correlation_matrix[idx, other_idx] = 0.0
self.correlation_matrix[other_idx, idx] = 0.0
continue
corr = self._compute_correlation_vectorized(
factor.signals, other.signals
)
corr = self._compute_correlation_vectorized(factor.signals, other.signals)
self.correlation_matrix[idx, other_idx] = corr
self.correlation_matrix[other_idx, idx] = corr
@@ -509,10 +524,7 @@ class FactorLibrary:
def get_factors_by_category(self, category: str) -> List[Factor]:
"""Return all factors matching a given category."""
return [
f for f in self.factors.values()
if f.category == category
]
return [f for f in self.factors.values() if f.category == category]
def get_diagnostics(self) -> dict:
"""Library diagnostics: avg |rho|, max tail correlations, per-category counts, saturation.
@@ -539,8 +551,7 @@ class FactorLibrary:
diag["category_counts"] = dict(cat_counts)
diag["category_avg_ic"] = {
cat: cat_ic_sums[cat] / cat_counts[cat]
for cat in cat_counts
cat: cat_ic_sums[cat] / cat_counts[cat] for cat in cat_counts
}
# Correlation statistics
@@ -575,9 +586,7 @@ class FactorLibrary:
Returns a lightweight dictionary suitable for inclusion in LLM prompts
or memory store entries.
"""
factors_sorted = sorted(
self.factors.values(), key=lambda f: f.id, reverse=True
)
factors_sorted = sorted(self.factors.values(), key=lambda f: f.id, reverse=True)
recent = factors_sorted[:5] # Last 5 admissions
categories = defaultdict(int)

File diff suppressed because it is too large Load Diff