From e5e636d6cdfde0845f1999d6a3c6e5f53228a11b Mon Sep 17 00:00:00 2001
From: liaozhaorun <1300336796@qq.com>
Date: Tue, 7 Apr 2026 22:49:33 +0800
Subject: [PATCH] =?UTF-8?q?refactor(factorminer):=20=E7=BB=9F=E4=B8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9D=97=E5=BC=95=E7=94=A8=E8=B7=AF=E5=BE=84=E5=B9=B6?=
 =?UTF-8?q?=E7=A7=BB=E9=99=A4=E7=8B=AC=E7=AB=8B=E5=8C=85=E9=85=8D=E7=BD=AE?=
 =?UTF-8?q?=20-=20=E5=88=A0=E9=99=A4=E6=97=A0=E7=94=A8=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=20-=20=E6=96=B0=E5=A2=9E=E6=9C=AC=E5=9C=B0=E6=A1=86=E6=9E=B6?=
 =?UTF-8?q?=E6=95=B4=E5=90=88=E5=AE=9E=E6=96=BD=E8=AE=A1=E5=88=92=E6=96=87?=
 =?UTF-8?q?=E6=A1=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...026-04-07-factorminer-local-integration.md |  363 +++
 src/factorminer/factorminer/__init__.py       |   42 -
 src/factorminer/factorminer/agent/__init__.py |   78 -
 src/factorminer/factorminer/agent/critic.py   |  837 -------
 src/factorminer/factorminer/agent/debate.py   |  949 -------
 .../factorminer/agent/factor_generator.py     |  236 --
 .../factorminer/agent/llm_interface.py        |  365 ---
 .../factorminer/agent/output_parser.py        |  259 --
 .../factorminer/agent/prompt_builder.py       |  682 ------
 .../factorminer/agent/specialists.py          |  596 -----
 .../factorminer/benchmark/__init__.py         |   73 -
 .../factorminer/benchmark/ablation.py         |  798 ------
 .../factorminer/benchmark/catalogs.py         |  236 --
 .../factorminer/benchmark/helix_benchmark.py  | 2172 -----------------
 .../factorminer/benchmark/runtime.py          | 1498 ------------
 src/factorminer/factorminer/cli.py            | 1566 ------------
 .../factorminer/configs/__init__.py           |   25 -
 .../factorminer/configs/benchmark_full.yaml   |   45 -
 .../factorminer/configs/default.yaml          |  307 ---
 .../factorminer/configs/demo_local.yaml       |   57 -
 .../factorminer/configs/helix_research.yaml   |   82 -
 .../factorminer/configs/paper_repro.yaml      |   36 -
 src/factorminer/factorminer/core/__init__.py  |   67 -
 .../factorminer/core/canonicalizer.py         |  206 --
 src/factorminer/factorminer/core/config.py    |   61 -
 .../factorminer/core/expression_tree.py       |  736 ------
 .../factorminer/core/factor_library.py        |  602 -----
 .../factorminer/core/helix_loop.py            | 1576 ------------
 .../factorminer/core/library_io.py            |  921 -------
 src/factorminer/factorminer/core/parser.py    |  374 ---
 .../factorminer/core/provenance.py            |  241 --
 .../factorminer/core/ralph_loop.py            | 1598 ------------
 src/factorminer/factorminer/core/session.py   |  187 --
 src/factorminer/factorminer/core/types.py     |  269 --
 src/factorminer/factorminer/data/__init__.py  |   75 -
 src/factorminer/factorminer/data/loader.py    |  244 --
 src/factorminer/factorminer/data/mock_data.py |  323 ---
 .../factorminer/data/preprocessor.py          |  364 ---
 .../factorminer/data/tensor_builder.py        |  505 ----
 .../factorminer/evaluation/__init__.py        |  169 --
 .../factorminer/evaluation/admission.py       |  221 --
 .../factorminer/evaluation/backtest.py        |  397 ---
 .../factorminer/evaluation/capacity.py        |  449 ----
 .../factorminer/evaluation/causal.py          |  580 -----
 .../factorminer/evaluation/combination.py     |  195 --
 .../factorminer/evaluation/correlation.py     |  374 ---
 .../factorminer/evaluation/metrics.py         |  377 ---
 .../factorminer/evaluation/pipeline.py        |  736 ------
 .../factorminer/evaluation/portfolio.py       |  266 --
 .../factorminer/evaluation/regime.py          |  623 -----
 .../factorminer/evaluation/research.py        |  518 ----
 .../factorminer/evaluation/runtime.py         |  480 ----
 .../factorminer/evaluation/selection.py       |  280 ---
 .../factorminer/evaluation/significance.py    |  495 ----
 .../evaluation/transaction_costs.py           |  539 ----
 .../factorminer/memory/__init__.py            |   84 -
 .../factorminer/memory/embeddings.py          |  392 ---
 .../factorminer/memory/evolution.py           |  482 ----
 .../factorminer/memory/experience_memory.py   |  594 -----
 .../factorminer/memory/formation.py           |  446 ----
 .../factorminer/memory/kg_retrieval.py        |  336 ---
 .../factorminer/memory/knowledge_graph.py     |  418 ----
 .../factorminer/memory/memory_store.py        |  165 --
 .../memory/online_regime_memory.py            | 1625 ------------
 .../factorminer/memory/retrieval.py           |  288 ---
 .../factorminer/operators/__init__.py         |   54 -
 .../factorminer/operators/arithmetic.py       |  223 --
 .../factorminer/operators/auto_inventor.py    |  547 -----
 .../factorminer/operators/crosssectional.py   |  151 --
 .../factorminer/operators/custom.py           |  251 --
 .../factorminer/operators/gpu_backend.py      |  110 -
 .../factorminer/operators/logical.py          |  185 --
 .../factorminer/operators/neuro_symbolic.py   | 1614 ------------
 .../factorminer/operators/registry.py         |  142 --
 .../factorminer/operators/regression.py       |  167 --
 .../factorminer/operators/smoothing.py        |  173 --
 .../factorminer/operators/statistical.py      |  452 ----
 .../factorminer/operators/timeseries.py       |  395 ---
 src/factorminer/factorminer/tests/__init__.py |    1 -
 src/factorminer/factorminer/tests/conftest.py |  163 --
 .../factorminer/tests/test_auto_inventor.py   |  130 -
 .../factorminer/tests/test_benchmark.py       |  484 ----
 .../factorminer/tests/test_canonicalizer.py   |   79 -
 .../factorminer/tests/test_capacity.py        |  118 -
 .../factorminer/tests/test_causal.py          |  147 --
 .../factorminer/tests/test_cli_analysis.py    |  312 ---
 .../factorminer/tests/test_cli_helix.py       |  142 --
 .../factorminer/tests/test_combination.py     |  531 ----
 .../factorminer/tests/test_data.py            |  258 --
 .../factorminer/tests/test_debate.py          |  229 --
 .../factorminer/tests/test_evaluation.py      |  287 ---
 .../factorminer/tests/test_expression_tree.py |  307 ---
 .../factorminer/tests/test_helix_loop.py      |  251 --
 .../factorminer/tests/test_knowledge_graph.py |  166 --
 .../factorminer/tests/test_library.py         |  356 ---
 .../factorminer/tests/test_memory.py          |  405 ---
 .../factorminer/tests/test_operators.py       |  500 ----
 .../factorminer/tests/test_provenance.py      |  131 -
 .../factorminer/tests/test_ralph_loop.py      | 1076 --------
 .../factorminer/tests/test_regime.py          |  118 -
 .../factorminer/tests/test_research.py        |  237 --
 .../tests/test_runtime_analysis.py            |  196 --
 .../factorminer/tests/test_significance.py    |  174 --
 src/factorminer/factorminer/utils/__init__.py |   26 -
 src/factorminer/factorminer/utils/config.py   |  741 ------
 src/factorminer/factorminer/utils/logging.py  |  297 ---
 .../factorminer/utils/reporting.py            |  499 ----
 .../factorminer/utils/tearsheet.py            |  399 ---
 .../factorminer/utils/visualization.py        |  564 -----
 109 files changed, 363 insertions(+), 44605 deletions(-)
 create mode 100644 docs/plans/2026-04-07-factorminer-local-integration.md
 delete mode 100644 src/factorminer/factorminer/__init__.py
 delete mode 100644 src/factorminer/factorminer/agent/__init__.py
 delete mode 100644 src/factorminer/factorminer/agent/critic.py
 delete mode 100644 src/factorminer/factorminer/agent/debate.py
 delete mode 100644 src/factorminer/factorminer/agent/factor_generator.py
 delete mode 100644 src/factorminer/factorminer/agent/llm_interface.py
 delete mode 100644 src/factorminer/factorminer/agent/output_parser.py
 delete mode 100644 src/factorminer/factorminer/agent/prompt_builder.py
 delete mode 100644 src/factorminer/factorminer/agent/specialists.py
 delete mode 100644 src/factorminer/factorminer/benchmark/__init__.py
 delete mode 100644 src/factorminer/factorminer/benchmark/ablation.py
 delete mode 100644 src/factorminer/factorminer/benchmark/catalogs.py
 delete mode 100644 src/factorminer/factorminer/benchmark/helix_benchmark.py
 delete mode 100644 src/factorminer/factorminer/benchmark/runtime.py
 delete mode 100644 src/factorminer/factorminer/cli.py
 delete mode 100644 src/factorminer/factorminer/configs/__init__.py
 delete mode 100644 src/factorminer/factorminer/configs/benchmark_full.yaml
 delete mode 100644 src/factorminer/factorminer/configs/default.yaml
 delete mode 100644 src/factorminer/factorminer/configs/demo_local.yaml
 delete mode 100644 src/factorminer/factorminer/configs/helix_research.yaml
 delete mode 100644 src/factorminer/factorminer/configs/paper_repro.yaml
 delete mode 100644 src/factorminer/factorminer/core/__init__.py
 delete mode 100644 src/factorminer/factorminer/core/canonicalizer.py
 delete mode 100644 src/factorminer/factorminer/core/config.py
 delete mode 100644 src/factorminer/factorminer/core/expression_tree.py
 delete mode 100644 src/factorminer/factorminer/core/factor_library.py
 delete mode 100644 src/factorminer/factorminer/core/helix_loop.py
 delete mode 100644 src/factorminer/factorminer/core/library_io.py
 delete mode 100644 src/factorminer/factorminer/core/parser.py
 delete mode 100644 src/factorminer/factorminer/core/provenance.py
 delete mode 100644 src/factorminer/factorminer/core/ralph_loop.py
 delete mode 100644 src/factorminer/factorminer/core/session.py
 delete mode 100644 src/factorminer/factorminer/core/types.py
 delete mode 100644 src/factorminer/factorminer/data/__init__.py
 delete mode 100644 src/factorminer/factorminer/data/loader.py
 delete mode 100644 src/factorminer/factorminer/data/mock_data.py
 delete mode 100644 src/factorminer/factorminer/data/preprocessor.py
 delete mode 100644 src/factorminer/factorminer/data/tensor_builder.py
 delete mode 100644 src/factorminer/factorminer/evaluation/__init__.py
 delete mode 100644 src/factorminer/factorminer/evaluation/admission.py
 delete mode 100644 src/factorminer/factorminer/evaluation/backtest.py
 delete mode 100644 src/factorminer/factorminer/evaluation/capacity.py
 delete mode 100644 src/factorminer/factorminer/evaluation/causal.py
 delete mode 100644 src/factorminer/factorminer/evaluation/combination.py
 delete mode 100644 src/factorminer/factorminer/evaluation/correlation.py
 delete mode 100644 src/factorminer/factorminer/evaluation/metrics.py
 delete mode 100644 src/factorminer/factorminer/evaluation/pipeline.py
 delete mode 100644 src/factorminer/factorminer/evaluation/portfolio.py
 delete mode 100644 src/factorminer/factorminer/evaluation/regime.py
 delete mode 100644 src/factorminer/factorminer/evaluation/research.py
 delete mode 100644 src/factorminer/factorminer/evaluation/runtime.py
 delete mode 100644 src/factorminer/factorminer/evaluation/selection.py
 delete mode 100644 src/factorminer/factorminer/evaluation/significance.py
 delete mode 100644 src/factorminer/factorminer/evaluation/transaction_costs.py
 delete mode 100644 src/factorminer/factorminer/memory/__init__.py
 delete mode 100644 src/factorminer/factorminer/memory/embeddings.py
 delete mode 100644 src/factorminer/factorminer/memory/evolution.py
 delete mode 100644 src/factorminer/factorminer/memory/experience_memory.py
 delete mode 100644 src/factorminer/factorminer/memory/formation.py
 delete mode 100644 src/factorminer/factorminer/memory/kg_retrieval.py
 delete mode 100644 src/factorminer/factorminer/memory/knowledge_graph.py
 delete mode 100644 src/factorminer/factorminer/memory/memory_store.py
 delete mode 100644 src/factorminer/factorminer/memory/online_regime_memory.py
 delete mode 100644 src/factorminer/factorminer/memory/retrieval.py
 delete mode 100644 src/factorminer/factorminer/operators/__init__.py
 delete mode 100644 src/factorminer/factorminer/operators/arithmetic.py
 delete mode 100644 src/factorminer/factorminer/operators/auto_inventor.py
 delete mode 100644 src/factorminer/factorminer/operators/crosssectional.py
 delete mode 100644 src/factorminer/factorminer/operators/custom.py
 delete mode 100644 src/factorminer/factorminer/operators/gpu_backend.py
 delete mode 100644 src/factorminer/factorminer/operators/logical.py
 delete mode 100644 src/factorminer/factorminer/operators/neuro_symbolic.py
 delete mode 100644 src/factorminer/factorminer/operators/registry.py
 delete mode 100644 src/factorminer/factorminer/operators/regression.py
 delete mode 100644 src/factorminer/factorminer/operators/smoothing.py
 delete mode 100644 src/factorminer/factorminer/operators/statistical.py
 delete mode 100644 src/factorminer/factorminer/operators/timeseries.py
 delete mode 100644 src/factorminer/factorminer/tests/__init__.py
 delete mode 100644 src/factorminer/factorminer/tests/conftest.py
 delete mode 100644 src/factorminer/factorminer/tests/test_auto_inventor.py
 delete mode 100644 src/factorminer/factorminer/tests/test_benchmark.py
 delete mode 100644 src/factorminer/factorminer/tests/test_canonicalizer.py
 delete mode 100644 src/factorminer/factorminer/tests/test_capacity.py
 delete mode 100644 src/factorminer/factorminer/tests/test_causal.py
 delete mode 100644 src/factorminer/factorminer/tests/test_cli_analysis.py
 delete mode 100644 src/factorminer/factorminer/tests/test_cli_helix.py
 delete mode 100644 src/factorminer/factorminer/tests/test_combination.py
 delete mode 100644 src/factorminer/factorminer/tests/test_data.py
 delete mode 100644 src/factorminer/factorminer/tests/test_debate.py
 delete mode 100644 src/factorminer/factorminer/tests/test_evaluation.py
 delete mode 100644 src/factorminer/factorminer/tests/test_expression_tree.py
 delete mode 100644 src/factorminer/factorminer/tests/test_helix_loop.py
 delete mode 100644 src/factorminer/factorminer/tests/test_knowledge_graph.py
 delete mode 100644 src/factorminer/factorminer/tests/test_library.py
 delete mode 100644 src/factorminer/factorminer/tests/test_memory.py
 delete mode 100644 src/factorminer/factorminer/tests/test_operators.py
 delete mode 100644 src/factorminer/factorminer/tests/test_provenance.py
 delete mode 100644 src/factorminer/factorminer/tests/test_ralph_loop.py
 delete mode 100644 src/factorminer/factorminer/tests/test_regime.py
 delete mode 100644 src/factorminer/factorminer/tests/test_research.py
 delete mode 100644 src/factorminer/factorminer/tests/test_runtime_analysis.py
 delete mode 100644 src/factorminer/factorminer/tests/test_significance.py
 delete mode 100644 src/factorminer/factorminer/utils/__init__.py
 delete mode 100644 src/factorminer/factorminer/utils/config.py
 delete mode 100644 src/factorminer/factorminer/utils/logging.py
 delete mode 100644 src/factorminer/factorminer/utils/reporting.py
 delete mode 100644 src/factorminer/factorminer/utils/tearsheet.py
 delete mode 100644 src/factorminer/factorminer/utils/visualization.py

diff --git a/docs/plans/2026-04-07-factorminer-local-integration.md b/docs/plans/2026-04-07-factorminer-local-integration.md
new file mode 100644
index 0000000..574d1c2
--- /dev/null
+++ b/docs/plans/2026-04-07-factorminer-local-integration.md
@@ -0,0 +1,363 @@
+# FactorMiner 本地框架整合实施计划
+
+> 目标：将 `src/factorminer` 完全整合进 ProStock 项目，数据加载、因子计算全部使用本地框架，仅在因子生成、落库、指标分析时保留 FactorMiner 代码。
+
+---
+
+## 代码风格与本地框架融合规范（全局约束）
+
+所有新增/修改代码必须遵循 ProStock 代码风格，严禁出现 FactorMiner 原生的松散风格或外部项目风格。
+
+1. **命名规范**
+   - 函数/方法/变量：`snake_case`
+   - 类名：`PascalCase`
+   - 常量：`UPPER_CASE`
+   - 私有方法/属性：`_leading_underscore`
+
+2. **类型提示**
+   - 所有公共函数必须标注参数类型和返回类型
+   - 可空类型使用 `Optional[X]` 或 `X | None`（Python 3.10+）
+   - 复杂类型从 `typing` 导入：`Dict`, `List`, `Callable`, `Tuple`, `Any`
+
+3. **文档字符串**
+   - **中文** Google 风格
+   - 第一行为简短摘要
+   - 必须包含 `Args:` 和 `Returns:` 段落
+
+4. **导入顺序**
+   ```python
+   # 1. 标准库
+   import os
+   from typing import Optional, Dict, List
+
+   # 2. 第三方包
+   import numpy as np
+   import polars as pl
+
+   # 3. 本地模块（绝对导入）
+   from src.data.storage import Storage
+   from src.factors import FactorEngine
+   ```
+
+5. **错误处理**
+   - 禁止裸 `except:`
+   - 错误信息格式：`print(f"[ERROR] 上下文: {e}")`
+   - 记录上下文后重新抛出 `raise`
+
+6. **日志与输出**
+   - 使用带前缀的 `print`：`print("[模块名] 消息")`
+   - 循环进度使用 `tqdm`
+   - **禁止 emoji**
+
+7. **数据加载**
+   - 查询模式必须使用 `Storage(read_only=True)`
+   - 因子计算统一通过 `FactorEngine`
+
+8. **测试**
+   - 所有新模块必须配套 `tests/test_*.py`
+   - 运行命令：`uv run pytest tests/test_xxx.py -v`
+
+---
+
+## Step 0: 统一模块引用风格为 `src.*`（已完成）
+
+**状态：** [x] 已完成（通过脚本批量替换）
+
+- 所有 `from factorminer.xxx` 已替换为 `from src.factorminer.factorminer.xxx`
+- 所有字符串形式的模块引用（如 `"factorminer.xxx"`）已同步更新
+
+---
+
+## Step 1: 本地数据加载层（`LocalDataLoader`）
+
+**文件**
+- 新建：`src/factorminer/factorminer/data/local_data_loader.py`
+- 测试：`tests/test_factorminer_local_data_loader.py`
+
+**目标**
+- 弃用 `loader.py` + `preprocessor.py`，改为从本地 DuckDB `pro_bar` 表读取数据
+- 统一日期范围：`20190101` ~ `20231231`
+- 支持股票池筛选（与 `experiment/common.py` 的 `stock_pool_filter` 对齐）
+- 生成 `$vwap` 等价字段（`amount / vol`），并提供统一的 `asset_ids` / `timestamps` 索引
+
+**实现要点**
+- 使用 `Storage(read_only=True).load_polars("pro_bar", ...)` 读取数据
+- 日期格式统一为字符串 `YYYYMMDD`
+- 股票池筛选通过注入的 `filter_func` 完成（默认使用 `experiment/common.py` 的筛选逻辑）
+- 返回封装对象 `LocalPanel`，包含：
+  - `df: pl.DataFrame`（原始长表）
+  - `asset_ids: np.ndarray`
+  - `timestamps: np.ndarray`
+
+**代码风格检查点**
+- 类名 `LocalDataLoader` / `LocalPanel`
+- 所有公共方法带类型提示和中文 docstring
+- 导入顺序正确
+
+---
+
+## Step 2: DSL 翻译器（`FmToLocalTranslator`）
+
+**文件**
+- 新建：`src/factorminer/factorminer/core/formula_translator.py`
+- 测试：`tests/test_factorminer_formula_translator.py`
+
+**目标**
+- 将 FactorMiner 论文中的 110 个 CamelCase DSL 公式翻译成本地 snake_case DSL
+- 覆盖全部算子，未覆盖的算子翻译结果前加 `# TODO` 标记
+- 翻译器**仅用于** paper factors 导入和向后兼容，不用于 LLM 生成路径
+
+**映射规则示例**
+| FactorMiner | 本地 DSL |
+|-------------|----------|
+| `Neg(X)` | `-X` |
+| `Sub(A, B)` | `A - B` |
+| `Div(A, B)` | `A / B` |
+| `CsRank(X)` | `cs_rank(X)` |
+| `TsMean(X, 20)` | `ts_mean(X, 20)` |
+| `$close` | `close` |
+| `$volume` | `vol` |
+| `$amt` | `amount` |
+| `$vwap` | `(amount / vol)`（作为子表达式嵌入时必须保留括号） |
+
+**实现要点**
+- 使用递归下降直接翻译 `ExpressionTree` 节点，不依赖字符串替换（避免括号歧义）
+- `LeafNode` 处理字段映射；`OperatorNode` 处理算子映射
+- 对二元算术算子输出中缀表达式并合理加括号
+- 未实现的算子返回 `# TODO: <原始算子名>(...)`
+
+**代码风格检查点**
+- 翻译器为一个纯函数类，无状态
+- 单元测试覆盖 paper factors 中的高频算子和至少 5 个完整公式
+
+---
+
+## Step 3: 禁用 npz 并将翻译器集成到库 I/O
+
+**文件**
+- 修改：`src/factorminer/factorminer/core/library_io.py`
+- 修改：`src/factorminer/factorminer/cli.py`（如有 `save_signals` 参数则改为始终 False）
+- 测试：`tests/test_factorminer_library_io.py`
+
+**目标**
+- 彻底禁止 `.npz` 信号缓存落盘
+- `load_library` 加载内置 110 个 paper factors 时，自动调用翻译器将其转换为本地的 snake_case DSL
+- 如果翻译结果是 `# TODO`，则在 factor metadata 中标记 `unsupported=True`
+
+**修改要点**
+- `save_library(..., save_signals)`：无论传入什么，均忽略 `save_signals`，且不写 `.npz`
+- `load_library(path)`：恢复 JSON 后，将每个 `factor.formula` 通过翻译器转换
+- `import_from_paper()`：在构建 FactorLibrary 时直接翻译所有公式
+
+**代码风格检查点**
+- 修改点尽量少，废弃参数保留以兼容旧签名，但内部忽略
+- 打印日志说明 npz 已禁用：`print("[library_io] 信号缓存已禁用，仅保存 JSON 元数据")`
+
+---
+
+## Step 4: LLM Prompt 改造（让 Agent 直接生成本地 DSL）
+
+**文件**
+- 修改：`src/factorminer/factorminer/agent/prompt_builder.py`
+- 修改：`src/factorminer/factorminer/agent/factor_generator.py`（如有必要）
+- 测试：`tests/test_factorminer_prompt.py`
+
+**目标**
+- 将 Prompt 中的 DSL 规范从 CamelCase + `$` 前缀改为本地 snake_case DSL
+- 修改示例公式，使其全部为本地 DSL 格式（如 `cs_rank(close / ts_delay(close, 5) - 1)`）
+- 明确可用字段：`open`, `high`, `low`, `close`, `vol`, `amount`, `vwap`（可用 `amount / vol` 计算）
+
+**修改要点**
+- 重写 `SYSTEM_PROMPT` 中的 DSL 规则段落
+- 将所有 prompt 示例公式替换为本地 DSL
+- `OutputParser` 中的公式清洗逻辑需同步适配（去掉 `$`，但保留中文描述）
+
+**代码风格检查点**
+- Prompt 内容易读、无 emoji
+- 通过单元测试验证 prompt 中生成本地 DSL 示例的正确性
+
+---
+
+## Step 5: `LocalFactorEvaluator`（FactorEngine 执行封装）
+
+**文件**
+- 新建：`src/factorminer/factorminer/evaluation/local_engine.py`
+- 测试：`tests/test_factorminer_local_engine.py`
+
+**目标**
+- 封装 `FactorEngine`，提供与 FactorMiner `compute_tree_signals` 兼容的接口
+- 输入：候选因子 DSL 列表；输出：`(M, T)` numpy 信号矩阵字典
+- 支持批量计算 + 立即清理 engine 状态
+
+**类签名设计**
+```python
+class LocalFactorEvaluator:
+    def __init__(self, data_loader: LocalDataLoader) -> None:
+        ...
+
+    def evaluate(
+        self,
+        specs: List[Tuple[str, str]],
+    ) -> Dict[str, np.ndarray]:
+        """批量计算并返回 {name: (M, T) 矩阵}。"""
+        ...
+
+    def evaluate_single(
+        self,
+        name: str,
+        formula: str,
+    ) -> np.ndarray:
+        """计算单个因子。"""
+        ...
+```
+
+**实现要点**
+- `evaluate` 中一次性注册所有 specs，调用 `engine.compute(...)`
+- 使用 Polars 的 `pivot` 将返回的长表转换为 `(M, T)` numpy 矩阵
+- 缺失值填充 `np.nan`
+- 计算结束后调用 `engine.clear()`
+
+**代码风格检查点**
+- 严格的类型提示和中文 docstring
+- 日志打印：`print(f"[local_engine] 开始批量计算 {n} 个因子...")`
+
+---
+
+## Step 6: 替换计算管线（`pipeline.py` / `runtime.py`）
+
+**文件**
+- 修改：`src/factorminer/factorminer/evaluation/pipeline.py`
+- 修改：`src/factorminer/factorminer/evaluation/runtime.py`
+- 测试：`tests/test_factorminer_pipeline_integration.py`
+
+**目标**
+- 将 `compute_tree_signals(..., data_dict)` 替换为通过 `LocalFactorEvaluator` 计算
+- 保留原有 IC、stats、quintile 分析逻辑
+
+**修改 `pipeline.py` 要点**
+- `ValidationPipeline.__init__` 接收 `data_loader: LocalDataLoader`
+- 构建内部 `LocalFactorEvaluator`
+- `compute_tree_signals` 改为调用 `evaluator.evaluate_single(name, formula)`
+- `evaluate` 方法中，一次性批量计算所有候选因子，再逐个进入 stats
+
+**修改 `runtime.py` 要点**
+- `evaluate_factors` 中实例化 `LocalFactorEvaluator`
+- 对每个 factor 调用 `evaluate_single`；若 formula 以 `# TODO` 开头，标记为 reject
+- 保留 split-mask 和 stats 计算逻辑
+
+**代码风格检查点**
+- 修改点精确定位，不改变评估函数的返回数据结构
+- 兼容测试通过后再提交
+
+---
+
+## Step 7: 内存优化——库中因子按需重算
+
+**文件**
+- 修改：`src/factorminer/factorminer/core/factor_library.py`
+- 测试：`tests/test_factorminer_library_memory.py`
+
+**目标**
+- 库内因子对象不再长期持有 `(M, T)` numpy signals
+- 相关性检查改为按需调用 `LocalFactorEvaluator` 重算
+
+**修改要点**
+- `admit()` 时不再保存 `signals` 到 `Factor` 对象
+- `compute_correlation` 签名改为接收 `evaluator: LocalFactorEvaluator`
+- 内部遍历库中因子，临时调用 `evaluator.evaluate_single` 计算信号，再与候选信号求相关
+- 若 formula 以 `# TODO` 开头则跳过（返回 `0.0`）
+- 删除 `_extend_correlation_matrix` / `_recompute_matrix_slot` 增量维护逻辑（改为动态求最大相关）
+
+**代码风格检查点**
+- 废弃旧方法时保留空壳或私有方法，避免测试大面积报错
+- 中文注释说明为什么删除增量矩阵（本地引擎重算成本低，内存优先）
+
+---
+
+## Step 8: 端到端集成测试（110 Paper Factors）
+
+**文件**
+- 新建：`tests/test_factorminer_e2e.py`
+
+**目标**
+- 验证翻译后的 110 个 paper factors 中，所有受支持（未标记 `unsupported=True`）的因子都能在本地引擎上成功计算信号
+- 排除因未实现算子导致的 TODO 公式，统计成功率
+
+**测试逻辑**
+1. 调用 `import_from_paper()` 加载因子库
+2. 实例化 `LocalDataLoader` 读取 20200101 ~ 20201231 数据
+3. 实例化 `LocalFactorEvaluator`
+4. 过滤掉 `unsupported=True` 的因子
+5. 批量计算剩余因子，断言输出形状为 `(M, T)` 且不含全 NaN
+6. 打印统计：`print(f"[e2e] 成功 {x}/110，跳过 {y} 个未实现算子")`
+
+**代码风格检查点**
+- 使用 `pytest.mark.slow` 标记（若运行时间 > 30 秒）
+- 不依赖外部 API Key
+
+---
+
+## Step 9: 清理所有 checkpoint 和 demo 中的 npz 保存逻辑
+
+**文件**
+- 修改：`src/factorminer/factorminer/core/ralph_loop.py`
+- 修改：`src/factorminer/factorminer/core/helix_loop.py`
+- 修改：`src/factorminer/run_demo.py`
+- 修改：`src/factorminer/run_phase2_benchmark.py`
+- 修改：`src/factorminer/factorminer/benchmark/*.py`（如有 `save_signals` 调用）
+
+**目标**
+- 确保任何运行路径都不会意外触发 `.npz` 信号缓存落盘
+- 移除或注释掉所有 `library_io.save_library(..., save_signals=True)` 调用
+
+**修改要点**
+- 搜索 `save_signals=True` 和 `.npz` 关键字，逐一处理
+- 改为 `save_signals=False` 或直接调用不带该参数的 `save_library`
+
+---
+
+## Step 10: 代码风格审查、测试全量回归与提交
+
+**执行清单**
+1. 运行 `uv run pytest tests/test_factorminer_* -v`，确保全部通过
+2. 运行 `uv run pytest tests/test_factor_engine.py tests/test_factor_integration.py -v`，确保本地框架未受影响
+3. 检查新增代码中是否混入 emoji
+4. 检查新增代码的导入顺序和 docstring 完整性
+5. 提交前做一次 `git diff --stat`，确认没有误删或大规模重写无关文件
+
+**提交建议**
+- 按模块分几个 commit，而不是一个巨大的 commit
+- 使用 Conventional Commits 风格（`feat:` / `refactor:` / `perf:` / `test:`）
+
+---
+
+## 风险与 TODO
+
+| 风险 | 应对 |
+|------|------|
+| FactorMiner 某些算子本地框架没有实现 | 翻译时标记 `# TODO`，评估阶段 reject |
+| `FactorEngine` 在极宽表（>1000 列）时内存激增 | 以 batch 为单位分批计算，并配合 `engine.clear()` |
+| 本地 `pro_bar` 表数据不完整或缺少某些日期 | 在 `LocalDataLoader` 中加入 coverage check，缺失率过高时抛异常 |
+| `OutputParser` 对本地 DSL 的括号/逗号解析不兼容 | 修改 `OutputParser` 的清洗正则，增加单元测试 |
+
+---
+
+## 附：核心模块依赖关系
+
+```
+┌────────────────────┐
+│ LocalDataLoader    │  ← Storage(read_only=True)
+└────────┬───────────┘
+         │
+         ▼
+┌──────────────────────┐
+│ LocalFactorEvaluator │ ← FactorEngine (批量计算 -> pivot -> np.ndarray)
+└────────┬─────────────┘
+         │
+    ┌────┴────┐
+    ▼         ▼
+pipeline.py  runtime.py  ← 保留 FactorMiner 的 stats / metrics / admission 逻辑
+    │
+    ▼
+factor_library.py  ← 按需重算，不保存 signals
+```
diff --git a/src/factorminer/factorminer/__init__.py b/src/factorminer/factorminer/__init__.py
deleted file mode 100644
index 9673f47..0000000
--- a/src/factorminer/factorminer/__init__.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""FactorMiner: LLM-powered quantitative factor mining with evolutionary search."""
-
-__version__ = "0.1.0"
-__author__ = "FactorMiner Team"
-
-from src.factorminer.factorminer.utils.config import (
-    Config,
-    MiningConfig,
-    EvaluationConfig,
-    DataConfig,
-    LLMConfig,
-    MemoryConfig,
-    Phase2Config,
-    CausalConfig,
-    RegimeConfig,
-    CapacityConfig,
-    SignificanceConfig,
-    DebateConfig,
-    AutoInventorConfig,
-    HelixConfig,
-    load_config,
-)
-
-__all__ = [
-    "__version__",
-    "Config",
-    "MiningConfig",
-    "EvaluationConfig",
-    "DataConfig",
-    "LLMConfig",
-    "MemoryConfig",
-    "load_config",
-    # Phase 2 configs
-    "Phase2Config",
-    "CausalConfig",
-    "RegimeConfig",
-    "CapacityConfig",
-    "SignificanceConfig",
-    "DebateConfig",
-    "AutoInventorConfig",
-    "HelixConfig",
-]
diff --git a/src/factorminer/factorminer/agent/__init__.py b/src/factorminer/factorminer/agent/__init__.py
deleted file mode 100644
index 6427ea6..0000000
--- a/src/factorminer/factorminer/agent/__init__.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""LLM agent integration for factor generation."""
-
-from src.factorminer.factorminer.agent.factor_generator import FactorGenerator
-from src.factorminer.factorminer.agent.llm_interface import (
-    AnthropicProvider,
-    GoogleProvider,
-    LLMProvider,
-    MockProvider,
-    OpenAIProvider,
-    create_provider,
-)
-from src.factorminer.factorminer.agent.output_parser import CandidateFactor, parse_llm_output
-from src.factorminer.factorminer.agent.prompt_builder import (
-    PromptBuilder,
-    build_critic_scoring_prompt,
-    build_debate_synthesis_prompt,
-    build_specialist_prompt,
-)
-from src.factorminer.factorminer.agent.specialists import (
-    DEFAULT_SPECIALISTS,
-    LIQUIDITY_SPECIALIST,
-    MOMENTUM_SPECIALIST,
-    REGIME_SPECIALIST,
-    SPECIALIST_CONFIGS,
-    VOLATILITY_SPECIALIST,
-    SpecialistAgent,
-    SpecialistConfig,
-    SpecialistDomainMemory,
-    SpecialistPromptBuilder,
-)
-from src.factorminer.factorminer.agent.critic import CriticAgent, CriticScore
-from src.factorminer.factorminer.agent.debate import (
-    DebateConfig,
-    DebateGenerator,
-    DebateMemory,
-    DebateOrchestrator,
-    DebateResult,
-)
-
-__all__ = [
-    # Generator
-    "FactorGenerator",
-    # LLM providers
-    "LLMProvider",
-    "OpenAIProvider",
-    "AnthropicProvider",
-    "GoogleProvider",
-    "MockProvider",
-    "create_provider",
-    # Parsing
-    "CandidateFactor",
-    "parse_llm_output",
-    # Prompt
-    "PromptBuilder",
-    "build_specialist_prompt",
-    "build_critic_scoring_prompt",
-    "build_debate_synthesis_prompt",
-    # Specialists
-    "SpecialistConfig",
-    "SpecialistAgent",
-    "SpecialistDomainMemory",
-    "SpecialistPromptBuilder",
-    "MOMENTUM_SPECIALIST",
-    "VOLATILITY_SPECIALIST",
-    "LIQUIDITY_SPECIALIST",
-    "REGIME_SPECIALIST",
-    "DEFAULT_SPECIALISTS",
-    "SPECIALIST_CONFIGS",
-    # Critic
-    "CriticAgent",
-    "CriticScore",
-    # Debate
-    "DebateGenerator",
-    "DebateConfig",
-    "DebateOrchestrator",
-    "DebateResult",
-    "DebateMemory",
-]
diff --git a/src/factorminer/factorminer/agent/critic.py b/src/factorminer/factorminer/agent/critic.py
deleted file mode 100644
index c991f44..0000000
--- a/src/factorminer/factorminer/agent/critic.py
+++ /dev/null
@@ -1,837 +0,0 @@
-"""Critic agent that multi-dimensionally scores candidate factors.
-
-The ``CriticAgent`` pre-filters proposals from specialist agents along six
-dimensions before any expensive backtesting occurs.  Only the top-scoring
-fraction proceeds to IC evaluation, dramatically reducing wasted compute.
-
-Scoring pipeline:
-1. Structural heuristics (complexity, operator diversity) -- O(1) per factor.
-2. Novelty scoring via string-level edit-distance and token overlap -- O(n).
-3. Pattern alignment against success memory -- keyword matching -- O(n).
-4. LLM scoring of top candidates for economic intuition -- one API call.
-5. Composite score computation and ranking.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import math
-import re
-from collections import Counter
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-from src.factorminer.factorminer.agent.llm_interface import LLMProvider
-from src.factorminer.factorminer.agent.output_parser import CandidateFactor
-from src.factorminer.factorminer.agent.prompt_builder import normalize_factor_references
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# CriticScore dataclass -- multi-dimensional
-# ---------------------------------------------------------------------------
-
-@dataclass
-class CriticScore:
-    """Multi-dimensional scored review of a single candidate factor.
-
-    Attributes
-    ----------
-    factor_name : str
-        Name of the candidate factor being reviewed.
-    formula : str
-        DSL formula string of the candidate.
-    source_specialist : str
-        Name of the specialist that proposed this candidate.
-    scores : dict
-        Per-dimension scores, each in [0, 1]:
-        - ``novelty``: structural distinctiveness from existing library.
-        - ``economic_intuition``: economic meaningfulness (LLM-assessed).
-        - ``complexity_penalty``: complexity fitness (1 = optimal depth/ops).
-        - ``operator_diversity``: uses diverse operator categories.
-        - ``pattern_alignment``: aligns with known success patterns in memory.
-        - ``regime_appropriateness``: appropriate for current market regime.
-    composite_score : float
-        Weighted average of dimension scores.
-    keep : bool
-        Whether this factor should proceed to expensive IC evaluation.
-    critique : str
-        Natural-language explanation from the critic.
-    """
-
-    factor_name: str
-    formula: str
-    source_specialist: str
-    scores: Dict[str, float] = field(default_factory=lambda: {
-        "novelty": 0.5,
-        "economic_intuition": 0.5,
-        "complexity_penalty": 0.5,
-        "operator_diversity": 0.5,
-        "pattern_alignment": 0.5,
-        "regime_appropriateness": 0.5,
-    })
-    composite_score: float = 0.5
-    keep: bool = True
-    critique: str = ""
-
-    # --- Backward-compatible convenience properties ---
-
-    @property
-    def novelty_score(self) -> float:
-        return self.scores.get("novelty", 0.5)
-
-    @property
-    def quality_score(self) -> float:
-        return self.scores.get("economic_intuition", 0.5)
-
-    @property
-    def diversity_bonus(self) -> float:
-        return self.scores.get("operator_diversity", 0.5)
-
-    @property
-    def critic_rationale(self) -> str:
-        return self.critique
-
-    @property
-    def final_score(self) -> float:
-        return self.composite_score
-
-
-# ---------------------------------------------------------------------------
-# Scoring weights
-# ---------------------------------------------------------------------------
-
-_SCORE_WEIGHTS: Dict[str, float] = {
-    "novelty": 0.25,
-    "economic_intuition": 0.30,
-    "complexity_penalty": 0.15,
-    "operator_diversity": 0.10,
-    "pattern_alignment": 0.10,
-    "regime_appropriateness": 0.10,
-}
-
-# Pre-filter: keep this fraction by composite score before expensive eval
-_PREFILTER_FRACTION = 0.60
-
-# LLM scoring: only send this many top candidates to the LLM for economic
-# intuition scoring (reduces API cost while covering the promising ones)
-_LLM_SCORING_TOP_K = 40
-
-
-# ---------------------------------------------------------------------------
-# Formula-level feature extraction helpers
-# ---------------------------------------------------------------------------
-
-# Operator categories for diversity measurement
-_OP_CATEGORIES: Dict[str, str] = {
-    "Add": "arithmetic", "Sub": "arithmetic", "Mul": "arithmetic",
-    "Div": "arithmetic", "Neg": "arithmetic", "Abs": "arithmetic",
-    "Square": "arithmetic", "Sqrt": "arithmetic", "Log": "arithmetic",
-    "Pow": "arithmetic", "Sign": "arithmetic",
-    "Std": "statistical", "Var": "statistical", "Mean": "statistical",
-    "Sum": "statistical", "Skew": "statistical", "Kurt": "statistical",
-    "Median": "statistical", "Quantile": "statistical", "Max": "statistical",
-    "Min": "statistical",
-    "Delta": "timeseries", "Delay": "timeseries", "TsRank": "timeseries",
-    "TsMax": "timeseries", "TsMin": "timeseries", "TsArgMax": "timeseries",
-    "TsArgMin": "timeseries", "TsLinRegSlope": "timeseries",
-    "Return": "timeseries", "LogReturn": "timeseries", "CumSum": "timeseries",
-    "EMA": "smoothing", "SMA": "smoothing", "WMA": "smoothing",
-    "HMA": "smoothing", "DEMA": "smoothing", "KAMA": "smoothing",
-    "Decay": "smoothing",
-    "CsRank": "cross_sectional", "CsZScore": "cross_sectional",
-    "CsDemean": "cross_sectional", "CsScale": "cross_sectional",
-    "CsNeutralize": "cross_sectional", "CsQuantile": "cross_sectional",
-    "Corr": "regression", "Cov": "regression", "Beta": "regression",
-    "Resi": "regression", "Rsquare": "regression", "Resid": "regression",
-    "IfElse": "logical", "Greater": "logical", "Less": "logical",
-    "GreaterEqual": "logical", "LessEqual": "logical", "Equal": "logical",
-}
-
-_OPERATOR_PATTERN = re.compile(r"([A-Z][a-zA-Z0-9]*)\s*\(")
-
-
-def _extract_operators(formula: str) -> List[str]:
-    """Extract all operator names from a formula string."""
-    return _OPERATOR_PATTERN.findall(formula)
-
-
-def _formula_depth(formula: str) -> int:
-    """Estimate nesting depth by counting maximum parenthesis depth."""
-    max_depth = 0
-    depth = 0
-    for ch in formula:
-        if ch == "(":
-            depth += 1
-            max_depth = max(max_depth, depth)
-        elif ch == ")":
-            depth -= 1
-    return max_depth
-
-
-def _tokenize_formula(formula: str) -> Set[str]:
-    """Tokenize a formula into its operator and feature tokens."""
-    tokens: Set[str] = set()
-    tokens.update(_OPERATOR_PATTERN.findall(formula))
-    for feat in re.findall(r"\$[a-z]+", formula):
-        tokens.add(feat)
-    return tokens
-
-
-def _edit_distance_normalized(a: str, b: str, max_len: int = 200) -> float:
-    """Compute normalized edit distance between two formula strings.
-
-    Returns 0.0 for identical strings, 1.0 for completely different.
-    """
-    a = a[:max_len]
-    b = b[:max_len]
-    if a == b:
-        return 0.0
-    la, lb = len(a), len(b)
-    prev = list(range(lb + 1))
-    for i, ca in enumerate(a):
-        curr = [i + 1]
-        for j, cb in enumerate(b):
-            cost = 0 if ca == cb else 1
-            curr.append(min(curr[j] + 1, prev[j + 1] + 1, prev[j] + cost))
-        prev = curr
-    return prev[lb] / max(la, lb)
-
-
-def _token_idf_similarity(formula: str, existing: List[str]) -> float:
-    """Compute TF-IDF-inspired token overlap similarity.
-
-    Returns a value in [0, 1] where 1 means very similar to existing,
-    0 means completely novel.
-    """
-    if not existing:
-        return 0.0
-
-    query_tokens = _tokenize_formula(formula)
-    if not query_tokens:
-        return 0.0
-
-    df: Counter = Counter()
-    for ex in existing:
-        for tok in _tokenize_formula(ex):
-            df[tok] += 1
-
-    n_docs = len(existing)
-    score = 0.0
-    for tok in query_tokens:
-        if tok in df:
-            idf = math.log(n_docs / df[tok]) if df[tok] < n_docs else 0.0
-            score += (1.0 + idf)
-
-    max_score = sum(1.0 for _ in query_tokens)
-    if max_score == 0:
-        return 0.0
-    return min(1.0, score / (max_score * math.log(n_docs + 1) + 1.0))
-
-
-# ---------------------------------------------------------------------------
-# CriticAgent
-# ---------------------------------------------------------------------------
-
-class CriticAgent:
-    """LLM-powered multi-dimensional critic for candidate factor pre-filtering.
-
-    Evaluates candidates along 6 dimensions before any expensive IC evaluation.
-    Uses structural heuristics for fast pre-scoring, then sends top-K to the
-    LLM for economic intuition scoring.  Only the top fraction by composite
-    score is marked as ``keep=True`` for downstream evaluation.
-
-    Parameters
-    ----------
-    llm_provider : LLMProvider
-        LLM backend for economic intuition scoring.
-    temperature : float
-        Sampling temperature for the critic's LLM calls.
-    max_tokens : int
-        Max response tokens for the critic review.
-    prefilter_fraction : float
-        Fraction of candidates to keep after scoring (0.0-1.0).
-    llm_scoring_top_k : int
-        How many top candidates (by heuristic score) to send to LLM
-        for economic intuition scoring.
-    """
-
-    _SYSTEM_PROMPT = (
-        "You are a rigorous quantitative research critic specialising in "
-        "formulaic alpha factors.  Your job is to evaluate candidate factor "
-        "expressions on their economic intuition: does the factor make sense "
-        "as a predictor of cross-sectional stock returns?  Be rigorous, "
-        "concise, and return structured JSON only."
-    )
-
-    def __init__(
-        self,
-        llm_provider: LLMProvider,
-        temperature: float = 0.3,
-        max_tokens: int = 4096,
-        prefilter_fraction: float = _PREFILTER_FRACTION,
-        llm_scoring_top_k: int = _LLM_SCORING_TOP_K,
-    ) -> None:
-        self.llm_provider = llm_provider
-        self.temperature = temperature
-        self.max_tokens = max_tokens
-        self.prefilter_fraction = prefilter_fraction
-        self.llm_scoring_top_k = llm_scoring_top_k
-
-    # ------------------------------------------------------------------
-    # Primary public API
-    # ------------------------------------------------------------------
-
-    def score_batch(
-        self,
-        candidates: List[str],
-        existing_factors: Optional[List[str]] = None,
-        memory_signal: Optional[str] = None,
-        regime_context: str = "",
-        specialist_map: Optional[Dict[str, str]] = None,
-    ) -> List[CriticScore]:
-        """Score a flat list of candidate formula strings.
-
-        Parameters
-        ----------
-        candidates : list[str]
-            Formula strings to evaluate.
-        existing_factors : list[str] or None
-            Formulas already in the library (for novelty scoring).
-        memory_signal : str or None
-            Free-text memory context (success patterns, etc.).
-        regime_context : str
-            Current market regime description.
-        specialist_map : dict or None
-            Mapping formula -> specialist name for attribution.
-
-        Returns
-        -------
-        list[CriticScore]
-            Scores in the same order as ``candidates``.
-        """
-        existing_factors = existing_factors or []
-        specialist_map = specialist_map or {}
-
-        from factorminer.agent.output_parser import _try_build_candidate
-        cf_list: List[CandidateFactor] = []
-        for i, formula in enumerate(candidates):
-            cf = _try_build_candidate(f"candidate_{i}", formula)
-            cf_list.append(cf)
-
-        proposals: Dict[str, List[CandidateFactor]] = {}
-        for cf in cf_list:
-            src = specialist_map.get(cf.formula, "unknown")
-            proposals.setdefault(src, []).append(cf)
-
-        return self._score_proposals(
-            proposals=proposals,
-            existing_factors=existing_factors,
-            memory_signal=memory_signal or "",
-            regime_context=regime_context,
-        )
-
-    def review_candidates(
-        self,
-        proposals: Dict[str, List[CandidateFactor]],
-        library_state: Optional[Dict[str, Any]] = None,
-        memory_signal: Optional[Dict[str, Any]] = None,
-        top_k: int = 40,
-    ) -> List[CriticScore]:
-        """Review all specialist proposals and return ranked scores.
-
-        This is the primary interface used by ``DebateGenerator``.
-
-        Parameters
-        ----------
-        proposals : dict[str, list[CandidateFactor]]
-            Mapping from specialist name to its list of candidates.
-        library_state : dict or None
-            Current factor library state for context.
-        memory_signal : dict or None
-            Memory priors for context.
-        top_k : int
-            Number of top-scoring candidates to return.
-
-        Returns
-        -------
-        list[CriticScore]
-            Top-K candidates sorted by ``composite_score`` descending.
-        """
-        library_state = library_state or {}
-        memory_signal = memory_signal or {}
-
-        existing_factors: List[str] = normalize_factor_references(
-            library_state.get("recent_admissions", [])
-        )
-        mem_str = self._memory_signal_to_str(memory_signal)
-        regime_context = memory_signal.get("regime_context", "")
-
-        scores = self._score_proposals(
-            proposals=proposals,
-            existing_factors=existing_factors,
-            memory_signal=mem_str,
-            regime_context=str(regime_context),
-        )
-
-        scores.sort(key=lambda s: s.composite_score, reverse=True)
-        return scores[:top_k]
-
-    # ------------------------------------------------------------------
-    # Internal scoring pipeline
-    # ------------------------------------------------------------------
-
-    def _score_proposals(
-        self,
-        proposals: Dict[str, List[CandidateFactor]],
-        existing_factors: List[str],
-        memory_signal: str,
-        regime_context: str,
-    ) -> List[CriticScore]:
-        """Full multi-dimensional scoring pipeline."""
-        all_pairs: List[Tuple[str, CandidateFactor]] = []
-        for spec_name, candidates in proposals.items():
-            for c in candidates:
-                all_pairs.append((spec_name, c))
-
-        if not all_pairs:
-            return []
-
-        # Phase 1: Heuristic scoring
-        partial_scores: List[CriticScore] = []
-        for spec_name, candidate in all_pairs:
-            scores_dict = self._heuristic_score(
-                formula=candidate.formula,
-                existing_factors=existing_factors,
-                memory_signal=memory_signal,
-                regime_context=regime_context,
-            )
-            scores_dict["economic_intuition"] = 0.5  # LLM will fill in
-
-            composite = self._compute_composite(scores_dict)
-            critique = self._brief_heuristic_critique(scores_dict, candidate.formula)
-
-            partial_scores.append(CriticScore(
-                factor_name=candidate.name,
-                formula=candidate.formula,
-                source_specialist=spec_name,
-                scores=scores_dict,
-                composite_score=composite,
-                keep=True,
-                critique=critique,
-            ))
-
-        # Phase 2: LLM economic intuition for top candidates
-        partial_scores.sort(key=lambda s: s.composite_score, reverse=True)
-        top_for_llm = partial_scores[:self.llm_scoring_top_k]
-
-        llm_econ_scores = self._llm_economic_intuition(
-            candidates=top_for_llm,
-            existing_factors=existing_factors,
-            memory_signal=memory_signal,
-        )
-
-        # Phase 3: Recompute composite with LLM scores
-        for score_obj in partial_scores:
-            if score_obj.factor_name in llm_econ_scores:
-                econ, rationale = llm_econ_scores[score_obj.factor_name]
-                score_obj.scores["economic_intuition"] = econ
-                score_obj.composite_score = self._compute_composite(score_obj.scores)
-                if rationale:
-                    score_obj.critique = rationale
-
-        # Phase 4: Diversity-aware re-ranking
-        partial_scores.sort(key=lambda s: s.composite_score, reverse=True)
-        partial_scores = self._apply_diversity_adjustment(partial_scores)
-
-        # Phase 5: Pre-filter -- mark keep/discard
-        n_keep = max(1, int(len(partial_scores) * self.prefilter_fraction))
-        for i, score_obj in enumerate(partial_scores):
-            score_obj.keep = i < n_keep
-
-        return partial_scores
-
-    def _heuristic_score(
-        self,
-        formula: str,
-        existing_factors: List[str],
-        memory_signal: str,
-        regime_context: str,
-    ) -> Dict[str, float]:
-        """Compute heuristic dimension scores without LLM call."""
-        operators = _extract_operators(formula)
-        depth = _formula_depth(formula)
-        unique_ops = list(dict.fromkeys(operators))
-        n_unique = len(unique_ops)
-
-        novelty = self._score_novelty(formula, existing_factors)
-        complexity = self._score_complexity(depth, n_unique)
-        op_diversity = self._score_operator_diversity(unique_ops)
-        pattern_align = self._score_pattern_alignment(formula, memory_signal)
-        regime_score = self._score_regime_appropriateness(formula, regime_context)
-
-        return {
-            "novelty": novelty,
-            "economic_intuition": 0.5,
-            "complexity_penalty": complexity,
-            "operator_diversity": op_diversity,
-            "pattern_alignment": pattern_align,
-            "regime_appropriateness": regime_score,
-        }
-
-    def _score_novelty(self, formula: str, existing_factors: List[str]) -> float:
-        """Novelty: 1.0 = completely novel, 0.0 = exact duplicate."""
-        if not existing_factors:
-            return 0.8
-
-        token_sim = _token_idf_similarity(formula, existing_factors)
-        sample = existing_factors[-20:]
-        edit_dists = [_edit_distance_normalized(formula, ex) for ex in sample]
-        avg_edit = sum(edit_dists) / len(edit_dists) if edit_dists else 1.0
-
-        novelty = 0.5 * (1.0 - token_sim) + 0.5 * avg_edit
-        return float(max(0.0, min(1.0, novelty)))
-
-    def _score_complexity(self, depth: int, n_unique_ops: int) -> float:
-        """Complexity fitness: 1.0 = optimal (depth 3-7, 3-5 unique ops)."""
-        if 3 <= depth <= 7:
-            depth_score = 1.0
-        elif depth < 3:
-            depth_score = depth / 3.0
-        else:
-            depth_score = max(0.0, 1.0 - 0.15 * (depth - 7))
-
-        if 3 <= n_unique_ops <= 5:
-            op_score = 1.0
-        elif n_unique_ops < 3:
-            op_score = n_unique_ops / 3.0
-        else:
-            op_score = max(0.0, 1.0 - 0.1 * (n_unique_ops - 5))
-
-        return float(0.6 * depth_score + 0.4 * op_score)
-
-    def _score_operator_diversity(self, unique_ops: List[str]) -> float:
-        """Operator diversity: how many distinct operator categories appear?"""
-        categories = {_OP_CATEGORIES.get(op, "other") for op in unique_ops}
-        n_categories = len(categories)
-        diversity_map = {0: 0.0, 1: 0.2, 2: 0.5, 3: 0.8}
-        return float(diversity_map.get(n_categories, 1.0))
-
-    def _score_pattern_alignment(self, formula: str, memory_signal: str) -> float:
-        """Pattern alignment: do formula tokens appear in known success patterns?"""
-        if not memory_signal:
-            return 0.5
-
-        formula_lower = formula.lower()
-        signal_lower = memory_signal.lower()
-
-        formula_tokens = set(re.findall(r"[a-z]+", formula_lower))
-        signal_tokens = set(re.findall(r"[a-z]+", signal_lower))
-
-        stopwords = {"the", "a", "is", "in", "of", "to", "and", "or", "for",
-                     "as", "by", "on", "it", "be", "at", "an", "up"}
-        formula_tokens -= stopwords
-        signal_tokens -= stopwords
-
-        if not formula_tokens:
-            return 0.5
-
-        overlap = formula_tokens & signal_tokens
-        alignment = len(overlap) / len(formula_tokens)
-        return float(0.3 + 0.7 * min(1.0, alignment * 2))
-
-    def _score_regime_appropriateness(
-        self, formula: str, regime_context: str
-    ) -> float:
-        """Does this formula suit the stated regime context?"""
-        if not regime_context:
-            return 0.7
-
-        regime_lower = regime_context.lower()
-
-        momentum_kw = {"momentum", "trend", "trending", "breakout"}
-        volatility_kw = {"volatile", "volatility", "risk-off", "vix"}
-        reversal_kw = {"reversal", "mean-reversion", "oversold", "overbought"}
-        liquidity_kw = {"illiquid", "liquidity", "volume"}
-
-        regime_is_momentum = any(k in regime_lower for k in momentum_kw)
-        regime_is_volatile = any(k in regime_lower for k in volatility_kw)
-        regime_is_reversal = any(k in regime_lower for k in reversal_kw)
-        regime_is_illiquid = any(k in regime_lower for k in liquidity_kw)
-
-        formula_has_momentum = any(
-            op in formula for op in ("Delta", "TsRank", "EMA", "Return", "TsLinReg")
-        )
-        formula_has_vol = any(op in formula for op in ("Std", "Kurt", "Skew", "Var"))
-        formula_has_reversal = "Neg" in formula or any(
-            op in formula for op in ("Mean", "SMA", "TsRank")
-        )
-        formula_has_volume = any(f in formula for f in ("$volume", "$amt"))
-
-        matches = 0
-        total_signals = 0
-        if regime_is_momentum:
-            total_signals += 1
-            matches += int(formula_has_momentum)
-        if regime_is_volatile:
-            total_signals += 1
-            matches += int(formula_has_vol)
-        if regime_is_reversal:
-            total_signals += 1
-            matches += int(formula_has_reversal)
-        if regime_is_illiquid:
-            total_signals += 1
-            matches += int(formula_has_volume)
-
-        if total_signals == 0:
-            return 0.7
-        return float(0.4 + 0.6 * (matches / total_signals))
-
-    @staticmethod
-    def _compute_composite(scores: Dict[str, float]) -> float:
-        """Compute weighted composite score from dimension scores."""
-        total = 0.0
-        weight_sum = 0.0
-        for dim, weight in _SCORE_WEIGHTS.items():
-            val = scores.get(dim, 0.5)
-            total += weight * val
-            weight_sum += weight
-        if weight_sum == 0:
-            return 0.5
-        return float(total / weight_sum)
-
-    def _brief_heuristic_critique(
-        self, scores: Dict[str, float], formula: str
-    ) -> str:
-        """Generate a brief human-readable critique from heuristic scores."""
-        parts = []
-        depth = _formula_depth(formula)
-        ops = _extract_operators(formula)
-        n_unique = len(set(ops))
-
-        novelty = scores.get("novelty", 0.5)
-        if novelty < 0.3:
-            parts.append("closely resembles existing library factors")
-        elif novelty > 0.7:
-            parts.append("structurally novel")
-
-        complexity = scores.get("complexity_penalty", 0.5)
-        if complexity < 0.4:
-            if depth < 3:
-                parts.append(f"too shallow (depth={depth})")
-            elif depth > 8:
-                parts.append(f"overly deep (depth={depth})")
-            if n_unique < 3:
-                parts.append(f"low operator diversity ({n_unique} unique ops)")
-
-        op_div = scores.get("operator_diversity", 0.5)
-        if op_div >= 0.8:
-            cats = {_OP_CATEGORIES.get(op, "other") for op in set(ops)}
-            parts.append(f"good operator variety ({', '.join(sorted(cats))})")
-
-        if not parts:
-            parts.append("passes heuristic checks")
-
-        return "; ".join(parts) + "."
-
-    # ------------------------------------------------------------------
-    # LLM economic intuition scoring
-    # ------------------------------------------------------------------
-
-    def _llm_economic_intuition(
-        self,
-        candidates: List[CriticScore],
-        existing_factors: List[str],
-        memory_signal: str,
-    ) -> Dict[str, Tuple[float, str]]:
-        """Send top candidates to LLM for economic intuition scoring."""
-        if not candidates:
-            return {}
-
-        prompt = self._build_llm_scoring_prompt(
-            candidates=candidates,
-            existing_factors=existing_factors,
-            memory_signal=memory_signal,
-        )
-
-        try:
-            raw = self.llm_provider.generate(
-                system_prompt=self._SYSTEM_PROMPT,
-                user_prompt=prompt,
-                temperature=self.temperature,
-                max_tokens=self.max_tokens,
-            )
-            return self._parse_llm_scoring_response(raw, candidates)
-        except Exception as exc:
-            logger.warning(
-                "Critic LLM economic intuition scoring failed: %s. "
-                "Keeping heuristic scores.",
-                exc,
-            )
-            return {}
-
-    def _build_llm_scoring_prompt(
-        self,
-        candidates: List[CriticScore],
-        existing_factors: List[str],
-        memory_signal: str,
-    ) -> str:
-        """Build the structured scoring prompt for LLM economic intuition."""
-        sections: List[str] = []
-
-        if existing_factors:
-            sections.append("## EXISTING LIBRARY SAMPLE (last 10)")
-            for f in existing_factors[-10:]:
-                sections.append(f"  - {f}")
-
-        if memory_signal:
-            sections.append(f"\n## MEMORY CONTEXT\n{memory_signal[:800]}")
-
-        sections.append("\n## CANDIDATES FOR ECONOMIC INTUITION SCORING")
-        sections.append(
-            "Score each on economic_intuition (0.0-1.0): does this formula "
-            "capture a plausible, economically meaningful cross-sectional "
-            "return predictor?  Consider:\n"
-            "  - Is there a coherent economic story?\n"
-            "  - Is the complexity level appropriate (depth 3-7 is best)?\n"
-            "  - Does it avoid trivial reformulations of simple momentum/reversal?\n"
-            "  - Does it use features in a semantically coherent way?\n"
-        )
-
-        for cs in candidates:
-            sections.append(
-                f"  Factor: {cs.factor_name}  "
-                f"[Specialist: {cs.source_specialist}]\n"
-                f"  Formula: {cs.formula}"
-            )
-
-        sections.append(
-            "\n## OUTPUT FORMAT\n"
-            "Return one JSON object per line, exactly:\n"
-            '{"name": "<factor_name>", "economic_intuition": <0.0-1.0>, '
-            '"rationale": "<one sentence>"}\n'
-            "Output ONLY the JSON lines. No markdown, no explanations."
-        )
-
-        return "\n".join(sections)
-
-    def _parse_llm_scoring_response(
-        self,
-        raw: str,
-        candidates: List[CriticScore],
-    ) -> Dict[str, Tuple[float, str]]:
-        """Parse LLM scoring response into economic intuition scores."""
-        valid_names = {cs.factor_name for cs in candidates}
-        results: Dict[str, Tuple[float, str]] = {}
-        json_pattern = re.compile(r"\{[^{}]+\}")
-
-        for match in json_pattern.findall(raw):
-            try:
-                obj = json.loads(match)
-            except json.JSONDecodeError:
-                continue
-            name = obj.get("name", "")
-            if name not in valid_names:
-                continue
-            econ = float(max(0.0, min(1.0, obj.get("economic_intuition", 0.5))))
-            rationale = str(obj.get("rationale", ""))
-            results[name] = (econ, rationale)
-
-        logger.debug(
-            "LLM economic intuition: scored %d/%d candidates",
-            len(results),
-            len(candidates),
-        )
-        return results
-
-    # ------------------------------------------------------------------
-    # Diversity adjustment
-    # ------------------------------------------------------------------
-
-    def _apply_diversity_adjustment(
-        self, scores: List[CriticScore]
-    ) -> List[CriticScore]:
-        """Slightly boost underrepresented specialists to maintain balance."""
-        if not scores:
-            return scores
-
-        specialist_counts: Counter = Counter()
-        n_specialists = len({s.source_specialist for s in scores})
-        ideal_frac = 1.0 / max(n_specialists, 1)
-
-        adjusted = []
-        for cs in scores:
-            specialist_counts[cs.source_specialist] += 1
-            total_so_far = sum(specialist_counts.values())
-            actual_frac = specialist_counts[cs.source_specialist] / total_so_far
-            diversity_adj = (ideal_frac - actual_frac) * 0.1
-            diversity_adj = max(-0.05, min(0.05, diversity_adj))
-            adjusted_score = float(
-                max(0.0, min(1.0, cs.composite_score + diversity_adj))
-            )
-            new_scores = dict(cs.scores)
-            new_scores["operator_diversity"] = float(
-                max(0.0, min(1.0,
-                    cs.scores.get("operator_diversity", 0.5) + diversity_adj
-                ))
-            )
-            adjusted.append(CriticScore(
-                factor_name=cs.factor_name,
-                formula=cs.formula,
-                source_specialist=cs.source_specialist,
-                scores=new_scores,
-                composite_score=adjusted_score,
-                keep=cs.keep,
-                critique=cs.critique,
-            ))
-
-        adjusted.sort(key=lambda s: s.composite_score, reverse=True)
-        return adjusted
-
-    # ------------------------------------------------------------------
-    # Utility helpers
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _memory_signal_to_str(memory_signal: Dict[str, Any]) -> str:
-        """Flatten a memory signal dict to a compact string for embedding."""
-        parts: List[str] = []
-        for key in (
-            "recommended_directions", "strategic_insights",
-            "complementary_patterns", "prompt_text",
-        ):
-            val = memory_signal.get(key)
-            if isinstance(val, list):
-                parts.extend(str(v) for v in val)
-            elif isinstance(val, str) and val:
-                parts.append(val)
-        return " ".join(parts)
-
-    @staticmethod
-    def _fallback_uniform_scores(
-        proposals: Dict[str, List[CandidateFactor]],
-    ) -> List[CriticScore]:
-        """Generate uniform scores when all scoring mechanisms fail."""
-        default_composite = 0.5
-        scores: List[CriticScore] = []
-        for specialist_name, candidates in proposals.items():
-            for c in candidates:
-                scores.append(CriticScore(
-                    factor_name=c.name,
-                    formula=c.formula,
-                    source_specialist=specialist_name,
-                    scores={
-                        "novelty": 0.5,
-                        "economic_intuition": 0.5,
-                        "complexity_penalty": 0.5,
-                        "operator_diversity": 0.5,
-                        "pattern_alignment": 0.5,
-                        "regime_appropriateness": 0.5,
-                    },
-                    composite_score=default_composite,
-                    keep=True,
-                    critique="Fallback uniform score (critic unavailable).",
-                ))
-        return scores
diff --git a/src/factorminer/factorminer/agent/debate.py b/src/factorminer/factorminer/agent/debate.py
deleted file mode 100644
index 9fb4d68..0000000
--- a/src/factorminer/factorminer/agent/debate.py
+++ /dev/null
@@ -1,949 +0,0 @@
-"""Multi-agent debate orchestrator for factor generation (FactorMAD).
-
-``DebateGenerator`` is a **drop-in replacement** for ``FactorGenerator``.
-It runs multiple domain-specialist generators, collects their proposals,
-passes them through a multi-dimensional ``CriticAgent`` for pre-filtering,
-and returns a single ``List[CandidateFactor]`` with the same interface as
-``FactorGenerator.generate_batch()``.
-
-The full pipeline (``DebateOrchestrator``) also supports:
-- SymPy-based algebraic deduplication via ``FormulaCanonicalizer``.
-- ``DebateMemory`` tracking: specialist leaderboards, blind spot detection.
-- Parallel specialist generation (thread-pool).
-- Structured ``DebateResult`` dataclass capturing the full debate state.
-"""
-
-from __future__ import annotations
-
-import concurrent.futures
-import logging
-from collections import Counter
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
-
-from src.factorminer.factorminer.agent.critic import CriticAgent, CriticScore
-from src.factorminer.factorminer.agent.factor_generator import FactorGenerator
-from src.factorminer.factorminer.agent.llm_interface import LLMProvider
-from src.factorminer.factorminer.agent.output_parser import CandidateFactor
-from src.factorminer.factorminer.agent.prompt_builder import (
-    PromptBuilder,
-    normalize_factor_references,
-)
-from src.factorminer.factorminer.agent.specialists import (
-    DEFAULT_SPECIALISTS,
-    SpecialistAgent,
-    SpecialistConfig,
-    SpecialistPromptBuilder,
-)
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# DebateConfig
-# ---------------------------------------------------------------------------
-
-@dataclass
-class DebateConfig:
-    """Configuration for the multi-agent FactorMAD pipeline.
-
-    Attributes
-    ----------
-    specialists : list[SpecialistConfig]
-        Specialist configurations to run.  Defaults to the four
-        pre-defined specialists (momentum, volatility, liquidity, regime).
-    enable_critic : bool
-        Whether to run the CriticAgent for multi-dimensional scoring.
-    candidates_per_specialist : int
-        Number of candidates each specialist generates per round.
-    top_k_after_critic : int
-        How many candidates the critic retains after ranking.
-    critic_temperature : float
-        Sampling temperature for the critic LLM call.
-    enable_deduplication : bool
-        Whether to use SymPy canonicalization to remove algebraic duplicates.
-    enable_debate_memory : bool
-        Whether to track debate history across rounds for specialist feedback.
-    parallel_specialists : bool
-        Whether to run specialists in parallel (thread pool).
-    max_parallel_workers : int
-        Maximum number of parallel threads for specialist generation.
-    """
-
-    specialists: List[SpecialistConfig] = field(
-        default_factory=lambda: list(DEFAULT_SPECIALISTS)
-    )
-    enable_critic: bool = True
-    candidates_per_specialist: int = 15
-    top_k_after_critic: int = 40
-    critic_temperature: float = 0.3
-    enable_deduplication: bool = True
-    enable_debate_memory: bool = True
-    parallel_specialists: bool = True
-    max_parallel_workers: int = 4
-
-
-# ---------------------------------------------------------------------------
-# DebateResult
-# ---------------------------------------------------------------------------
-
-@dataclass
-class DebateResult:
-    """Full structured result from one debate round.
-
-    Attributes
-    ----------
-    all_proposals : list[str]
-        Raw formula strings from all specialists.
-    after_dedup : list[str]
-        Formulas after SymPy algebraic deduplication.
-    after_critic : list[str]
-        Formulas that passed the critic pre-filter (``keep=True``).
-    critic_scores : list[CriticScore]
-        Full multi-dimensional scores for all candidates.
-    specialist_proposals : dict[str, list[str]]
-        Per-specialist formula strings before any filtering.
-    specialist_success_rates : dict[str, float]
-        Historical admission success rates per specialist.
-    debate_stats : dict
-        Summary statistics: n_proposals, n_after_dedup, n_after_critic,
-        n_duplicates_removed, specialist_counts.
-    """
-
-    all_proposals: List[str] = field(default_factory=list)
-    after_dedup: List[str] = field(default_factory=list)
-    after_critic: List[str] = field(default_factory=list)
-    critic_scores: List[CriticScore] = field(default_factory=list)
-    specialist_proposals: Dict[str, List[str]] = field(default_factory=dict)
-    specialist_success_rates: Dict[str, float] = field(default_factory=dict)
-    debate_stats: Dict[str, Any] = field(default_factory=dict)
-
-
-# ---------------------------------------------------------------------------
-# DebateMemory -- cross-round debate history tracking
-# ---------------------------------------------------------------------------
-
-class DebateMemory:
-    """Tracks debate history across rounds: who proposed what, what got admitted.
-
-    Used by ``DebateOrchestrator`` to maintain specialist leaderboards,
-    identify blind spots (operator families nobody proposes), and surface
-    patterns the critic consistently rewards.
-
-    Parameters
-    ----------
-    specialist_names : list[str]
-        Names of all participating specialists.
-    """
-
-    _ALL_OP_FAMILIES: List[str] = [
-        "arithmetic", "statistical", "timeseries", "smoothing",
-        "cross_sectional", "regression", "logical",
-    ]
-
-    def __init__(self, specialist_names: List[str]) -> None:
-        self._specialist_names = list(specialist_names)
-        self._proposal_history: Dict[str, List[tuple]] = {
-            name: [] for name in specialist_names
-        }
-        self._rounds: List[Dict[str, Any]] = []
-        self._best_critic_patterns: List[str] = []
-
-    def record_round(
-        self,
-        debate_result: DebateResult,
-        admissions: Optional[List[str]] = None,
-    ) -> None:
-        """Record outcome of one debate round.
-
-        Parameters
-        ----------
-        debate_result : DebateResult
-            The result of the debate round.
-        admissions : list[str] or None
-            Formulas ultimately admitted to the library after IC evaluation.
-        """
-        admissions = admissions or []
-        admission_set = set(admissions)
-
-        for spec_name, formulas in debate_result.specialist_proposals.items():
-            for formula in formulas:
-                was_admitted = formula in admission_set
-                self._proposal_history.setdefault(spec_name, []).append(
-                    (formula, was_admitted)
-                )
-
-        for score in debate_result.critic_scores:
-            if score.composite_score >= 0.7 and score.formula in admission_set:
-                self._best_critic_patterns.append(score.formula)
-
-        self._rounds.append({
-            "n_proposals": len(debate_result.all_proposals),
-            "n_after_dedup": len(debate_result.after_dedup),
-            "n_after_critic": len(debate_result.after_critic),
-            "n_admissions": len(admissions),
-            "specialist_counts": {
-                name: len(formulas)
-                for name, formulas in debate_result.specialist_proposals.items()
-            },
-        })
-
-    def get_specialist_leaderboard(self) -> List[Dict[str, Any]]:
-        """Return specialist performance sorted by admission rate.
-
-        Returns
-        -------
-        list[dict]
-            Each dict has keys: ``name``, ``proposed``, ``admitted``,
-            ``admission_rate``.  Sorted by ``admission_rate`` descending.
-        """
-        rows: List[Dict[str, Any]] = []
-        for name in self._specialist_names:
-            history = self._proposal_history.get(name, [])
-            proposed = len(history)
-            admitted = sum(1 for _, was_admitted in history if was_admitted)
-            rate = admitted / max(proposed, 1)
-            rows.append({
-                "name": name,
-                "proposed": proposed,
-                "admitted": admitted,
-                "admission_rate": rate,
-            })
-        rows.sort(key=lambda r: r["admission_rate"], reverse=True)
-        return rows
-
-    def get_best_critic_patterns(self) -> List[str]:
-        """Return formula patterns the critic loved that were also admitted."""
-        return list(self._best_critic_patterns[-20:])
-
-    def get_blind_spots(self) -> Dict[str, List[str]]:
-        """Detect operator families that no specialist is proposing.
-
-        Returns
-        -------
-        dict[str, list[str]]
-            ``"underused_families"``: operator families with low proposal count.
-            ``"overused_families"``: operator families with disproportionate use.
-        """
-        from factorminer.agent.critic import _OP_CATEGORIES, _extract_operators
-
-        family_counts: Counter = Counter()
-        total_proposals = 0
-
-        for history in self._proposal_history.values():
-            for formula, _ in history:
-                ops = _extract_operators(formula)
-                for op in ops:
-                    family = _OP_CATEGORIES.get(op, "other")
-                    family_counts[family] += 1
-                total_proposals += 1
-
-        if total_proposals == 0:
-            return {
-                "underused_families": self._ALL_OP_FAMILIES,
-                "overused_families": [],
-            }
-
-        avg_count = total_proposals / len(self._ALL_OP_FAMILIES)
-        underused = [
-            f for f in self._ALL_OP_FAMILIES
-            if family_counts.get(f, 0) < avg_count * 0.4
-        ]
-        overused = [
-            f for f in self._ALL_OP_FAMILIES
-            if family_counts.get(f, 0) > avg_count * 2.5
-        ]
-        return {"underused_families": underused, "overused_families": overused}
-
-    def get_memory_summary_for_specialist(self, specialist_name: str) -> str:
-        """Return a brief performance summary for a specific specialist."""
-        history = self._proposal_history.get(specialist_name, [])
-        if not history:
-            return f"{specialist_name}: no history yet."
-        proposed = len(history)
-        admitted = sum(1 for _, a in history if a)
-        rate = admitted / proposed
-        return (
-            f"{specialist_name}: {proposed} proposed, {admitted} admitted "
-            f"({rate:.1%} rate)."
-        )
-
-    @property
-    def total_rounds(self) -> int:
-        return len(self._rounds)
-
-
-# ---------------------------------------------------------------------------
-# DebateOrchestrator -- full pipeline
-# ---------------------------------------------------------------------------
-
-class DebateOrchestrator:
-    """Orchestrates the full multi-agent FactorMAD debate cycle.
-
-    Flow per round:
-    1. All specialists generate proposals (optionally in parallel).
-    2. Merge all proposals into a single pool.
-    3. SymPy algebraic deduplication (optional).
-    4. Critic multi-dimensional pre-scoring.
-    5. Top-fraction selection for expensive IC evaluation.
-    6. Return structured ``DebateResult``.
-
-    Parameters
-    ----------
-    specialists : list[SpecialistAgent]
-        Specialist agent instances.
-    critic : CriticAgent
-        Critic agent for pre-filtering.
-    canonicalizer : FormulaCanonicalizer or None
-        Optional SymPy canonicalizer for algebraic deduplication.
-    parallel_specialists : bool
-        Whether to run specialists concurrently.
-    max_workers : int
-        Max thread pool workers when parallel is enabled.
-    """
-
-    def __init__(
-        self,
-        specialists: List[SpecialistAgent],
-        critic: CriticAgent,
-        canonicalizer: Optional[Any] = None,
-        parallel_specialists: bool = True,
-        max_workers: int = 4,
-    ) -> None:
-        self.specialists = specialists
-        self.critic = critic
-        self.canonicalizer = canonicalizer
-        self.parallel_specialists = parallel_specialists
-        self.max_workers = max_workers
-
-    def run_debate_round(
-        self,
-        n_per_specialist: int = 15,
-        memory_signal: Optional[Dict[str, Any]] = None,
-        library_diagnostics: Optional[Dict[str, Any]] = None,
-        regime_context: str = "",
-        forbidden_patterns: Optional[List[str]] = None,
-        existing_factors: Optional[List[str]] = None,
-    ) -> DebateResult:
-        """Run one full debate round and return structured results.
-
-        Parameters
-        ----------
-        n_per_specialist : int
-            Number of proposals to request from each specialist.
-        memory_signal : dict or None
-            Experience memory priors.
-        library_diagnostics : dict or None
-            Current library state.
-        regime_context : str
-            Current market regime description.
-        forbidden_patterns : list[str] or None
-            Global forbidden structural patterns.
-        existing_factors : list[str] or None
-            Formulas already in the library.
-
-        Returns
-        -------
-        DebateResult
-            Full structured result including all proposals, dedup, and
-            critic scores.
-        """
-        memory_signal = memory_signal or {}
-        library_diagnostics = library_diagnostics or {}
-        forbidden_patterns = forbidden_patterns or []
-        existing_factors = normalize_factor_references(existing_factors)
-
-        # Step 1: Specialist generation
-        if self.parallel_specialists and len(self.specialists) > 1:
-            specialist_proposals = self._generate_parallel(
-                n_per_specialist=n_per_specialist,
-                memory_signal=memory_signal,
-                library_diagnostics=library_diagnostics,
-                regime_context=regime_context,
-                forbidden_patterns=forbidden_patterns,
-                existing_factors=existing_factors,
-            )
-        else:
-            specialist_proposals: Dict[str, List[str]] = {}
-            for spec in self.specialists:
-                formulas = spec.generate_proposals(
-                    n_proposals=n_per_specialist,
-                    memory_signal=memory_signal,
-                    library_diagnostics=library_diagnostics,
-                    regime_context=regime_context,
-                    forbidden_patterns=forbidden_patterns,
-                    existing_factors=existing_factors,
-                )
-                specialist_proposals[spec.name] = formulas
-                logger.info(
-                    "Specialist %s: %d proposals", spec.name, len(formulas)
-                )
-
-        # Step 2: Merge all proposals
-        all_proposals: List[str] = []
-        formula_to_specialist: Dict[str, str] = {}
-        for spec_name, formulas in specialist_proposals.items():
-            for f in formulas:
-                if f not in formula_to_specialist:
-                    all_proposals.append(f)
-                    formula_to_specialist[f] = spec_name
-
-        logger.info(
-            "Debate round: %d total proposals from %d specialists",
-            len(all_proposals),
-            len(self.specialists),
-        )
-
-        # Step 3: SymPy deduplication
-        after_dedup = self._deduplicate(all_proposals)
-        n_removed = len(all_proposals) - len(after_dedup)
-        logger.info(
-            "Deduplication: removed %d algebraic duplicates (%d remain)",
-            n_removed,
-            len(after_dedup),
-        )
-
-        # Step 4: Build CandidateFactor proposals for critic
-        proposals_cf: Dict[str, List[CandidateFactor]] = {}
-        for formula in after_dedup:
-            spec_name = formula_to_specialist.get(formula, "unknown")
-            from factorminer.agent.output_parser import _try_build_candidate
-            existing_count = len(proposals_cf.get(spec_name, []))
-            cf = _try_build_candidate(
-                f"{spec_name.lower()}_factor_{existing_count + 1}",
-                formula,
-            )
-            proposals_cf.setdefault(spec_name, []).append(cf)
-
-        # Step 5: Critic scoring
-        mem_str = _flatten_memory_signal(memory_signal)
-        critic_scores = self.critic._score_proposals(
-            proposals=proposals_cf,
-            existing_factors=existing_factors,
-            memory_signal=mem_str,
-            regime_context=regime_context,
-        )
-
-        # Step 6: Collect kept formulas
-        after_critic = [cs.formula for cs in critic_scores if cs.keep]
-        logger.info(
-            "Critic pre-filter: %d/%d candidates kept (keep=True)",
-            len(after_critic),
-            len(after_dedup),
-        )
-
-        success_rates = {spec.name: spec.success_rate for spec in self.specialists}
-        debate_stats = {
-            "n_proposals": len(all_proposals),
-            "n_after_dedup": len(after_dedup),
-            "n_after_critic": len(after_critic),
-            "n_duplicates_removed": n_removed,
-            "specialist_counts": {
-                name: len(formulas)
-                for name, formulas in specialist_proposals.items()
-            },
-        }
-
-        return DebateResult(
-            all_proposals=all_proposals,
-            after_dedup=after_dedup,
-            after_critic=after_critic,
-            critic_scores=critic_scores,
-            specialist_proposals=specialist_proposals,
-            specialist_success_rates=success_rates,
-            debate_stats=debate_stats,
-        )
-
-    def _generate_parallel(
-        self,
-        n_per_specialist: int,
-        memory_signal: Dict[str, Any],
-        library_diagnostics: Dict[str, Any],
-        regime_context: str,
-        forbidden_patterns: List[str],
-        existing_factors: List[str],
-    ) -> Dict[str, List[str]]:
-        """Generate from all specialists concurrently using a thread pool."""
-        results: Dict[str, List[str]] = {}
-
-        def _run_specialist(spec: SpecialistAgent) -> tuple:
-            formulas = spec.generate_proposals(
-                n_proposals=n_per_specialist,
-                memory_signal=memory_signal,
-                library_diagnostics=library_diagnostics,
-                regime_context=regime_context,
-                forbidden_patterns=forbidden_patterns,
-                existing_factors=existing_factors,
-            )
-            return spec.name, formulas
-
-        n_workers = min(self.max_workers, len(self.specialists))
-        with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
-            futures = {
-                executor.submit(_run_specialist, spec): spec.name
-                for spec in self.specialists
-            }
-            for future in concurrent.futures.as_completed(futures):
-                spec_name = futures[future]
-                try:
-                    name, formulas = future.result()
-                    results[name] = formulas
-                    logger.info(
-                        "Specialist %s (parallel): %d proposals",
-                        name,
-                        len(formulas),
-                    )
-                except Exception as exc:
-                    logger.warning(
-                        "Specialist %s parallel generation failed: %s",
-                        spec_name,
-                        exc,
-                    )
-                    results[spec_name] = []
-
-        return results
-
-    def _deduplicate(self, formulas: List[str]) -> List[str]:
-        """Remove algebraic duplicates using SymPy canonicalizer if available."""
-        if self.canonicalizer is None:
-            seen: set = set()
-            unique: List[str] = []
-            for f in formulas:
-                if f not in seen:
-                    unique.append(f)
-                    seen.add(f)
-            return unique
-
-        from factorminer.core.parser import try_parse
-        seen_hashes: set = set()
-        unique: List[str] = []
-        for formula in formulas:
-            tree = try_parse(formula)
-            if tree is None:
-                if formula not in {u for u in unique}:
-                    unique.append(formula)
-                continue
-            try:
-                canon_hash = self.canonicalizer.canonicalize(tree)
-            except Exception:
-                canon_hash = formula
-            if canon_hash not in seen_hashes:
-                unique.append(formula)
-                seen_hashes.add(canon_hash)
-
-        return unique
-
-
-# ---------------------------------------------------------------------------
-# DebateGenerator -- drop-in replacement for FactorGenerator
-# ---------------------------------------------------------------------------
-
-class DebateGenerator:
-    """Multi-agent debate-based factor generator (drop-in for FactorGenerator).
-
-    Uses the full FactorMAD pipeline: multiple specialist proposers,
-    algebraic deduplication, and multi-dimensional critic pre-filtering.
-
-    Parameters
-    ----------
-    llm_provider : LLMProvider
-        LLM backend shared across all specialists and the critic.
-    debate_config : DebateConfig or None
-        Pipeline configuration.  Uses defaults if ``None``.
-    prompt_builder : PromptBuilder or None
-        Optional base prompt builder (its system prompt is used as the
-        base for specialist prompt builders).
-    """
-
-    def __init__(
-        self,
-        llm_provider: LLMProvider,
-        debate_config: Optional[DebateConfig] = None,
-        prompt_builder: Optional[PromptBuilder] = None,
-    ) -> None:
-        self.llm_provider = llm_provider
-        self.config = debate_config or DebateConfig()
-
-        base_system_prompt = (
-            prompt_builder.system_prompt if prompt_builder else None
-        )
-
-        # Build SpecialistAgent instances
-        self._specialist_agents: List[SpecialistAgent] = []
-        self._specialist_generators: Dict[str, FactorGenerator] = {}
-
-        for spec in self.config.specialists:
-            agent = SpecialistAgent(
-                config=spec,
-                llm=self.llm_provider,
-                base_system_prompt=base_system_prompt,
-            )
-            self._specialist_agents.append(agent)
-
-            specialist_pb = SpecialistPromptBuilder(
-                specialist_config=spec,
-                base_system_prompt=base_system_prompt,
-            )
-            gen = FactorGenerator(
-                llm_provider=self.llm_provider,
-                prompt_builder=specialist_pb,
-                temperature=spec.temperature,
-            )
-            self._specialist_generators[spec.name] = gen
-
-        # Build critic
-        self._critic: Optional[CriticAgent] = None
-        if self.config.enable_critic:
-            self._critic = CriticAgent(
-                llm_provider=self.llm_provider,
-                temperature=self.config.critic_temperature,
-            )
-
-        # Canonicalizer for deduplication
-        self._canonicalizer = None
-        if self.config.enable_deduplication:
-            try:
-                from factorminer.core.canonicalizer import FormulaCanonicalizer
-                self._canonicalizer = FormulaCanonicalizer()
-            except Exception as exc:
-                logger.warning(
-                    "Could not initialise FormulaCanonicalizer: %s. "
-                    "Falling back to string dedup.",
-                    exc,
-                )
-
-        # Debate orchestrator
-        if self._critic is not None:
-            self._orchestrator: Optional[DebateOrchestrator] = DebateOrchestrator(
-                specialists=self._specialist_agents,
-                critic=self._critic,
-                canonicalizer=self._canonicalizer,
-                parallel_specialists=self.config.parallel_specialists,
-                max_workers=self.config.max_parallel_workers,
-            )
-        else:
-            self._orchestrator = None
-
-        # Debate memory
-        self._debate_memory: Optional[DebateMemory] = None
-        if self.config.enable_debate_memory:
-            specialist_names = [s.name for s in self.config.specialists]
-            self._debate_memory = DebateMemory(specialist_names=specialist_names)
-
-        self._last_debate_result: Optional[DebateResult] = None
-        self._generation_count = 0
-
-    def generate_batch(
-        self,
-        memory_signal: Optional[Dict[str, Any]] = None,
-        library_state: Optional[Dict[str, Any]] = None,
-        batch_size: int = 40,
-    ) -> List[CandidateFactor]:
-        """Generate a batch of candidate factors via multi-agent debate.
-
-        Signature is identical to ``FactorGenerator.generate_batch``
-        so this class is a true drop-in replacement.
-
-        Parameters
-        ----------
-        memory_signal : dict or None
-            Memory priors for prompt injection.
-        library_state : dict or None
-            Current factor library state.
-        batch_size : int
-            Target number of candidates to return.
-
-        Returns
-        -------
-        list[CandidateFactor]
-            Ranked candidate factors.
-        """
-        memory_signal = memory_signal or {}
-        library_state = library_state or {}
-
-        self._generation_count += 1
-        batch_id = self._generation_count
-
-        logger.info(
-            "Debate batch #%d: %d specialists, critic=%s, per_specialist=%d",
-            batch_id,
-            len(self._specialist_agents),
-            self.config.enable_critic,
-            self.config.candidates_per_specialist,
-        )
-
-        existing_factors = normalize_factor_references(
-            library_state.get("recent_admissions", [])
-        )
-        regime_context = str(memory_signal.get("regime_context", ""))
-
-        if self._orchestrator is not None:
-            debate_result = self._orchestrator.run_debate_round(
-                n_per_specialist=self.config.candidates_per_specialist,
-                memory_signal=memory_signal,
-                library_diagnostics=library_state,
-                regime_context=regime_context,
-                existing_factors=existing_factors,
-            )
-            self._last_debate_result = debate_result
-
-            if self._debate_memory is not None:
-                self._debate_memory.record_round(debate_result)
-
-            result = self._debate_result_to_candidates(
-                debate_result=debate_result,
-                top_k=min(batch_size, self.config.top_k_after_critic),
-            )
-
-        else:
-            # No critic: run specialist generators and merge
-            proposals: Dict[str, List[CandidateFactor]] = {}
-            for spec_name, generator in self._specialist_generators.items():
-                candidates = generator.generate_batch(
-                    memory_signal=memory_signal,
-                    library_state=library_state,
-                    batch_size=self.config.candidates_per_specialist,
-                )
-                proposals[spec_name] = candidates
-                logger.info(
-                    "Specialist %s produced %d candidates", spec_name, len(candidates)
-                )
-
-            result = []
-            seen_formulas: set = set()
-            for spec_name, candidates in proposals.items():
-                for c in candidates:
-                    if c.formula not in seen_formulas:
-                        result.append(c)
-                        seen_formulas.add(c.formula)
-
-            result = result[:batch_size]
-            # Store a minimal DebateResult for consistency
-            specialist_proposals = {
-                name: [c.formula for c in cands]
-                for name, cands in proposals.items()
-            }
-            self._last_debate_result = DebateResult(
-                all_proposals=[f for fl in specialist_proposals.values() for f in fl],
-                after_dedup=[c.formula for c in result],
-                after_critic=[c.formula for c in result],
-                critic_scores=[],
-                specialist_proposals=specialist_proposals,
-                specialist_success_rates={},
-                debate_stats={"n_proposals": len(result)},
-            )
-
-        result = self._tag_specialist_source_from_agents(result)
-
-        logger.info(
-            "Debate batch #%d complete: returning %d candidates",
-            batch_id,
-            len(result),
-        )
-        return result
-
-    # ------------------------------------------------------------------
-    # Public inspection helpers
-    # ------------------------------------------------------------------
-
-    @property
-    def last_debate_result(self) -> Optional[DebateResult]:
-        """The ``DebateResult`` from the most recent ``generate_batch`` call."""
-        return self._last_debate_result
-
-    @property
-    def debate_memory(self) -> Optional[DebateMemory]:
-        """The ``DebateMemory`` tracking history across rounds."""
-        return self._debate_memory
-
-    def get_specialist_leaderboard(self) -> Optional[List[Dict[str, Any]]]:
-        """Return specialist admission leaderboard if memory is enabled."""
-        if self._debate_memory is not None:
-            return self._debate_memory.get_specialist_leaderboard()
-        return None
-
-    def get_blind_spots(self) -> Optional[Dict[str, List[str]]]:
-        """Return operator family blind spots if memory is enabled."""
-        if self._debate_memory is not None:
-            return self._debate_memory.get_blind_spots()
-        return None
-
-    def update_specialist_admissions(
-        self,
-        admitted_formulas: List[str],
-        rejected_formulas: Optional[List[str]] = None,
-        rejection_reasons: Optional[List[str]] = None,
-    ) -> None:
-        """Feed evaluation results back to specialist agents and debate memory.
-
-        Should be called after IC evaluation to close the feedback loop.
-
-        Parameters
-        ----------
-        admitted_formulas : list[str]
-            Formula strings admitted to the library.
-        rejected_formulas : list[str] or None
-            Formula strings that failed IC evaluation.
-        rejection_reasons : list[str] or None
-            Reasons for rejection (parallel to ``rejected_formulas``).
-        """
-        if self._last_debate_result is None:
-            return
-
-        rejected_formulas = rejected_formulas or []
-        rejection_reasons = rejection_reasons or []
-
-        for spec_agent in self._specialist_agents:
-            spec_admitted = [
-                f for f in admitted_formulas
-                if f in self._last_debate_result.specialist_proposals.get(
-                    spec_agent.name, []
-                )
-            ]
-            spec_rejected = [
-                f for f in rejected_formulas
-                if f in self._last_debate_result.specialist_proposals.get(
-                    spec_agent.name, []
-                )
-            ]
-            spec_reasons: List[str] = []
-            for f in spec_rejected:
-                try:
-                    idx = rejected_formulas.index(f)
-                    spec_reasons.append(
-                        rejection_reasons[idx] if idx < len(rejection_reasons)
-                        else "unknown"
-                    )
-                except ValueError:
-                    spec_reasons.append("unknown")
-
-            spec_agent.update_domain_memory(
-                admitted=spec_admitted,
-                rejected=spec_rejected,
-                reasons=spec_reasons,
-            )
-
-        if self._debate_memory is not None and self._last_debate_result is not None:
-            self._debate_memory.record_round(
-                debate_result=self._last_debate_result,
-                admissions=admitted_formulas,
-            )
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    def _debate_result_to_candidates(
-        self,
-        debate_result: DebateResult,
-        top_k: int,
-    ) -> List[CandidateFactor]:
-        """Convert DebateResult critic scores into CandidateFactor objects."""
-        from factorminer.agent.output_parser import _try_build_candidate
-
-        kept_scores = [cs for cs in debate_result.critic_scores if cs.keep]
-        kept_scores.sort(key=lambda cs: cs.composite_score, reverse=True)
-        kept_scores = kept_scores[:top_k]
-
-        result: List[CandidateFactor] = []
-        seen_formulas: set = set()
-
-        for cs in kept_scores:
-            if cs.formula in seen_formulas:
-                continue
-            cf = _try_build_candidate(cs.factor_name, cs.formula)
-            if cf.is_valid:
-                cf.category = f"specialist:{cs.source_specialist}/{cf.category}"
-                result.append(cf)
-                seen_formulas.add(cs.formula)
-
-        if not result:
-            for formula in debate_result.after_critic[:top_k]:
-                if formula in seen_formulas:
-                    continue
-                cf = _try_build_candidate(
-                    f"debate_factor_{len(result)+1}", formula
-                )
-                if cf.is_valid:
-                    result.append(cf)
-                    seen_formulas.add(formula)
-
-        return result
-
-    def _tag_specialist_source_from_agents(
-        self,
-        candidates: List[CandidateFactor],
-    ) -> List[CandidateFactor]:
-        """Tag candidate source if not already embedded in category."""
-        for c in candidates:
-            if not c.category.startswith("specialist:"):
-                if self._last_debate_result:
-                    for spec_name, formulas in (
-                        self._last_debate_result.specialist_proposals.items()
-                    ):
-                        if c.formula in formulas:
-                            c.category = f"specialist:{spec_name}/{c.category}"
-                            break
-        return candidates
-
-    # ------------------------------------------------------------------
-    # Legacy static helpers (backward compatibility)
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _scores_to_candidates(
-        scores: List[CriticScore],
-        proposals: Dict[str, List[CandidateFactor]],
-    ) -> List[CandidateFactor]:
-        """Map CriticScore objects back to CandidateFactor instances."""
-        lookup: Dict[str, CandidateFactor] = {}
-        for candidates in proposals.values():
-            for c in candidates:
-                lookup[c.name] = c
-
-        result: List[CandidateFactor] = []
-        seen: set = set()
-        for score in scores:
-            candidate = lookup.get(score.factor_name)
-            if candidate is not None and score.factor_name not in seen:
-                result.append(candidate)
-                seen.add(score.factor_name)
-
-        return result
-
-    @staticmethod
-    def _tag_specialist_source(
-        candidates: List[CandidateFactor],
-        proposals: Dict[str, List[CandidateFactor]],
-    ) -> List[CandidateFactor]:
-        """Add specialist source information to each candidate's category."""
-        source_map: Dict[str, str] = {}
-        for spec_name, spec_candidates in proposals.items():
-            for c in spec_candidates:
-                source_map[c.name] = spec_name
-
-        for c in candidates:
-            spec_name = source_map.get(c.name, "unknown")
-            if not c.category.startswith("specialist:"):
-                c.category = f"specialist:{spec_name}/{c.category}"
-
-        return candidates
-
-
-# ---------------------------------------------------------------------------
-# Utility
-# ---------------------------------------------------------------------------
-
-def _flatten_memory_signal(memory_signal: Dict[str, Any]) -> str:
-    """Flatten a memory signal dict to a compact string."""
-    parts: List[str] = []
-    for key in (
-        "recommended_directions", "strategic_insights",
-        "complementary_patterns", "prompt_text",
-    ):
-        val = memory_signal.get(key)
-        if isinstance(val, list):
-            parts.extend(str(v) for v in val)
-        elif isinstance(val, str) and val:
-            parts.append(val)
-    return " ".join(parts)
diff --git a/src/factorminer/factorminer/agent/factor_generator.py b/src/factorminer/factorminer/agent/factor_generator.py
deleted file mode 100644
index 950bd7d..0000000
--- a/src/factorminer/factorminer/agent/factor_generator.py
+++ /dev/null
@@ -1,236 +0,0 @@
-"""Main factor generation agent using LLM guided by memory priors.
-
-Orchestrates the prompt construction, LLM invocation, output parsing,
-and retry logic for a single batch of factor candidates.
-"""
-
-from __future__ import annotations
-
-import logging
-import time
-from typing import Any, Dict, List, Optional
-
-from src.factorminer.factorminer.agent.llm_interface import LLMProvider
-from src.factorminer.factorminer.agent.output_parser import CandidateFactor, parse_llm_output
-from src.factorminer.factorminer.agent.prompt_builder import PromptBuilder
-
-logger = logging.getLogger(__name__)
-
-
-class FactorGenerator:
-    """LLM-based factor generation agent.
-
-    Generates batches of candidate factors by constructing prompts that
-    inject experience memory priors, calling an LLM provider, and parsing
-    the output into validated CandidateFactor objects.
-
-    Parameters
-    ----------
-    llm_provider : LLMProvider
-        The LLM backend to use for text generation.
-    prompt_builder : PromptBuilder
-        Builds system and user prompts.
-    temperature : float
-        Default sampling temperature.
-    max_tokens : int
-        Default max response tokens.
-    """
-
-    def __init__(
-        self,
-        llm_provider: LLMProvider,
-        prompt_builder: Optional[PromptBuilder] = None,
-        temperature: float = 0.8,
-        max_tokens: int = 4096,
-    ) -> None:
-        self.llm_provider = llm_provider
-        self.prompt_builder = prompt_builder or PromptBuilder()
-        self.temperature = temperature
-        self.max_tokens = max_tokens
-        self._generation_count = 0
-
-    def generate_batch(
-        self,
-        memory_signal: Optional[Dict[str, Any]] = None,
-        library_state: Optional[Dict[str, Any]] = None,
-        batch_size: int = 40,
-    ) -> List[CandidateFactor]:
-        """Generate a batch of candidate factors using LLM guided by memory priors.
-
-        Steps:
-        1. Build prompt with memory signal injection.
-        2. Call LLM to generate candidates.
-        3. Parse and validate each candidate.
-        4. Retry failed parses if any.
-        5. Return list of valid CandidateFactor objects.
-
-        Parameters
-        ----------
-        memory_signal : dict or None
-            Memory priors to inject into the prompt. Keys:
-            - ``"recommended_directions"`` : list[str]
-            - ``"forbidden_directions"`` : list[str]
-            - ``"strategic_insights"`` : list[str]
-            - ``"recent_rejections"`` : list[dict]
-        library_state : dict or None
-            Current library state. Keys:
-            - ``"size"`` : int
-            - ``"target_size"`` : int
-            - ``"recent_admissions"`` : list[str]
-            - ``"domain_saturation"`` : dict[str, float]
-        batch_size : int
-            Number of candidates to request per batch.
-
-        Returns
-        -------
-        list[CandidateFactor]
-            All valid candidate factors (those with successfully parsed
-            expression trees).
-        """
-        memory_signal = memory_signal or {}
-        library_state = library_state or {}
-
-        self._generation_count += 1
-        batch_id = self._generation_count
-
-        logger.info(
-            "Generating batch #%d: size=%d, provider=%s",
-            batch_id,
-            batch_size,
-            self.llm_provider.provider_name,
-        )
-
-        # 1. Build prompts
-        system_prompt = self.prompt_builder.system_prompt
-        user_prompt = self.prompt_builder.build_user_prompt(
-            memory_signal=memory_signal,
-            library_state=library_state,
-            batch_size=batch_size,
-        )
-
-        # 2. Call LLM
-        t0 = time.monotonic()
-        raw_output = self.llm_provider.generate(
-            system_prompt=system_prompt,
-            user_prompt=user_prompt,
-            temperature=self.temperature,
-            max_tokens=self.max_tokens,
-        )
-        elapsed = time.monotonic() - t0
-        logger.info("LLM response received in %.1fs (%d chars)", elapsed, len(raw_output))
-
-        # 3. Parse output
-        candidates, failed_lines = parse_llm_output(raw_output)
-
-        valid = [c for c in candidates if c.is_valid]
-        invalid = [c for c in candidates if not c.is_valid]
-
-        logger.info(
-            "Batch #%d initial parse: %d valid, %d invalid, %d unparseable lines",
-            batch_id,
-            len(valid),
-            len(invalid),
-            len(failed_lines),
-        )
-
-        # 4. Retry failed parses
-        if failed_lines or invalid:
-            retry_input = failed_lines + [c.formula for c in invalid if c.formula]
-            retried = self._retry_failed_parses(retry_input, attempts=2)
-            if retried:
-                # Deduplicate by formula
-                existing_formulas = {c.formula for c in valid}
-                for c in retried:
-                    if c.formula not in existing_formulas:
-                        valid.append(c)
-                        existing_formulas.add(c.formula)
-                logger.info(
-                    "Batch #%d after retry: %d total valid candidates",
-                    batch_id,
-                    len(valid),
-                )
-
-        # 5. Log summary
-        if valid:
-            categories = {}
-            for c in valid:
-                categories[c.category] = categories.get(c.category, 0) + 1
-            logger.info(
-                "Batch #%d categories: %s",
-                batch_id,
-                ", ".join(f"{k}={v}" for k, v in sorted(categories.items())),
-            )
-
-        return valid
-
-    def _retry_failed_parses(
-        self,
-        failed: List[str],
-        attempts: int = 2,
-    ) -> List[CandidateFactor]:
-        """Retry parsing failed outputs with a repair prompt.
-
-        Asks the LLM to fix malformed formulas by providing the broken
-        expressions and asking for corrected versions.
-
-        Parameters
-        ----------
-        failed : list[str]
-            Original text lines or formulas that failed to parse.
-        attempts : int
-            Max number of retry rounds.
-
-        Returns
-        -------
-        list[CandidateFactor]
-            Successfully parsed candidates from retries.
-        """
-        if not failed:
-            return []
-
-        # Limit retries to avoid excessive API calls
-        failed = failed[:15]
-        recovered: List[CandidateFactor] = []
-
-        for attempt in range(1, attempts + 1):
-            if not failed:
-                break
-
-            repair_prompt = (
-                "The following factor formulas failed to parse. "
-                "Fix each one so it uses ONLY valid operators and features "
-                "from the library. Return them in the same numbered format:\n"
-                "<number>. <name>: <corrected_formula>\n\n"
-                "Broken formulas:\n"
-                + "\n".join(f"  {i+1}. {f}" for i, f in enumerate(failed))
-                + "\n\nFix all syntax errors, unknown operators, and invalid "
-                "feature names. Every formula must be a valid nested function "
-                "call using only operators from the library."
-            )
-
-            try:
-                raw = self.llm_provider.generate(
-                    system_prompt=self.prompt_builder.system_prompt,
-                    user_prompt=repair_prompt,
-                    temperature=max(0.3, self.temperature - 0.3),
-                    max_tokens=self.max_tokens,
-                )
-            except Exception as e:
-                logger.warning("Retry attempt %d failed: %s", attempt, e)
-                break
-
-            candidates, still_failed = parse_llm_output(raw)
-            new_valid = [c for c in candidates if c.is_valid]
-            recovered.extend(new_valid)
-
-            # Update failed list for next attempt
-            failed = still_failed + [c.formula for c in candidates if not c.is_valid]
-
-            logger.debug(
-                "Retry attempt %d: recovered %d, still failing %d",
-                attempt,
-                len(new_valid),
-                len(failed),
-            )
-
-        return recovered
diff --git a/src/factorminer/factorminer/agent/llm_interface.py b/src/factorminer/factorminer/agent/llm_interface.py
deleted file mode 100644
index ba4ea65..0000000
--- a/src/factorminer/factorminer/agent/llm_interface.py
+++ /dev/null
@@ -1,365 +0,0 @@
-"""Abstract LLM interface supporting multiple providers.
-
-Provides a unified API for generating text completions across OpenAI,
-Anthropic, Google (Gemini), and a deterministic mock provider for testing.
-"""
-
-from __future__ import annotations
-
-import logging
-import os
-from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional
-
-logger = logging.getLogger(__name__)
-
-
-class LLMProvider(ABC):
-    """Abstract base for LLM text-generation providers."""
-
-    @abstractmethod
-    def generate(
-        self,
-        system_prompt: str,
-        user_prompt: str,
-        temperature: float = 0.8,
-        max_tokens: int = 4096,
-    ) -> str:
-        """Generate a text completion.
-
-        Parameters
-        ----------
-        system_prompt : str
-            System-level instructions (role, rules, operator library, etc.).
-        user_prompt : str
-            Per-iteration user prompt (memory signal, library state, etc.).
-        temperature : float
-            Sampling temperature; higher = more creative.
-        max_tokens : int
-            Maximum tokens in the response.
-
-        Returns
-        -------
-        str
-            Raw text response from the model.
-        """
-
-    @property
-    @abstractmethod
-    def provider_name(self) -> str:
-        """Human-readable provider name."""
-
-
-class OpenAIProvider(LLMProvider):
-    """OpenAI API provider (GPT-4, GPT-4o, etc.)."""
-
-    def __init__(
-        self,
-        model: str = "gpt-4o",
-        api_key: Optional[str] = None,
-    ) -> None:
-        self.model = model
-        self.api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
-        self._client: Any = None
-
-    def _get_client(self) -> Any:
-        if self._client is None:
-            try:
-                from openai import OpenAI
-            except ImportError:
-                raise ImportError(
-                    "openai package is required for OpenAIProvider. "
-                    "Install with: pip install openai"
-                )
-            self._client = OpenAI(api_key=self.api_key)
-        return self._client
-
-    def generate(
-        self,
-        system_prompt: str,
-        user_prompt: str,
-        temperature: float = 0.8,
-        max_tokens: int = 4096,
-    ) -> str:
-        client = self._get_client()
-        logger.debug("OpenAI request: model=%s temp=%.2f", self.model, temperature)
-        response = client.chat.completions.create(
-            model=self.model,
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_prompt},
-            ],
-            temperature=temperature,
-            max_tokens=max_tokens,
-        )
-        text = response.choices[0].message.content or ""
-        logger.debug("OpenAI response: %d chars", len(text))
-        return text
-
-    @property
-    def provider_name(self) -> str:
-        return f"openai/{self.model}"
-
-
-class AnthropicProvider(LLMProvider):
-    """Anthropic Claude API provider with adaptive thinking support."""
-
-    def __init__(
-        self,
-        model: str = "claude-sonnet-4-6",
-        api_key: Optional[str] = None,
-        use_thinking: bool = True,
-        effort: str = "max",
-    ) -> None:
-        self.model = model
-        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY", "")
-        self.use_thinking = use_thinking
-        self.effort = effort
-        self._client: Any = None
-
-    def _get_client(self) -> Any:
-        if self._client is None:
-            try:
-                import anthropic
-            except ImportError:
-                raise ImportError(
-                    "anthropic package is required for AnthropicProvider. "
-                    "Install with: pip install anthropic"
-                )
-            self._client = anthropic.Anthropic(api_key=self.api_key)
-        return self._client
-
-    def generate(
-        self,
-        system_prompt: str,
-        user_prompt: str,
-        temperature: float = 1,
-        max_tokens: int = 32000,
-    ) -> str:
-        client = self._get_client()
-        logger.debug("Anthropic request: model=%s thinking=%s effort=%s",
-                      self.model, self.use_thinking, self.effort)
-
-        kwargs: dict = {
-            "model": self.model,
-            "system": system_prompt,
-            "messages": [{"role": "user", "content": user_prompt}],
-            "max_tokens": max_tokens,
-        }
-
-        if self.use_thinking:
-            kwargs["thinking"] = {"type": "adaptive"}
-            kwargs["temperature"] = 1  # Required for thinking mode
-            kwargs["output_config"] = {"effort": self.effort}
-        else:
-            kwargs["temperature"] = temperature
-
-        response = client.messages.create(**kwargs)
-
-        # Extract text from response, skipping thinking blocks
-        text_parts = []
-        for block in response.content:
-            if hasattr(block, "text"):
-                text_parts.append(block.text)
-        text = "\n".join(text_parts) if text_parts else ""
-        logger.debug("Anthropic response: %d chars", len(text))
-        return text
-
-    @property
-    def provider_name(self) -> str:
-        return f"anthropic/{self.model}"
-
-
-class GoogleProvider(LLMProvider):
-    """Google Gemini API provider (paper uses Gemini 3.0 Flash)."""
-
-    def __init__(
-        self,
-        model: str = "gemini-2.0-flash",
-        api_key: Optional[str] = None,
-    ) -> None:
-        self.model = model
-        self.api_key = api_key or os.environ.get("GOOGLE_API_KEY", "")
-        self._client: Any = None
-
-    def _get_client(self) -> Any:
-        if self._client is None:
-            try:
-                import google.generativeai as genai
-            except ImportError:
-                raise ImportError(
-                    "google-generativeai package is required for GoogleProvider. "
-                    "Install with: pip install google-generativeai"
-                )
-            genai.configure(api_key=self.api_key)
-            self._client = genai.GenerativeModel(
-                self.model,
-                generation_config={"max_output_tokens": 8192},
-            )
-        return self._client
-
-    def generate(
-        self,
-        system_prompt: str,
-        user_prompt: str,
-        temperature: float = 0.8,
-        max_tokens: int = 4096,
-    ) -> str:
-        client = self._get_client()
-        logger.debug("Google request: model=%s temp=%.2f", self.model, temperature)
-        combined = f"{system_prompt}\n\n---\n\n{user_prompt}"
-        response = client.generate_content(
-            combined,
-            generation_config={
-                "temperature": temperature,
-                "max_output_tokens": max_tokens,
-            },
-        )
-        text = response.text if response.text else ""
-        logger.debug("Google response: %d chars", len(text))
-        return text
-
-    @property
-    def provider_name(self) -> str:
-        return f"google/{self.model}"
-
-
-class MockProvider(LLMProvider):
-    """Deterministic provider for testing without API calls.
-
-    Returns predefined factor formulas that exercise diverse operator
-    combinations.  Useful for unit tests and integration testing.
-    """
-
-    MOCK_FACTORS = [
-        ("momentum_reversal", "Neg(CsRank(Delta($close, 5)))"),
-        ("volume_surprise", "CsZScore(Div(Sub($volume, Mean($volume, 20)), Std($volume, 20)))"),
-        ("price_range_ratio", "Div(Sub($high, $low), Add($high, $low))"),
-        ("vwap_deviation", "CsRank(Div(Sub($close, $vwap), $vwap))"),
-        ("return_skew", "Neg(Skew($returns, 20))"),
-        ("intraday_momentum", "CsRank(Div(Sub($close, $open), Sub($high, $low)))"),
-        ("volume_price_corr", "Neg(Corr($volume, $close, 10))"),
-        ("amt_acceleration", "CsZScore(Delta(Mean($amt, 5), 5))"),
-        ("close_high_ratio", "CsRank(Sub(Div($close, TsMax($high, 20)), 1))"),
-        ("smooth_return", "Neg(CsRank(EMA($returns, 10)))"),
-        ("volatility_ratio", "Div(Std($returns, 5), Std($returns, 20))"),
-        ("mean_reversion", "Neg(CsZScore(Div(Sub($close, SMA($close, 20)), SMA($close, 20))))"),
-        ("volume_trend", "CsRank(TsLinRegSlope($volume, 20))"),
-        ("price_position", "CsRank(Div(Sub($close, TsMin($close, 20)), Sub(TsMax($close, 20), TsMin($close, 20))))"),
-        ("amt_volume_div", "CsRank(Neg(Corr(CsRank($amt), CsRank($volume), 10)))"),
-        ("weighted_return", "CsZScore(WMA($returns, 10))"),
-        ("high_low_decay", "Neg(Decay(Div(Sub($high, $low), $close), 10))"),
-        ("residual_vol", "CsRank(Std(Resid($close, $volume, 20), 10))"),
-        ("open_gap", "CsZScore(Div(Sub($open, Delay($close, 1)), Delay($close, 1)))"),
-        ("log_turnover", "Neg(CsRank(Log(Div($amt, $volume))))"),
-        ("beta_momentum", "CsRank(Mul(Beta($returns, $volume, 20), Delta($close, 10)))"),
-        ("rank_reversal", "Neg(CsRank(Sum($returns, 5)))"),
-        ("kurtosis_signal", "CsZScore(Neg(Kurt($returns, 20)))"),
-        ("vwap_trend", "CsRank(TsLinRegSlope(Div($close, $vwap), 20))"),
-        ("adaptive_mean", "CsRank(Div(Sub($close, KAMA($close, 10)), Std($close, 10)))"),
-        ("cumulative_flow", "CsZScore(CsRank(Delta(CumSum(Mul($volume, Sign(Delta($close, 1)))), 5)))"),
-        ("range_breakout", "CsRank(Div(Sub($close, TsMin($low, 10)), Std($close, 10)))"),
-        ("hull_deviation", "Neg(CsRank(Div(Sub($close, HMA($close, 20)), $close)))"),
-        ("conditional_vol", "CsZScore(IfElse(Greater($returns, 0), Std($returns, 10), Neg(Std($returns, 10))))"),
-        ("dema_crossover", "CsRank(Sub(DEMA($close, 5), DEMA($close, 20)))"),
-        ("ts_rank_volume", "Neg(CsRank(TsRank($volume, 20)))"),
-        ("median_price", "CsZScore(Div(Sub($close, Median($close, 20)), Median($close, 20)))"),
-        ("argmax_timing", "CsRank(Neg(TsArgMax($close, 20)))"),
-        ("log_return_sum", "Neg(CsRank(Sum(LogReturn($close, 1), 10)))"),
-        ("price_cov", "CsZScore(Neg(Cov($close, $volume, 20)))"),
-        ("inv_volatility", "CsRank(Inv(Std($returns, 20)))"),
-        ("squared_return", "Neg(CsRank(Mean(Square($returns), 10)))"),
-        ("abs_return_ratio", "CsRank(Div(Abs(Delta($close, 1)), Mean(Abs(Delta($close, 1)), 20)))"),
-        ("quantile_signal", "CsZScore(Quantile($returns, 20, 0.75))"),
-        ("neutralized_mom", "CsNeutralize(Delta($close, 10))"),
-    ]
-
-    def __init__(self, cycle: bool = True) -> None:
-        self._cycle = cycle
-        self._call_count = 0
-
-    def generate(
-        self,
-        system_prompt: str,
-        user_prompt: str,
-        temperature: float = 0.8,
-        max_tokens: int = 4096,
-    ) -> str:
-        # Parse batch_size from user_prompt if present
-        batch_size = 40
-        for line in user_prompt.split("\n"):
-            if "generate" in line.lower() and "candidate" in line.lower():
-                for word in line.split():
-                    if word.isdigit():
-                        batch_size = int(word)
-                        break
-
-        batch_size = min(batch_size, len(self.MOCK_FACTORS))
-
-        start = self._call_count * batch_size
-        if self._cycle:
-            indices = [
-                (start + i) % len(self.MOCK_FACTORS)
-                for i in range(batch_size)
-            ]
-        else:
-            indices = list(range(min(batch_size, len(self.MOCK_FACTORS))))
-
-        self._call_count += 1
-
-        lines = []
-        for idx, factor_idx in enumerate(indices, 1):
-            name, formula = self.MOCK_FACTORS[factor_idx]
-            lines.append(f"{idx}. {name}: {formula}")
-
-        return "\n".join(lines)
-
-    @property
-    def provider_name(self) -> str:
-        return "mock"
-
-
-# ---------------------------------------------------------------------------
-# Factory
-# ---------------------------------------------------------------------------
-
-_PROVIDER_MAP: Dict[str, type] = {
-    "openai": OpenAIProvider,
-    "anthropic": AnthropicProvider,
-    "google": GoogleProvider,
-    "mock": MockProvider,
-}
-
-
-def create_provider(config: Dict[str, Any]) -> LLMProvider:
-    """Factory function to instantiate an LLM provider from config.
-
-    Parameters
-    ----------
-    config : dict
-        Must contain ``"provider"`` key (one of "openai", "anthropic",
-        "google", "mock").  Additional keys are passed as kwargs to the
-        provider constructor:
-        - ``"model"`` : model identifier
-        - ``"api_key"`` : API key (overrides env var)
-
-    Returns
-    -------
-    LLMProvider
-    """
-    provider_name = config.get("provider", "mock")
-    cls = _PROVIDER_MAP.get(provider_name)
-    if cls is None:
-        raise ValueError(
-            f"Unknown LLM provider '{provider_name}'. "
-            f"Available: {sorted(_PROVIDER_MAP.keys())}"
-        )
-
-    kwargs: Dict[str, Any] = {}
-    if "model" in config and provider_name != "mock":
-        kwargs["model"] = config["model"]
-    if "api_key" in config and provider_name != "mock":
-        kwargs["api_key"] = config["api_key"]
-
-    logger.info("Creating LLM provider: %s (kwargs=%s)", provider_name, list(kwargs.keys()))
-    return cls(**kwargs)
diff --git a/src/factorminer/factorminer/agent/output_parser.py b/src/factorminer/factorminer/agent/output_parser.py
deleted file mode 100644
index 2b9f65e..0000000
--- a/src/factorminer/factorminer/agent/output_parser.py
+++ /dev/null
@@ -1,259 +0,0 @@
-"""Parse LLM output into structured CandidateFactor objects.
-
-Handles various output formats from LLMs: numbered lists, JSON,
-markdown code blocks, and raw text.  Validates each formula against
-the expression tree parser.
-"""
-
-from __future__ import annotations
-
-import logging
-import re
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Tuple
-
-from src.factorminer.factorminer.core.expression_tree import ExpressionTree
-from src.factorminer.factorminer.core.parser import parse, try_parse
-from src.factorminer.factorminer.core.types import OperatorType, OPERATOR_REGISTRY
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class CandidateFactor:
-    """A candidate factor parsed from LLM output.
-
-    Attributes
-    ----------
-    name : str
-        Descriptive snake_case name.
-    formula : str
-        DSL formula string.
-    expression_tree : ExpressionTree or None
-        Parsed expression tree (None if parsing failed).
-    category : str
-        Inferred category based on outermost operators.
-    parse_error : str
-        Error message if formula failed to parse.
-    """
-
-    name: str
-    formula: str
-    expression_tree: Optional[ExpressionTree] = None
-    category: str = "unknown"
-    parse_error: str = ""
-
-    @property
-    def is_valid(self) -> bool:
-        return self.expression_tree is not None
-
-
-def _infer_category(formula: str) -> str:
-    """Infer a rough category from the outermost operators in the formula."""
-    lower = formula.lower()
-    # Check for cross-sectional operators at the top
-    if any(op in formula for op in ("CsRank", "CsZScore", "CsDemean", "CsScale", "CsNeutralize", "CsQuantile")):
-        # Look deeper for sub-category
-        if any(op in formula for op in ("Corr", "Cov", "Beta", "Resid")):
-            return "cross_sectional_regression"
-        if any(op in formula for op in ("Delta", "Delay", "Return", "LogReturn")):
-            return "cross_sectional_momentum"
-        if any(op in formula for op in ("Std", "Var", "Skew", "Kurt")):
-            return "cross_sectional_volatility"
-        if any(op in formula for op in ("Mean", "Sum", "EMA", "SMA", "WMA", "DEMA", "HMA", "KAMA")):
-            return "cross_sectional_smoothing"
-        if any(op in formula for op in ("TsLinReg", "TsLinRegSlope")):
-            return "cross_sectional_trend"
-        return "cross_sectional"
-    if any(op in formula for op in ("Corr", "Cov", "Beta", "Resid")):
-        return "regression"
-    if any(op in formula for op in ("Delta", "Delay", "Return", "LogReturn")):
-        return "momentum"
-    if any(op in formula for op in ("Std", "Var", "Skew", "Kurt")):
-        return "volatility"
-    if any(op in formula for op in ("IfElse", "Greater", "Less")):
-        return "conditional"
-    return "general"
-
-
-# ---------------------------------------------------------------------------
-# Line parsing patterns
-# ---------------------------------------------------------------------------
-
-# Pattern: "1. name: formula" or "1) name: formula"
-_NUMBERED_PATTERN = re.compile(
-    r"^\s*\d+[\.\)]\s*"          # numbered prefix
-    r"([a-zA-Z_][a-zA-Z0-9_]*)"  # factor name
-    r"\s*:\s*"                    # colon separator
-    r"(.+)$"                      # formula
-)
-
-# Pattern: "name: formula" (no number)
-_PLAIN_PATTERN = re.compile(
-    r"^\s*([a-zA-Z_][a-zA-Z0-9_]*)"  # factor name
-    r"\s*:\s*"                         # colon separator
-    r"(.+)$"                           # formula
-)
-
-# Pattern: just a formula starting with an operator
-_FORMULA_ONLY_PATTERN = re.compile(
-    r"^\s*([A-Z][a-zA-Z]*\(.+\))\s*$"
-)
-
-# Pattern: JSON-like {"name": "...", "formula": "..."}
-_JSON_PATTERN = re.compile(
-    r'"name"\s*:\s*"([^"]+)"\s*,\s*"formula"\s*:\s*"([^"]+)"'
-)
-
-
-def _strip_markdown(text: str) -> str:
-    """Remove markdown code block markers."""
-    text = re.sub(r"^```[a-z]*\n?", "", text, flags=re.MULTILINE)
-    text = re.sub(r"\n?```\s*$", "", text, flags=re.MULTILINE)
-    return text
-
-
-def _clean_formula(formula: str) -> str:
-    """Clean up a formula string before parsing."""
-    formula = formula.strip()
-    # Remove trailing comments
-    if " #" in formula:
-        formula = formula[: formula.index(" #")]
-    if " //" in formula:
-        formula = formula[: formula.index(" //")]
-    # Remove trailing punctuation
-    formula = formula.rstrip(";,.")
-    # Remove surrounding backticks
-    formula = formula.strip("`")
-    return formula.strip()
-
-
-def parse_llm_output(raw_text: str) -> Tuple[List[CandidateFactor], List[str]]:
-    """Parse raw LLM text output into candidate factors.
-
-    Parameters
-    ----------
-    raw_text : str
-        Raw text from the LLM containing factor definitions.
-
-    Returns
-    -------
-    tuple[list[CandidateFactor], list[str]]
-        (successfully_parsed, failed_lines) where failed_lines are
-        the original text lines that could not be parsed.
-    """
-    text = _strip_markdown(raw_text)
-
-    candidates: List[CandidateFactor] = []
-    failed: List[str] = []
-    seen_names: set = set()
-
-    # Try JSON pattern first (entire text)
-    json_matches = _JSON_PATTERN.findall(text)
-    if json_matches:
-        for name, formula in json_matches:
-            formula = _clean_formula(formula)
-            candidate = _try_build_candidate(name, formula)
-            if candidate.name not in seen_names:
-                candidates.append(candidate)
-                seen_names.add(candidate.name)
-        if candidates:
-            logger.debug("Parsed %d factors from JSON format", len(candidates))
-            return candidates, failed
-
-    # Line-by-line parsing
-    for line in text.split("\n"):
-        line = line.strip()
-        if not line or line.startswith("#") or line.startswith("---"):
-            continue
-
-        name: Optional[str] = None
-        formula: Optional[str] = None
-
-        # Try numbered pattern: "1. name: formula"
-        m = _NUMBERED_PATTERN.match(line)
-        if m:
-            name, formula = m.group(1), m.group(2)
-        else:
-            # Try plain pattern: "name: formula"
-            m = _PLAIN_PATTERN.match(line)
-            if m:
-                name, formula = m.group(1), m.group(2)
-            else:
-                # Try formula-only pattern
-                m = _FORMULA_ONLY_PATTERN.match(line)
-                if m:
-                    formula = m.group(1)
-                    # Generate name from formula
-                    name = _generate_name_from_formula(formula, len(candidates))
-
-        if name is None or formula is None:
-            if any(c.isalpha() for c in line) and "(" in line:
-                failed.append(line)
-            continue
-
-        formula = _clean_formula(formula)
-        if not formula:
-            failed.append(line)
-            continue
-
-        # Ensure unique name
-        base_name = name.lower().replace("-", "_")
-        unique_name = base_name
-        counter = 2
-        while unique_name in seen_names:
-            unique_name = f"{base_name}_{counter}"
-            counter += 1
-
-        candidate = _try_build_candidate(unique_name, formula)
-        candidates.append(candidate)
-        seen_names.add(unique_name)
-
-        if not candidate.is_valid:
-            failed.append(line)
-
-    logger.debug(
-        "Parsed %d candidates (%d valid, %d failed lines)",
-        len(candidates),
-        sum(1 for c in candidates if c.is_valid),
-        len(failed),
-    )
-    return candidates, failed
-
-
-def _try_build_candidate(name: str, formula: str) -> CandidateFactor:
-    """Attempt to parse a formula and build a CandidateFactor."""
-    tree = try_parse(formula)
-    if tree is not None:
-        category = _infer_category(formula)
-        return CandidateFactor(
-            name=name,
-            formula=tree.to_string(),  # canonicalize
-            expression_tree=tree,
-            category=category,
-        )
-    else:
-        # Try to get a useful error message
-        error_msg = ""
-        try:
-            parse(formula)
-        except (SyntaxError, KeyError, ValueError) as e:
-            error_msg = str(e)
-
-        return CandidateFactor(
-            name=name,
-            formula=formula,
-            expression_tree=None,
-            category="unknown",
-            parse_error=error_msg,
-        )
-
-
-def _generate_name_from_formula(formula: str, index: int) -> str:
-    """Generate a descriptive name from a formula."""
-    # Extract the outermost operator
-    m = re.match(r"([A-Z][a-zA-Z]*)\(", formula)
-    if m:
-        outer_op = m.group(1).lower()
-        return f"{outer_op}_factor_{index + 1}"
-    return f"factor_{index + 1}"
diff --git a/src/factorminer/factorminer/agent/prompt_builder.py b/src/factorminer/factorminer/agent/prompt_builder.py
deleted file mode 100644
index 288a999..0000000
--- a/src/factorminer/factorminer/agent/prompt_builder.py
+++ /dev/null
@@ -1,682 +0,0 @@
-"""Build prompts for LLM-driven factor generation using memory priors.
-
-The system prompt encodes the full operator library, syntax rules, feature
-list, and task description.  The user prompt injects per-iteration context:
-memory signals, library state, and output format instructions.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Dict, List, Optional
-
-from src.factorminer.factorminer.core.types import (
-    FEATURES,
-    OPERATOR_REGISTRY,
-    OperatorSpec,
-    OperatorType,
-)
-
-
-def _format_operator_table() -> str:
-    """Build a human-readable operator reference table grouped by category."""
-    grouped: Dict[str, List[OperatorSpec]] = {}
-    for spec in OPERATOR_REGISTRY.values():
-        cat = spec.category.name
-        grouped.setdefault(cat, []).append(spec)
-
-    lines: List[str] = []
-    for cat_name in [
-        "ARITHMETIC",
-        "STATISTICAL",
-        "TIMESERIES",
-        "SMOOTHING",
-        "CROSS_SECTIONAL",
-        "REGRESSION",
-        "LOGICAL",
-        "AUTO_INVENTED",
-    ]:
-        specs = grouped.get(cat_name, [])
-        if not specs:
-            continue
-        lines.append(f"\n### {cat_name} operators")
-        for spec in sorted(specs, key=lambda s: s.name):
-            params_str = ""
-            if spec.param_names:
-                parts = []
-                for pname in spec.param_names:
-                    default = spec.param_defaults.get(pname, "")
-                    lo, hi = spec.param_ranges.get(pname, (None, None))
-                    range_str = f"[{lo}-{hi}]" if lo is not None else ""
-                    parts.append(f"{pname}={default}{range_str}")
-                params_str = f"  params: {', '.join(parts)}"
-            arity_args = ", ".join([f"expr{i+1}" for i in range(spec.arity)])
-            if spec.param_names:
-                arity_args += ", " + ", ".join(spec.param_names)
-            lines.append(f"- {spec.name}({arity_args}): {spec.description}{params_str}")
-    return "\n".join(lines)
-
-
-def _format_feature_list() -> str:
-    """Build a description of available raw features."""
-    descriptions = {
-        "$open": "opening price",
-        "$high": "highest price in the bar",
-        "$low": "lowest price in the bar",
-        "$close": "closing price",
-        "$volume": "trading volume (shares)",
-        "$amt": "trading amount (currency value)",
-        "$vwap": "volume-weighted average price",
-        "$returns": "close-to-close returns",
-    }
-    lines = []
-    for feat in FEATURES:
-        desc = descriptions.get(feat, "")
-        lines.append(f"  {feat}: {desc}")
-    return "\n".join(lines)
-
-
-# ---------------------------------------------------------------------------
-# System prompt
-# ---------------------------------------------------------------------------
-
-SYSTEM_PROMPT = f"""You are a quantitative researcher mining formulaic alpha factors for stock selection.
-
-Your goal is to generate novel, predictive factor expressions using a tree-structured domain-specific language (DSL). Each factor is a composition of operators applied to raw market features.
-
-## RAW FEATURES (leaf nodes)
-{_format_feature_list()}
-
-## OPERATOR LIBRARY
-{_format_operator_table()}
-
-## EXPRESSION SYNTAX RULES
-1. Every expression is a nested function call: Operator(args...)
-2. Leaf nodes are raw features ($close, $volume, etc.) or numeric constants.
-3. Operators are called by name with expression arguments first, then numeric parameters:
-   - Mean($close, 20) = 20-day rolling mean of $close
-   - Corr($close, $volume, 10) = 10-day rolling correlation of close and volume
-   - IfElse(Greater($returns, 0), $volume, Neg($volume)) = conditional
-4. No infix operators; use Add(x,y) instead of x+y, Sub(x,y) instead of x-y, etc.
-5. Parameters like window sizes are trailing numeric arguments after expression children.
-6. Valid window sizes are integers; check each operator's parameter ranges above.
-7. Cross-sectional operators (CsRank, CsZScore, CsDemean, CsScale, CsNeutralize) operate across all stocks at each time step -- they are crucial for making factors comparable.
-
-## EXAMPLES OF WELL-FORMED FACTORS
-- Neg(CsRank(Delta($close, 5)))
-  Short-term reversal: rank of 5-day price change, negated.
-- CsZScore(Div(Sub($volume, Mean($volume, 20)), Std($volume, 20)))
-  Volume surprise: standardized deviation from 20-day mean volume.
-- CsRank(Div(Sub($close, $vwap), $vwap))
-  Intraday deviation from VWAP, cross-sectionally ranked.
-- Neg(Corr($volume, $close, 10))
-  Negative price-volume correlation over 10 days.
-- CsRank(TsLinRegSlope($volume, 20))
-  Trend in trading volume over 20 days, ranked.
-- IfElse(Greater($returns, 0), Std($returns, 10), Neg(Std($returns, 10)))
-  Conditional volatility: positive for up-moves, negative for down-moves.
-- CsRank(Div(Sub($close, TsMin($low, 20)), Sub(TsMax($high, 20), TsMin($low, 20))))
-  Position within 20-day price range, ranked.
-
-## KEY PRINCIPLES FOR HIGH-QUALITY FACTORS
-- Always wrap the outermost expression with a cross-sectional operator (CsRank, CsZScore) for comparability.
-- Combine DIFFERENT operator types for novelty (e.g., time-series + cross-sectional + arithmetic).
-- Use diverse window sizes; avoid always defaulting to 10.
-- Explore uncommon feature combinations ($amt, $vwap are underused).
-- Factors with depth 3-7 tend to be best: deep enough to capture non-trivial patterns but not so deep they overfit.
-- Prefer economically meaningful combinations over random nesting.
-"""
-
-
-# ---------------------------------------------------------------------------
-# PromptBuilder
-# ---------------------------------------------------------------------------
-
-def normalize_factor_references(entries: Optional[List[Any]]) -> List[str]:
-    """Convert mixed factor metadata into prompt-safe string references."""
-    if not entries:
-        return []
-
-    normalized: List[str] = []
-    seen: set[str] = set()
-
-    for entry in entries:
-        text = ""
-        if isinstance(entry, str):
-            text = entry.strip()
-        elif isinstance(entry, dict):
-            formula = str(entry.get("formula", "")).strip()
-            name = str(entry.get("name", "")).strip()
-            category = str(entry.get("category", "")).strip()
-            if formula and name:
-                text = f"{name}: {formula}"
-            elif formula:
-                text = formula
-            elif name and category:
-                text = f"{name} [{category}]"
-            elif name:
-                text = name
-        elif entry is not None:
-            text = str(entry).strip()
-
-        if text and text not in seen:
-            normalized.append(text)
-            seen.add(text)
-
-    return normalized
-
-
-class PromptBuilder:
-    """Constructs system and user prompts for factor generation.
-
-    The system prompt is static (operator library + rules).
-    The user prompt varies each iteration based on memory signals.
-    """
-
-    def __init__(self, system_prompt: Optional[str] = None) -> None:
-        self._system_prompt = system_prompt or SYSTEM_PROMPT
-
-    @property
-    def system_prompt(self) -> str:
-        return self._system_prompt
-
-    def build_user_prompt(
-        self,
-        memory_signal: Dict[str, Any],
-        library_state: Dict[str, Any],
-        batch_size: int = 40,
-    ) -> str:
-        """Build the per-iteration user prompt injecting memory priors.
-
-        Parameters
-        ----------
-        memory_signal : dict
-            Keys:
-            - ``"recommended_directions"`` : list[str] -- patterns to explore
-            - ``"forbidden_directions"`` : list[str] -- patterns to avoid
-            - ``"strategic_insights"`` : list[str] -- high-level lessons
-            - ``"recent_rejections"`` : list[dict] -- recent rejection reasons
-        library_state : dict
-            Keys:
-            - ``"size"`` : int -- current library size
-            - ``"target_size"`` : int -- target library size
-            - ``"recent_admissions"`` : list[str] -- recently admitted factor names
-            - ``"domain_saturation"`` : dict[str, float] -- per-domain saturation
-        batch_size : int
-            Number of candidates to generate this iteration.
-
-        Returns
-        -------
-        str
-            The fully assembled user prompt.
-        """
-        sections: List[str] = []
-
-        # --- Task directive ---
-        sections.append(
-            f"Generate exactly {batch_size} novel, diverse alpha factor candidates."
-        )
-
-        # --- Library status ---
-        lib_size = library_state.get("size", 0)
-        target = library_state.get("target_size", 110)
-        sections.append(
-            f"\n## CURRENT LIBRARY STATUS\n"
-            f"Library size: {lib_size} / {target} factors."
-        )
-
-        recent = normalize_factor_references(
-            library_state.get("recent_admissions", [])
-        )
-        if recent:
-            sections.append(
-                "Recently admitted factors:\n"
-                + "\n".join(f"  - {f}" for f in recent[-10:])
-            )
-
-        saturation = library_state.get("domain_saturation", {})
-        if saturation:
-            sat_lines = [f"  {domain}: {pct:.0%} saturated" for domain, pct in saturation.items()]
-            sections.append(
-                "Domain saturation:\n" + "\n".join(sat_lines)
-            )
-
-        # --- Memory signal: recommended directions ---
-        rec_dirs = memory_signal.get("recommended_directions", [])
-        if rec_dirs:
-            sections.append(
-                "\n## RECOMMENDED DIRECTIONS (focus on these successful patterns)\n"
-                + "\n".join(f"  * {d}" for d in rec_dirs)
-            )
-
-        # --- Memory signal: forbidden directions ---
-        forbidden = memory_signal.get("forbidden_directions", [])
-        if forbidden:
-            sections.append(
-                "\n## FORBIDDEN DIRECTIONS (AVOID these -- they produce correlated/weak factors)\n"
-                + "\n".join(f"  X {d}" for d in forbidden)
-            )
-
-        # --- Memory signal: strategic insights ---
-        insights = memory_signal.get("strategic_insights", [])
-        if insights:
-            sections.append(
-                "\n## STRATEGIC INSIGHTS\n"
-                + "\n".join(f"  Note: {ins}" for ins in insights)
-            )
-
-        helix_prompt_text = memory_signal.get("prompt_text", "").strip()
-        if helix_prompt_text:
-            sections.append(
-                "\n## HELIX RETRIEVAL SUMMARY\n"
-                f"{helix_prompt_text}"
-            )
-
-        complementary_patterns = memory_signal.get("complementary_patterns", [])
-        if complementary_patterns:
-            sections.append(
-                "\n## COMPLEMENTARY PATTERNS\n"
-                + "\n".join(f"  + {pattern}" for pattern in complementary_patterns)
-            )
-
-        conflict_warnings = memory_signal.get("conflict_warnings", [])
-        if conflict_warnings:
-            sections.append(
-                "\n## SATURATION WARNINGS\n"
-                + "\n".join(f"  ! {warning}" for warning in conflict_warnings)
-            )
-
-        operator_cooccurrence = memory_signal.get("operator_cooccurrence", [])
-        if operator_cooccurrence:
-            sections.append(
-                "\n## OPERATOR CO-OCCURRENCE PRIORS\n"
-                + "\n".join(f"  - {pair}" for pair in operator_cooccurrence)
-            )
-
-        semantic_gaps = memory_signal.get("semantic_gaps", [])
-        if semantic_gaps:
-            sections.append(
-                "\n## SEMANTIC GAPS\n"
-                + "\n".join(
-                    f"  - Underused but promising: {gap}" for gap in semantic_gaps
-                )
-            )
-
-        # --- Recent rejection reasons ---
-        rejections = memory_signal.get("recent_rejections", [])
-        if rejections:
-            rej_lines = []
-            for rej in rejections[-10:]:
-                name = rej.get("name", "unknown")
-                reason = rej.get("reason", "unknown")
-                rej_lines.append(f"  - {name}: rejected because {reason}")
-            sections.append(
-                "\n## RECENT REJECTIONS (learn from these failures)\n"
-                + "\n".join(rej_lines)
-            )
-
-        # --- Orthogonality directive ---
-        sections.append(
-            "\n## CRITICAL REQUIREMENT: ORTHOGONALITY\n"
-            "Generate factors that are UNCORRELATED with existing library members. "
-            "Each candidate should explore a DIFFERENT structural pattern. "
-            "Vary your operator choices, window sizes, feature combinations, and "
-            "nesting depth across candidates. Do NOT generate trivial variations "
-            "of the same formula (e.g., changing only the window size)."
-        )
-
-        # --- Output format ---
-        sections.append(
-            f"\n## OUTPUT FORMAT\n"
-            f"Output exactly {batch_size} factors, one per line.\n"
-            f"Format each line as: <number>. <factor_name>: <formula>\n"
-            f"Example:\n"
-            f"1. momentum_reversal: Neg(CsRank(Delta($close, 5)))\n"
-            f"2. volume_surprise: CsZScore(Div(Sub($volume, Mean($volume, 20)), Std($volume, 20)))\n"
-            f"\nRules:\n"
-            f"- factor_name: lowercase_with_underscores, descriptive, unique\n"
-            f"- formula: valid DSL expression using ONLY operators and features listed above\n"
-            f"- No markdown, no explanations, no extra text -- just the numbered list\n"
-            f"- Every formula must parse correctly with the operator library"
-        )
-
-        return "\n".join(sections)
-
-
-# ---------------------------------------------------------------------------
-# New specialist/critic/debate prompt builder functions
-# ---------------------------------------------------------------------------
-
-def build_specialist_prompt(
-    specialist_name: str,
-    specialist_domain: str,
-    specialist_hypothesis: str,
-    preferred_operators: List[str],
-    preferred_features: List[str],
-    example_factors: List[str],
-    avoid_patterns: List[str],
-    memory_signal: Optional[Dict[str, Any]] = None,
-    library_diagnostics: Optional[Dict[str, Any]] = None,
-    regime_context: str = "",
-    n_proposals: int = 15,
-    success_rate: Optional[float] = None,
-) -> str:
-    """Build a rich context-aware user prompt for a specialist agent.
-
-    Parameters
-    ----------
-    specialist_name : str
-        Human-readable name of the specialist (e.g. ``"MomentumMiner"``).
-    specialist_domain : str
-        Short domain description for the specialist.
-    specialist_hypothesis : str
-        Core economic hypothesis guiding this specialist.
-    preferred_operators : list[str]
-        Operator names this specialist should lean on.
-    preferred_features : list[str]
-        Feature names this specialist prefers.
-    example_factors : list[str]
-        Reference formula examples for this specialist.
-    avoid_patterns : list[str]
-        Structural patterns to explicitly avoid.
-    memory_signal : dict or None
-        Experience memory context (recommended directions, etc.).
-    library_diagnostics : dict or None
-        Library state (size, saturation, recent admissions).
-    regime_context : str
-        Current market regime description.
-    n_proposals : int
-        Number of proposals to request.
-    success_rate : float or None
-        Historical success rate for this specialist (for context).
-
-    Returns
-    -------
-    str
-        Fully assembled specialist user prompt.
-    """
-    memory_signal = memory_signal or {}
-    library_diagnostics = library_diagnostics or {}
-    sections: List[str] = []
-
-    # Header
-    sections.append(
-        f"## SPECIALIST TASK: {specialist_name}\n"
-        f"Domain: {specialist_domain}\n"
-        f"Hypothesis: {specialist_hypothesis}"
-    )
-
-    if success_rate is not None:
-        sections.append(
-            f"Your historical admission rate: {success_rate:.1%}  "
-            f"(aim to exceed this by proposing higher-quality factors)"
-        )
-
-    # Regime context
-    if regime_context:
-        sections.append(
-            f"\n## CURRENT MARKET REGIME\n{regime_context}"
-        )
-
-    # Library state
-    lib_size = library_diagnostics.get("size", 0)
-    target = library_diagnostics.get("target_size", 110)
-    sections.append(
-        f"\n## LIBRARY STATUS\nCurrent: {lib_size}/{target} factors."
-    )
-
-    recent = normalize_factor_references(
-        library_diagnostics.get("recent_admissions", [])
-    )
-    if recent:
-        sections.append(
-            "Recently admitted (avoid similar patterns):\n"
-            + "\n".join(f"  - {f}" for f in recent[-8:])
-        )
-
-    saturation = library_diagnostics.get("domain_saturation", {})
-    if saturation:
-        sat_lines = [
-            f"  {d}: {p:.0%} saturated" for d, p in saturation.items()
-        ]
-        sections.append("Domain saturation:\n" + "\n".join(sat_lines))
-
-    # Memory signal injections
-    rec_dirs = memory_signal.get("recommended_directions", [])
-    if rec_dirs:
-        sections.append(
-            "\n## RECOMMENDED DIRECTIONS\n"
-            + "\n".join(f"  * {d}" for d in rec_dirs)
-        )
-
-    forbidden = memory_signal.get("forbidden_directions", [])
-    if forbidden:
-        sections.append(
-            "\n## FORBIDDEN DIRECTIONS\n"
-            + "\n".join(f"  X {d}" for d in forbidden)
-        )
-
-    insights = memory_signal.get("strategic_insights", [])
-    if insights:
-        sections.append(
-            "\n## STRATEGIC INSIGHTS\n"
-            + "\n".join(f"  - {ins}" for ins in insights)
-        )
-
-    helix_text = memory_signal.get("prompt_text", "").strip()
-    if helix_text:
-        sections.append(f"\n## HELIX CONTEXT\n{helix_text}")
-
-    comp_patterns = memory_signal.get("complementary_patterns", [])
-    if comp_patterns:
-        sections.append(
-            "\n## COMPLEMENTARY PATTERNS (explore these)\n"
-            + "\n".join(f"  + {p}" for p in comp_patterns)
-        )
-
-    warn = memory_signal.get("conflict_warnings", [])
-    if warn:
-        sections.append(
-            "\n## SATURATION WARNINGS\n"
-            + "\n".join(f"  ! {w}" for w in warn)
-        )
-
-    gaps = memory_signal.get("semantic_gaps", [])
-    if gaps:
-        sections.append(
-            "\n## SEMANTIC GAPS (underused areas to explore)\n"
-            + "\n".join(f"  ~ {g}" for g in gaps)
-        )
-
-    # Specialist focus directive
-    ops_str = ", ".join(preferred_operators)
-    feats_str = ", ".join(preferred_features)
-    sections.append(
-        f"\n## YOUR SPECIALIST FOCUS\n"
-        f"Preferred operators: {{{ops_str}}}\n"
-        f"Preferred features: {{{feats_str}}}\n"
-        f"Focus ~60% of proposals on these.  The remaining ~40% should "
-        f"explore creative cross-domain combinations."
-    )
-
-    # Domain examples
-    if example_factors:
-        sections.append(
-            "\n## DOMAIN REFERENCE EXAMPLES (structural templates, do NOT copy exactly)\n"
-            + "\n".join(f"  - {ex}" for ex in example_factors)
-        )
-
-    # Avoid patterns
-    if avoid_patterns:
-        sections.append(
-            "\n## PATTERNS TO AVOID\n"
-            + "\n".join(f"  X {av}" for av in avoid_patterns)
-        )
-
-    # Few-shot patterns from memory
-    mem_success_patterns = memory_signal.get("_few_shot_examples", [])
-    if mem_success_patterns:
-        sections.append(
-            "\n## FEW-SHOT SUCCESS PATTERNS FROM MEMORY\n"
-            "(These formulas were previously admitted -- use as structural inspiration)\n"
-            + "\n".join(f"  [+] {ex}" for ex in mem_success_patterns[:5])
-        )
-
-    # Output format
-    sections.append(
-        f"\n## OUTPUT FORMAT\n"
-        f"Generate exactly {n_proposals} novel factor candidates.\n"
-        f"Format: <number>. <factor_name>: <formula>\n"
-        f"Example: 1. momentum_reversal: Neg(CsRank(Delta($close, 5)))\n"
-        f"Rules:\n"
-        f"- factor_name: lowercase_with_underscores, unique, descriptive\n"
-        f"- formula: valid DSL expression only\n"
-        f"- No markdown, no explanations -- just the numbered list\n"
-        f"- Every formula must use only registered operators and features"
-    )
-
-    return "\n".join(sections)
-
-
-def build_critic_scoring_prompt(
-    candidates: List[Dict[str, str]],
-    existing_factors: Optional[List[str]] = None,
-    memory_signal: Optional[str] = None,
-    regime_context: str = "",
-) -> str:
-    """Build a structured JSON-output scoring prompt for the critic agent.
-
-    Parameters
-    ----------
-    candidates : list[dict]
-        List of dicts with keys ``"name"``, ``"formula"``, ``"specialist"``.
-    existing_factors : list[str] or None
-        Formula strings already in the library.
-    memory_signal : str or None
-        Free-text memory context (success patterns, etc.).
-    regime_context : str
-        Current market regime description.
-
-    Returns
-    -------
-    str
-        Fully assembled critic scoring prompt.
-    """
-    existing_factors = existing_factors or []
-    sections: List[str] = []
-
-    sections.append(
-        "## CRITIC SCORING TASK\n"
-        "Evaluate the following candidate factors for economic intuition.\n"
-        "Score each on how well it captures a plausible, economically "
-        "meaningful cross-sectional return predictor."
-    )
-
-    if regime_context:
-        sections.append(f"\n## CURRENT REGIME\n{regime_context}")
-
-    if existing_factors:
-        sections.append(
-            "\n## LIBRARY SAMPLE (existing factors to avoid duplicating)\n"
-            + "\n".join(f"  - {f}" for f in existing_factors[-12:])
-        )
-
-    if memory_signal:
-        sections.append(f"\n## MEMORY CONTEXT (success patterns)\n{memory_signal[:600]}")
-
-    sections.append("\n## CANDIDATES")
-    for c in candidates:
-        name = c.get("name", "unknown")
-        formula = c.get("formula", "")
-        specialist = c.get("specialist", "unknown")
-        sections.append(
-            f"  [{specialist}] {name}: {formula}"
-        )
-
-    sections.append(
-        "\n## SCORING CRITERIA\n"
-        "economic_intuition [0.0-1.0]:\n"
-        "  1.0 = strong economic story, appropriate complexity, novel signal\n"
-        "  0.5 = plausible but generic or overly simple\n"
-        "  0.0 = no coherent economic story, trivial, or clearly wrong\n"
-        "\nConsider:\n"
-        "  - Is there a coherent alpha story (momentum, reversal, vol, liquidity)?\n"
-        "  - Is complexity appropriate (depth 3-7, 3-5 unique operators)?\n"
-        "  - Does it use features in a semantically meaningful way?\n"
-        "  - Is it structurally distinct from existing library members?\n"
-        "  - Would a quant researcher find this plausible?"
-    )
-
-    sections.append(
-        "\n## OUTPUT FORMAT\n"
-        "One JSON object per line for each candidate:\n"
-        '{"name": "<factor_name>", "economic_intuition": <0.0-1.0>, '
-        '"rationale": "<one concise sentence>"}\n'
-        "Output ONLY the JSON lines. No markdown, no extra text."
-    )
-
-    return "\n".join(sections)
-
-
-def build_debate_synthesis_prompt(
-    all_proposals: List[Dict[str, Any]],
-    critic_scores: List[Dict[str, Any]],
-    top_k: int = 10,
-) -> str:
-    """Build a consensus synthesis prompt for the debate orchestrator.
-
-    Used when a final synthesis step is desired to resolve conflicts
-    between specialist proposals and produce a consensus ranking.
-
-    Parameters
-    ----------
-    all_proposals : list[dict]
-        All proposals with ``"name"``, ``"formula"``, ``"specialist"`` keys.
-    critic_scores : list[dict]
-        Critic scores with ``"name"`` and ``"composite_score"`` keys.
-    top_k : int
-        Number of top factors to synthesize consensus for.
-
-    Returns
-    -------
-    str
-        Debate synthesis prompt.
-    """
-    # Sort by composite score
-    score_map = {s["name"]: s.get("composite_score", 0.5) for s in critic_scores}
-    sorted_proposals = sorted(
-        all_proposals,
-        key=lambda p: score_map.get(p.get("name", ""), 0.0),
-        reverse=True,
-    )[:top_k * 2]  # take 2x top_k for synthesis
-
-    sections: List[str] = []
-    sections.append(
-        f"## DEBATE SYNTHESIS TASK\n"
-        f"Multiple specialist agents proposed the following factors.\n"
-        f"The critic has pre-scored them.  Your task is to identify the "
-        f"top {top_k} most complementary factors for a diverse library.\n"
-        f"Prioritize NOVELTY and ORTHOGONALITY over pure individual quality."
-    )
-
-    sections.append("\n## SCORED PROPOSALS (sorted by critic score)")
-    for p in sorted_proposals:
-        name = p.get("name", "?")
-        formula = p.get("formula", "?")
-        specialist = p.get("specialist", "?")
-        score = score_map.get(name, 0.5)
-        sections.append(
-            f"  [{specialist}, score={score:.2f}] {name}: {formula}"
-        )
-
-    sections.append(
-        f"\n## SELECTION CRITERIA\n"
-        f"Select the top {top_k} factors that are:\n"
-        f"  1. Diverse in operator structure (avoid near-duplicates)\n"
-        f"  2. Balanced across specialist domains where possible\n"
-        f"  3. High composite critic score\n"
-        f"  4. Economically interpretable\n"
-        f"\nOutput a ranked list: <rank>. <factor_name>\n"
-        f"No other text."
-    )
-
-    return "\n".join(sections)
diff --git a/src/factorminer/factorminer/agent/specialists.py b/src/factorminer/factorminer/agent/specialists.py
deleted file mode 100644
index 5746096..0000000
--- a/src/factorminer/factorminer/agent/specialists.py
+++ /dev/null
@@ -1,596 +0,0 @@
-"""Specialist agent configurations for domain-focused factor generation.
-
-Each specialist focuses on a particular alpha factor domain with a distinct
-cognitive style, preferred operators, domain hypotheses, and historical
-success tracking.  ``SpecialistAgent`` wraps a config with per-domain memory
-and proposal logic.  ``SpecialistPromptBuilder`` extends the base
-``PromptBuilder`` to inject domain-specific directives.
-"""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
-
-from src.factorminer.factorminer.agent.llm_interface import LLMProvider
-from src.factorminer.factorminer.agent.output_parser import CandidateFactor, parse_llm_output
-from src.factorminer.factorminer.agent.prompt_builder import (
-    SYSTEM_PROMPT,
-    PromptBuilder,
-    normalize_factor_references,
-)
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Configuration dataclass
-# ---------------------------------------------------------------------------
-
-@dataclass
-class SpecialistConfig:
-    """Configuration for a domain-specialist factor generator.
-
-    Attributes
-    ----------
-    name : str
-        Human-readable specialist name (e.g. ``"MomentumMiner"``).
-    domain : str
-        Domain description used in prompt directives.
-    preferred_operators : list[str]
-        Operator names this specialist should emphasise.
-    preferred_features : list[str]
-        Raw features this specialist should lean towards.
-    hypothesis : str
-        Core economic hypothesis driving this specialist's approach.
-    example_factors : list[str]
-        Example formulas to ground the specialist in concrete patterns.
-    avoid : list[str]
-        Structural patterns this specialist should steer clear of.
-    temperature : float
-        Sampling temperature for LLM calls.
-    system_prompt_suffix : str
-        Extra paragraph appended to the system prompt.
-    provider_config : dict
-        Optional provider-level overrides (model, max_tokens, etc.).
-    """
-
-    name: str
-    domain: str
-    preferred_operators: List[str]
-    preferred_features: List[str]
-    hypothesis: str = ""
-    example_factors: List[str] = field(default_factory=list)
-    avoid: List[str] = field(default_factory=list)
-    temperature: float = 0.8
-    system_prompt_suffix: str = ""
-    provider_config: Dict[str, Any] = field(default_factory=dict)
-
-
-# ---------------------------------------------------------------------------
-# Pre-defined specialist constants
-# ---------------------------------------------------------------------------
-
-MOMENTUM_SPECIALIST = SpecialistConfig(
-    name="MomentumMiner",
-    domain="price momentum and trend following",
-    preferred_operators=["TsRank", "Delta", "EMA", "SMA", "TsLinRegSlope", "Return"],
-    preferred_features=["$close", "$returns", "$vwap"],
-    hypothesis=(
-        "Short-term momentum and trend reversals contain predictive signal. "
-        "Serial correlation in returns and time-series rank dynamics reveal "
-        "persistent directional biases exploitable at the cross-section."
-    ),
-    example_factors=[
-        "Neg(TsRank(Delta($close, 5), 20))",
-        "CsRank(TsLinRegSlope($close, 10))",
-        "Neg(CsRank(EMA($returns, 8)))",
-    ],
-    avoid=[
-        "volume-only factors without price context",
-        "pure cross-sectional without time component",
-        "very long windows (>60) on returns",
-    ],
-    temperature=0.85,
-    system_prompt_suffix=(
-        "You are the MOMENTUMMINER specialist.  Your cognitive style is "
-        "directional and trend-aware.  Focus on price persistence, serial "
-        "correlation in returns, and time-series rank dynamics.  Prefer "
-        "directional operators (Delta, Return, TsLinRegSlope, EMA, TsRank) "
-        "to capture price trajectory information.  Explore both short-term "
-        "reversal (1-5 day) and medium-term momentum (10-30 day) regimes.  "
-        "Hypothesis: recent price trends contain exploitable signal that "
-        "cross-sectional ranking amplifies."
-    ),
-)
-
-VOLATILITY_SPECIALIST = SpecialistConfig(
-    name="VolatilityMiner",
-    domain="volatility regimes and higher-moment signals",
-    preferred_operators=["Std", "Skew", "Kurt", "TsRank", "IfElse", "Greater"],
-    preferred_features=["$returns", "$high", "$low", "$close"],
-    hypothesis=(
-        "Volatility clustering and moment anomalies predict near-term returns. "
-        "Stocks with anomalous higher moments (excess kurtosis, negative skew) "
-        "exhibit predictable subsequent return patterns via risk-aversion channels."
-    ),
-    example_factors=[
-        "IfElse(Greater(Std($returns,12), Mean(Std($returns,12),48)), "
-        "Neg(CsRank(Delta($close,3))), CsRank(Skew($returns,20)))",
-        "Neg(CsRank(Kurt($returns, 20)))",
-        "CsRank(Div(Std($returns,5), Std($returns,20)))",
-    ],
-    avoid=[
-        "simple momentum without vol conditioning",
-        "long window trends > 40 bars",
-        "volume-only volatility without returns",
-    ],
-    temperature=0.9,
-    system_prompt_suffix=(
-        "You are the VOLATILITYMINER specialist.  Your cognitive style is "
-        "regime-aware and risk-focused.  Combine statistical operators "
-        "(Std, Var, Kurt, Skew) with logical branching (IfElse, Greater, Less) "
-        "to capture asymmetric behaviour in volatility regimes.  "
-        "Explore vol-of-vol, vol regime transitions, and higher-moment "
-        "cross-sectional anomalies.  Condition momentum signals on vol "
-        "regimes -- high-vol vs low-vol stocks behave very differently.  "
-        "Hypothesis: volatility clustering and skewness anomalies carry "
-        "cross-sectional predictive power beyond simple momentum."
-    ),
-)
-
-LIQUIDITY_SPECIALIST = SpecialistConfig(
-    name="LiquidityMiner",
-    domain="volume, liquidity, and microstructure signals",
-    preferred_operators=["Corr", "TsRank", "CsRank", "EMA", "Delta"],
-    preferred_features=["$volume", "$amt", "$vwap", "$close"],
-    hypothesis=(
-        "Volume-price divergence and liquidity dynamics predict order flow "
-        "imbalances.  Stocks with abnormal volume relative to price movement "
-        "signal informed trading; VWAP deviations capture intraday microstructure."
-    ),
-    example_factors=[
-        "CsRank(Corr($volume, $close, 10))",
-        "Neg(CsRank(EMA(Div(Sub($close,$vwap),Add($vwap,1e-4)),5)))",
-        "CsZScore(Delta(Mean($amt, 5), 5))",
-    ],
-    avoid=[
-        "volume in isolation without price context",
-        "close/open ratio without volume normalization",
-        "microstructure without cross-sectional ranking",
-    ],
-    temperature=0.85,
-    system_prompt_suffix=(
-        "You are the LIQUIDITYMINER specialist.  Your cognitive style is "
-        "microstructure-focused and flow-aware.  Focus on cross-sectional "
-        "liquidity patterns: volume-price divergence, turnover anomalies, "
-        "and VWAP-based microstructure signals.  Use correlation/covariance "
-        "operators to capture relative volume-price alignment.  Explore "
-        "amount (dollar volume) signals -- $amt is often underused.  "
-        "Condition signals on whether volume is confirming or diverging "
-        "from price direction.  Hypothesis: volume-price divergence "
-        "and liquidity imbalances predict short-term order flow reversals."
-    ),
-)
-
-REGIME_SPECIALIST = SpecialistConfig(
-    name="RegimeMiner",
-    domain="cross-sectional dispersion and regime classification",
-    preferred_operators=["CsRank", "CsZScore", "Std", "TsLinRegSlope", "Rsquare", "Resi"],
-    preferred_features=["$close", "$returns", "$vwap", "$amt"],
-    hypothesis=(
-        "Cross-sectional dispersion and regression residuals capture "
-        "regime-independent signals.  Stocks that deviate from their "
-        "predicted cross-sectional position contain mean-reversion signal "
-        "that is robust across bull and bear markets."
-    ),
-    example_factors=[
-        "Mul(CsRank(Rsquare($close, 24)), CsRank(Delta($close, 3)))",
-        "CsRank(Resi($close, $vwap, 20))",
-        "CsZScore(CsRank(TsLinRegSlope($returns, 15)))",
-    ],
-    avoid=[
-        "single-feature factors without statistical operators",
-        "arithmetic without cross-sectional normalization",
-        "momentum without regime conditioning",
-    ],
-    temperature=0.85,
-    system_prompt_suffix=(
-        "You are the REGINEMINER specialist.  Your cognitive style is "
-        "cross-sectional and regression-oriented.  Focus on dispersion, "
-        "residual signals, and regime-robust patterns.  Use regression "
-        "operators (Rsquare, Resi, TsLinRegSlope) to decompose price "
-        "behaviour into systematic and idiosyncratic components.  "
-        "Cross-sectional normalization is essential -- every factor should "
-        "be comparable across stocks.  Explore cross-asset dispersion "
-        "patterns that persist regardless of market direction.  "
-        "Hypothesis: cross-sectional regression residuals and R-squared "
-        "signals capture regime-independent structural mispricings."
-    ),
-)
-
-DEFAULT_SPECIALISTS: List[SpecialistConfig] = [
-    MOMENTUM_SPECIALIST,
-    VOLATILITY_SPECIALIST,
-    LIQUIDITY_SPECIALIST,
-    REGIME_SPECIALIST,
-]
-
-# Map from specialist name to config for convenience
-SPECIALIST_CONFIGS: Dict[str, SpecialistConfig] = {
-    spec.name: spec for spec in DEFAULT_SPECIALISTS
-}
-
-
-# ---------------------------------------------------------------------------
-# SpecialistDomainMemory -- per-specialist admission tracking
-# ---------------------------------------------------------------------------
-
-@dataclass
-class SpecialistDomainMemory:
-    """Tracks admission/rejection history for a single specialist.
-
-    Parameters
-    ----------
-    specialist_name : str
-        The name of the specialist this memory belongs to.
-    """
-
-    specialist_name: str
-    admitted: List[str] = field(default_factory=list)
-    rejected: List[str] = field(default_factory=list)
-    rejection_reasons: List[str] = field(default_factory=list)
-
-    @property
-    def total_proposed(self) -> int:
-        return len(self.admitted) + len(self.rejected)
-
-    @property
-    def success_rate(self) -> float:
-        if self.total_proposed == 0:
-            return 0.0
-        return len(self.admitted) / self.total_proposed
-
-    def record_admitted(self, formulas: List[str]) -> None:
-        self.admitted.extend(formulas)
-
-    def record_rejected(self, formulas: List[str], reasons: List[str]) -> None:
-        self.rejected.extend(formulas)
-        self.rejection_reasons.extend(reasons)
-
-    def get_summary(self) -> str:
-        """Human-readable summary of domain performance."""
-        from collections import Counter
-        lines = [
-            f"Specialist: {self.specialist_name}",
-            f"  Proposed: {self.total_proposed}  Admitted: {len(self.admitted)}  "
-            f"Rejected: {len(self.rejected)}",
-            f"  Success rate: {self.success_rate:.1%}",
-        ]
-        if self.admitted:
-            lines.append("  Best admitted (last 3):")
-            for f in self.admitted[-3:]:
-                lines.append(f"    + {f}")
-        if self.rejection_reasons:
-            counts = Counter(self.rejection_reasons)
-            top = counts.most_common(3)
-            lines.append("  Top rejection reasons:")
-            for reason, count in top:
-                lines.append(f"    - {reason} (x{count})")
-        return "\n".join(lines)
-
-
-# ---------------------------------------------------------------------------
-# SpecialistAgent -- proposal generation with domain memory
-# ---------------------------------------------------------------------------
-
-class SpecialistAgent:
-    """Domain-specialist factor proposer with memory and success tracking.
-
-    Each specialist has a unique cognitive style, a preferred operator
-    toolkit, and maintains per-domain memory of what has worked and failed.
-    Proposals are generated by building a rich context-aware prompt and
-    calling the shared LLM provider.
-
-    Parameters
-    ----------
-    config : SpecialistConfig
-        Configuration defining this specialist's domain and style.
-    llm : LLMProvider
-        LLM backend shared across all specialists.
-    base_system_prompt : str or None
-        Override for the base system prompt.
-    """
-
-    def __init__(
-        self,
-        config: SpecialistConfig,
-        llm: LLMProvider,
-        base_system_prompt: Optional[str] = None,
-    ) -> None:
-        self.config = config
-        self.llm = llm
-        self._memory = SpecialistDomainMemory(specialist_name=config.name)
-
-        # Build the specialist prompt builder (extends base PromptBuilder)
-        self._prompt_builder = SpecialistPromptBuilder(
-            specialist_config=config,
-            base_system_prompt=base_system_prompt,
-        )
-
-    @property
-    def name(self) -> str:
-        return self.config.name
-
-    @property
-    def success_rate(self) -> float:
-        """Fraction of this specialist's proposals that were admitted."""
-        return self._memory.success_rate
-
-    def generate_proposals(
-        self,
-        n_proposals: int,
-        memory_signal: Optional[Dict[str, Any]] = None,
-        library_diagnostics: Optional[Dict[str, Any]] = None,
-        regime_context: str = "",
-        forbidden_patterns: Optional[List[str]] = None,
-        existing_factors: Optional[List[str]] = None,
-    ) -> List[str]:
-        """Generate formula string proposals from this specialist.
-
-        Builds a rich domain-aware prompt injecting memory, diagnostics,
-        regime context, and forbidden patterns, then calls the LLM and
-        parses the response into formula strings.
-
-        Parameters
-        ----------
-        n_proposals : int
-            Number of factor formulas to request.
-        memory_signal : dict or None
-            Experience memory priors (recommended/forbidden directions, etc.).
-        library_diagnostics : dict or None
-            Current library state (size, saturation, recent admissions, etc.).
-        regime_context : str
-            Current market regime description for conditioning.
-        forbidden_patterns : list[str] or None
-            Structural patterns to explicitly avoid.
-        existing_factors : list[str] or None
-            Formula strings already in the library (to avoid duplicates).
-
-        Returns
-        -------
-        list[str]
-            List of formula strings proposed by this specialist.
-        """
-        memory_signal = memory_signal or {}
-        library_diagnostics = library_diagnostics or {}
-        forbidden_patterns = forbidden_patterns or []
-        existing_factors = normalize_factor_references(existing_factors)
-
-        enriched_signal = self._enrich_memory_signal(
-            memory_signal, forbidden_patterns, regime_context
-        )
-
-        system_prompt = self._prompt_builder.system_prompt
-        user_prompt = self._prompt_builder.build_user_prompt(
-            memory_signal=enriched_signal,
-            library_state=library_diagnostics,
-            batch_size=n_proposals,
-        )
-
-        logger.debug(
-            "Specialist %s generating %d proposals (provider=%s)",
-            self.name,
-            n_proposals,
-            self.llm.provider_name,
-        )
-
-        try:
-            raw = self.llm.generate(
-                system_prompt=system_prompt,
-                user_prompt=user_prompt,
-                temperature=self.config.temperature,
-                max_tokens=4096,
-            )
-        except Exception as exc:
-            logger.warning(
-                "Specialist %s LLM call failed: %s. Returning empty list.",
-                self.name,
-                exc,
-            )
-            return []
-
-        candidates, _ = parse_llm_output(raw)
-        valid = [c for c in candidates if c.is_valid]
-
-        if existing_factors:
-            existing_set = set(existing_factors)
-            valid = [c for c in valid if c.formula not in existing_set]
-
-        formulas = [c.formula for c in valid]
-        logger.debug(
-            "Specialist %s produced %d valid proposals",
-            self.name,
-            len(formulas),
-        )
-        return formulas
-
-    def update_domain_memory(
-        self,
-        admitted: List[str],
-        rejected: List[str],
-        reasons: Optional[List[str]] = None,
-    ) -> None:
-        """Update this specialist's domain memory after evaluation.
-
-        Parameters
-        ----------
-        admitted : list[str]
-            Formulas from this specialist that were admitted to the library.
-        rejected : list[str]
-            Formulas that were rejected.
-        reasons : list[str] or None
-            Rejection reasons (parallel to ``rejected``).
-        """
-        reasons = reasons or ["unknown"] * len(rejected)
-        if len(reasons) < len(rejected):
-            reasons = reasons + ["unknown"] * (len(rejected) - len(reasons))
-        self._memory.record_admitted(admitted)
-        self._memory.record_rejected(rejected, reasons[:len(rejected)])
-
-    def get_domain_performance_summary(self) -> str:
-        """Human-readable summary of what this specialist has discovered."""
-        return self._memory.get_summary()
-
-    def _enrich_memory_signal(
-        self,
-        base_signal: Dict[str, Any],
-        forbidden_patterns: List[str],
-        regime_context: str,
-    ) -> Dict[str, Any]:
-        """Merge base memory signal with domain-specific context."""
-        enriched = dict(base_signal)
-
-        base_forbidden = list(enriched.get("forbidden_directions", []))
-        enriched["forbidden_directions"] = base_forbidden + [
-            f"[{self.name} domain] Avoid: {p}" for p in self.config.avoid
-        ] + forbidden_patterns
-
-        if self.config.example_factors:
-            existing_insights = list(enriched.get("strategic_insights", []))
-            existing_insights.append(
-                f"As {self.name}, your reference examples are: "
-                + " | ".join(self.config.example_factors[:3])
-            )
-            enriched["strategic_insights"] = existing_insights
-
-        if regime_context:
-            existing_prompt = enriched.get("prompt_text", "")
-            regime_note = f"[Regime context] {regime_context}"
-            enriched["prompt_text"] = (
-                regime_note + "\n" + existing_prompt
-                if existing_prompt
-                else regime_note
-            )
-
-        if self._memory.total_proposed > 0:
-            perf_note = (
-                f"[{self.name} history] Success rate: {self.success_rate:.1%} "
-                f"({len(self._memory.admitted)} admitted, "
-                f"{len(self._memory.rejected)} rejected)."
-            )
-            existing_insights = list(enriched.get("strategic_insights", []))
-            existing_insights.append(perf_note)
-            enriched["strategic_insights"] = existing_insights
-
-        return enriched
-
-
-# ---------------------------------------------------------------------------
-# SpecialistPromptBuilder -- extends PromptBuilder with domain directives
-# ---------------------------------------------------------------------------
-
-class SpecialistPromptBuilder(PromptBuilder):
-    """Prompt builder that injects domain-specific specialist directives.
-
-    Extends the base system prompt with a specialist suffix and biases
-    the user prompt towards the specialist's preferred operators, features,
-    hypothesis, and example factors.
-
-    Parameters
-    ----------
-    specialist_config : SpecialistConfig
-        The specialist configuration to use.
-    base_system_prompt : str or None
-        Override for the base system prompt.  Defaults to the global
-        ``SYSTEM_PROMPT`` from :mod:`factorminer.agent.prompt_builder`.
-    """
-
-    def __init__(
-        self,
-        specialist_config: SpecialistConfig,
-        base_system_prompt: Optional[str] = None,
-    ) -> None:
-        base = base_system_prompt or SYSTEM_PROMPT
-        suffix = specialist_config.system_prompt_suffix
-        hypothesis_block = ""
-        if specialist_config.hypothesis:
-            hypothesis_block = (
-                f"\n\n## DOMAIN HYPOTHESIS\n"
-                f"{specialist_config.hypothesis}"
-            )
-        modified_system = (
-            f"{base}\n\n"
-            f"## SPECIALIST DOMAIN DIRECTIVE\n"
-            f"{suffix}"
-            f"{hypothesis_block}"
-        )
-        super().__init__(system_prompt=modified_system)
-        self._specialist = specialist_config
-
-    @property
-    def specialist_config(self) -> SpecialistConfig:
-        """Return the underlying specialist configuration."""
-        return self._specialist
-
-    def build_user_prompt(
-        self,
-        memory_signal: Dict[str, Any],
-        library_state: Dict[str, Any],
-        batch_size: int = 40,
-    ) -> str:
-        """Build user prompt with specialist operator/feature bias.
-
-        Calls the base ``PromptBuilder.build_user_prompt`` and appends a
-        directive asking the specialist to focus roughly 60% of its
-        candidates on its preferred operators and features, plus injects
-        example factors for grounding.
-
-        Parameters
-        ----------
-        memory_signal : dict
-            Memory priors (recommended/forbidden directions, etc.).
-        library_state : dict
-            Current library state (size, saturation, etc.).
-        batch_size : int
-            Number of candidates to generate.
-
-        Returns
-        -------
-        str
-            Assembled user prompt with specialist bias section.
-        """
-        base_prompt = super().build_user_prompt(
-            memory_signal=memory_signal,
-            library_state=library_state,
-            batch_size=batch_size,
-        )
-
-        spec = self._specialist
-        ops = ", ".join(spec.preferred_operators)
-        feats = ", ".join(spec.preferred_features)
-
-        specialist_section = (
-            f"\n## SPECIALIST FOCUS [{spec.name}]\n"
-            f"As the {spec.domain} specialist, focus ~60% of candidates on "
-            f"{{{ops}}} operators applied to {{{feats}}} features.\n"
-            f"The remaining ~40% should explore creative cross-domain "
-            f"combinations to maintain diversity.\n"
-        )
-
-        if spec.example_factors:
-            specialist_section += (
-                "\n## DOMAIN REFERENCE EXAMPLES (structure to emulate, not copy)\n"
-                + "\n".join(f"  - {ex}" for ex in spec.example_factors)
-                + "\n"
-            )
-
-        if spec.avoid:
-            specialist_section += (
-                "\n## DOMAIN-SPECIFIC AVOIDANCES\n"
-                + "\n".join(f"  X {av}" for av in spec.avoid)
-                + "\n"
-            )
-
-        return base_prompt + specialist_section
diff --git a/src/factorminer/factorminer/benchmark/__init__.py b/src/factorminer/factorminer/benchmark/__init__.py
deleted file mode 100644
index 0f1e87d..0000000
--- a/src/factorminer/factorminer/benchmark/__init__.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""Benchmark runners for paper-faithful and Helix research evaluation."""
-
-from src.factorminer.factorminer.benchmark.runtime import (
-    BenchmarkManifest,
-    build_benchmark_library,
-    evaluate_frozen_set,
-    load_benchmark_dataset,
-    run_ablation_memory_benchmark,
-    run_benchmark_suite,
-    run_cost_pressure_benchmark,
-    run_efficiency_benchmark,
-    run_runtime_mining_benchmark,
-    run_table1_benchmark,
-    select_frozen_top_k,
-)
-from src.factorminer.factorminer.benchmark.helix_benchmark import (
-    HelixBenchmark,
-    BenchmarkResult,
-    MethodResult,
-    DMTestResult,
-    StatisticalComparisonTests,
-    SpeedBenchmark,
-    OperatorSpeedResult,
-    PipelineSpeedResult,
-)
-
-try:  # pragma: no cover - optional in trimmed checkouts
-    from factorminer.benchmark.ablation import (
-        AblationStudy,
-        AblationResult,
-        AblatedMethodRunner,
-        ABLATION_CONFIGS,
-        ABLATION_LABELS,
-        run_full_ablation_study,
-    )
-except Exception:  # pragma: no cover - optional in trimmed checkouts
-    AblationStudy = None
-    AblationResult = None
-    AblatedMethodRunner = None
-    ABLATION_CONFIGS = None
-    ABLATION_LABELS = None
-    run_full_ablation_study = None
-
-__all__ = [
-    # legacy runtime benchmark
-    "BenchmarkManifest",
-    "build_benchmark_library",
-    "evaluate_frozen_set",
-    "load_benchmark_dataset",
-    "run_ablation_memory_benchmark",
-    "run_benchmark_suite",
-    "run_cost_pressure_benchmark",
-    "run_efficiency_benchmark",
-    "run_runtime_mining_benchmark",
-    "run_table1_benchmark",
-    "select_frozen_top_k",
-    # helix benchmark
-    "HelixBenchmark",
-    "BenchmarkResult",
-    "MethodResult",
-    "DMTestResult",
-    "StatisticalComparisonTests",
-    "SpeedBenchmark",
-    "OperatorSpeedResult",
-    "PipelineSpeedResult",
-    # ablation
-    "AblationStudy",
-    "AblationResult",
-    "AblatedMethodRunner",
-    "ABLATION_CONFIGS",
-    "ABLATION_LABELS",
-    "run_full_ablation_study",
-]
diff --git a/src/factorminer/factorminer/benchmark/ablation.py b/src/factorminer/factorminer/benchmark/ablation.py
deleted file mode 100644
index 6130611..0000000
--- a/src/factorminer/factorminer/benchmark/ablation.py
+++ /dev/null
@@ -1,798 +0,0 @@
-"""Runtime ablation study for HelixFactor Phase 2 components.
-
-This module now drives ablations through the real loop path:
-- HelixLoop execution on a training slice
-- runtime recomputation of the admitted library
-- freeze/top-k selection and combo evaluation on a held-out slice
-- optional memory suppression via temporary monkeypatching
-
-Supported ablations:
-  full             - all components enabled
-  no_debate        - disable specialist debate
-  no_causal        - disable causal validation
-  no_canonicalize  - disable SymPy deduplication
-  no_regime        - disable regime-aware evaluation
-  no_online_memory - disable memory retrieval / formation / evolution hooks
-  no_capacity      - disable capacity estimation
-  no_significance  - disable significance filtering
-  no_memory        - disable memory-guided generation and updates
-"""
-
-from __future__ import annotations
-
-import logging
-import tempfile
-import time
-from contextlib import contextmanager
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-import pandas as pd
-
-import src.factorminer.factorminer.core.helix_loop as helix_loop_module
-import src.factorminer.factorminer.core.ralph_loop as ralph_loop_module
-from src.factorminer.factorminer.agent.debate import DebateConfig as RuntimeDebateConfig
-from src.factorminer.factorminer.agent.llm_interface import MockProvider
-from src.factorminer.factorminer.benchmark.helix_benchmark import AblationResult, MethodResult
-from src.factorminer.factorminer.core.config import MiningConfig
-from src.factorminer.factorminer.core.helix_loop import HelixLoop
-from src.factorminer.factorminer.core.factor_library import FactorLibrary
-from src.factorminer.factorminer.evaluation.capacity import CapacityConfig as RuntimeCapacityConfig
-from src.factorminer.factorminer.evaluation.causal import CausalConfig as RuntimeCausalConfig
-from src.factorminer.factorminer.evaluation.regime import RegimeConfig as RuntimeRegimeConfig
-from src.factorminer.factorminer.evaluation.runtime import (
-    DatasetSplit,
-    EvaluationDataset,
-    evaluate_factors,
-)
-from src.factorminer.factorminer.evaluation.significance import (
-    SignificanceConfig as RuntimeSignificanceConfig,
-)
-from src.factorminer.factorminer.benchmark.runtime import (
-    build_benchmark_library,
-    evaluate_frozen_set,
-    select_frozen_top_k,
-)
-from src.factorminer.factorminer.memory.memory_store import ExperienceMemory
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Ablation configuration registry
-# ---------------------------------------------------------------------------
-
-_FULL_CFG = {
-    "debate": True,
-    "causal": True,
-    "canonicalize": True,
-    "regime": True,
-    "online_memory": True,
-    "capacity": True,
-    "significance": True,
-    "memory": True,
-}
-
-ABLATION_CONFIGS: Dict[str, Dict[str, bool]] = {
-    "full": dict(_FULL_CFG),
-    "no_debate": {**_FULL_CFG, "debate": False},
-    "no_causal": {**_FULL_CFG, "causal": False},
-    "no_canonicalize": {**_FULL_CFG, "canonicalize": False},
-    "no_regime": {**_FULL_CFG, "regime": False},
-    "no_online_memory": {**_FULL_CFG, "online_memory": False},
-    "no_capacity": {**_FULL_CFG, "capacity": False},
-    "no_significance": {**_FULL_CFG, "significance": False},
-    "no_memory": {**_FULL_CFG, "memory": False, "debate": False},
-}
-
-ABLATION_LABELS: Dict[str, str] = {
-    "full": "HelixFactor (Full)",
-    "no_debate": "w/o Debate",
-    "no_causal": "w/o Causal",
-    "no_canonicalize": "w/o Canonicalization",
-    "no_regime": "w/o Regime",
-    "no_online_memory": "w/o Online Memory",
-    "no_capacity": "w/o Capacity",
-    "no_significance": "w/o Significance",
-    "no_memory": "w/o Memory (≈ FactorMiner NM)",
-}
-
-EXPECTED_CONTRIBUTION_SIGN: Dict[str, int] = {
-    "debate": +1,
-    "causal": +1,
-    "canonicalize": +1,
-    "regime": +1,
-    "online_memory": +1,
-    "capacity": +1,
-    "significance": +1,
-    "memory": +1,
-}
-
-_FEATURE_KEYS = [
-    "$open",
-    "$high",
-    "$low",
-    "$close",
-    "$volume",
-    "$amt",
-    "$vwap",
-    "$returns",
-]
-
-
-def _merge_slices(train_data: dict, test_data: dict) -> dict:
-    """Concatenate train/test slices into one runtime evaluation dictionary."""
-    merged: dict[str, np.ndarray] = {}
-    for key in sorted(set(train_data) | set(test_data)):
-        if key not in train_data or key not in test_data:
-            continue
-        left = np.asarray(train_data[key], dtype=np.float64)
-        right = np.asarray(test_data[key], dtype=np.float64)
-        if left.ndim == 2 and right.ndim == 2 and left.shape[0] == right.shape[0]:
-            merged[key] = np.concatenate([left, right], axis=1)
-        else:
-            merged[key] = np.asarray(left)
-    return merged
-
-
-def _slice_data(data: dict, start: int, end: int) -> dict:
-    """Slice all 2-D benchmark arrays to a column range."""
-    return {
-        key: value[:, start:end]
-        for key, value in data.items()
-        if isinstance(value, np.ndarray) and value.ndim >= 2
-    }
-
-
-def _build_runtime_dataset(data: dict) -> EvaluationDataset:
-    """Build a minimal runtime dataset from the benchmark dictionary format."""
-    feature_keys = [key for key in _FEATURE_KEYS if key in data]
-    if "forward_returns" not in data:
-        raise ValueError("Runtime ablation requires 'forward_returns' in the data dict")
-    if not feature_keys:
-        raise ValueError("Runtime ablation requires at least one market feature array")
-
-    arrays = [np.asarray(data[key], dtype=np.float64) for key in feature_keys]
-    data_tensor = np.stack(arrays, axis=2)
-    returns = np.asarray(data["forward_returns"], dtype=np.float64)
-    timestamps = np.arange(returns.shape[1])
-    asset_ids = np.arange(returns.shape[0])
-    full_split = DatasetSplit(
-        name="full",
-        indices=np.arange(returns.shape[1]),
-        timestamps=timestamps,
-        returns=returns,
-        target_returns={"target": returns},
-        default_target="target",
-    )
-
-    # The caller populates train/test splits by passing a merged train+test view.
-    return EvaluationDataset(
-        data_dict={key: np.asarray(data[key], dtype=np.float64) for key in feature_keys},
-        data_tensor=data_tensor,
-        returns=returns,
-        timestamps=timestamps,
-        asset_ids=asset_ids,
-        splits={"full": full_split},
-        processed_df=pd.DataFrame(),
-        target_panels={"target": returns},
-        default_target="target",
-    )
-
-
-def _build_split_dataset(data: dict, split_name: str) -> EvaluationDataset:
-    """Create a single-split runtime dataset from one benchmark slice."""
-    dataset = _build_runtime_dataset(data)
-    split = DatasetSplit(
-        name=split_name,
-        indices=np.arange(dataset.returns.shape[1]),
-        timestamps=dataset.timestamps,
-        returns=dataset.returns,
-        target_returns={"target": dataset.returns},
-        default_target="target",
-    )
-    dataset.splits = {split_name: split}
-    return dataset
-
-
-def _build_combined_dataset(train_data: dict, test_data: dict) -> EvaluationDataset:
-    """Create a train/test runtime dataset from sliced benchmark inputs."""
-    merged = _merge_slices(train_data, test_data)
-    dataset = _build_runtime_dataset(merged)
-    train_len = np.asarray(train_data["forward_returns"]).shape[1]
-    test_len = np.asarray(test_data["forward_returns"]).shape[1]
-    timestamps = np.arange(train_len + test_len)
-    returns = np.asarray(merged["forward_returns"], dtype=np.float64)
-
-    dataset.timestamps = timestamps
-    dataset.returns = returns
-    dataset.target_panels = {"target": returns}
-    dataset.default_target = "target"
-    dataset.splits = {
-        "train": DatasetSplit(
-            name="train",
-            indices=np.arange(0, train_len),
-            timestamps=timestamps[:train_len],
-            returns=returns[:, :train_len],
-            target_returns={"target": returns[:, :train_len]},
-            default_target="target",
-        ),
-        "test": DatasetSplit(
-            name="test",
-            indices=np.arange(train_len, train_len + test_len),
-            timestamps=timestamps[train_len:],
-            returns=returns[:, train_len:],
-            target_returns={"target": returns[:, train_len:]},
-            default_target="target",
-        ),
-        "full": DatasetSplit(
-            name="full",
-            indices=np.arange(train_len + test_len),
-            timestamps=timestamps,
-            returns=returns,
-            target_returns={"target": returns},
-            default_target="target",
-        ),
-    }
-    return dataset
-
-
-def _build_mining_config(
-    *,
-    output_dir: str,
-    target_library_size: int,
-    batch_size: int,
-    max_iterations: int,
-    ic_threshold: float,
-    correlation_threshold: float,
-) -> MiningConfig:
-    """Create a loop config tailored for a single runtime ablation."""
-    cfg = MiningConfig(
-        target_library_size=target_library_size,
-        batch_size=batch_size,
-        max_iterations=max_iterations,
-        ic_threshold=ic_threshold,
-        icir_threshold=0.5,
-        correlation_threshold=correlation_threshold,
-        replacement_ic_min=max(ic_threshold * 2.5, ic_threshold + 0.05),
-        replacement_ic_ratio=1.3,
-        fast_screen_assets=100,
-        num_workers=1,
-        output_dir=output_dir,
-        backend="numpy",
-        signal_failure_policy="reject",
-    )
-    cfg.benchmark_mode = "paper"
-    cfg.research = None
-    cfg.target_panels = None
-    cfg.target_horizons = None
-    return cfg
-
-
-def _build_phase2_configs(flags: Dict[str, bool]) -> Dict[str, Any]:
-    """Translate ablation flags into real HelixLoop runtime configs."""
-    return {
-        "debate_config": RuntimeDebateConfig() if flags.get("debate", True) else None,
-        "causal_config": RuntimeCausalConfig(enabled=True) if flags.get("causal", True) else None,
-        "regime_config": RuntimeRegimeConfig(enabled=True) if flags.get("regime", True) else None,
-        "capacity_config": RuntimeCapacityConfig(enabled=True) if flags.get("capacity", True) else None,
-        "significance_config": (
-            RuntimeSignificanceConfig(enabled=True)
-            if flags.get("significance", True)
-            else None
-        ),
-        "canonicalize": flags.get("canonicalize", True),
-    }
-
-
-@contextmanager
-def _patched_memory_hooks(enabled: bool):
-    """Disable memory retrieval and learning when a no-memory ablation is requested."""
-    if enabled:
-        yield
-        return
-
-    def _empty_signal(*_args, **_kwargs) -> dict[str, Any]:
-        return {
-            "recommended_directions": [],
-            "forbidden_directions": [],
-            "insights": [],
-            "library_state": {
-                "library_size": 0,
-                "recent_admission_rate": 0.0,
-                "saturated_domains": {},
-                "recent_admissions_count": 0,
-                "recent_rejections_count": 0,
-            },
-            "prompt_text": "",
-        }
-
-    def _identity_memory(memory, *args, **kwargs):
-        return memory
-
-    patch_targets = [
-        (ralph_loop_module, "retrieve_memory", _empty_signal),
-        (ralph_loop_module, "form_memory", _identity_memory),
-        (ralph_loop_module, "evolve_memory", _identity_memory),
-        (helix_loop_module, "retrieve_memory", _empty_signal),
-        (helix_loop_module, "form_memory", _identity_memory),
-        (helix_loop_module, "evolve_memory", _identity_memory),
-    ]
-
-    originals = []
-    for module, attr, replacement in patch_targets:
-        originals.append((module, attr, getattr(module, attr)))
-        setattr(module, attr, replacement)
-
-    try:
-        yield
-    finally:
-        for module, attr, original in originals:
-            setattr(module, attr, original)
-
-
-def _compute_avg_abs_rho(artifacts) -> float:
-    if len(artifacts) < 2:
-        return 0.0
-
-    corr = np.abs(
-        np.corrcoef([artifact.split_signals["train"].reshape(-1) for artifact in artifacts])
-    )
-    if corr.ndim != 2:
-        return 0.0
-    upper = corr[np.triu_indices_from(corr, k=1)]
-    upper = upper[np.isfinite(upper)]
-    return float(np.mean(upper)) if upper.size else 0.0
-
-
-def _runtime_payload_to_result(
-    *,
-    method: str,
-    payload: Dict[str, Any],
-    benchmark_library_size: int,
-    benchmark_succeeded: int,
-    elapsed_seconds: float,
-    run_id: int,
-) -> MethodResult:
-    """Convert runtime benchmark output into a MethodResult."""
-    library = payload.get("library", {})
-    combinations = payload.get("combinations", {})
-    selections = payload.get("selections", {})
-
-    result = MethodResult(
-        method=method,
-        library_ic=float(library.get("ic", 0.0)),
-        library_icir=float(library.get("icir", 0.0)),
-        avg_abs_rho=float(library.get("avg_abs_rho", 0.0)),
-        ew_ic=float(combinations.get("equal_weight", {}).get("ic", 0.0)),
-        ew_icir=float(combinations.get("equal_weight", {}).get("icir", 0.0)),
-        icw_ic=float(combinations.get("ic_weighted", {}).get("ic", 0.0)),
-        icw_icir=float(combinations.get("ic_weighted", {}).get("icir", 0.0)),
-        lasso_ic=float(selections.get("lasso", {}).get("ic", 0.0)),
-        lasso_icir=float(selections.get("lasso", {}).get("icir", 0.0)),
-        xgb_ic=float(selections.get("xgboost", {}).get("ic", 0.0)),
-        xgb_icir=float(selections.get("xgboost", {}).get("icir", 0.0)),
-        n_factors=benchmark_library_size,
-        admission_rate=benchmark_library_size / max(benchmark_succeeded, 1),
-        elapsed_seconds=elapsed_seconds,
-        ic_series=None,
-        run_id=run_id,
-    )
-    result.runtime_payload = payload
-    return result
-
-
-def _evaluate_runtime_library(
-    library,
-    dataset: EvaluationDataset,
-    cfg: MiningConfig,
-    *,
-    target_library_size: int,
-    cost_bps: Optional[List[float]] = None,
-) -> tuple[MethodResult, Dict[str, Any], int, int]:
-    """Recompute a mined library using the runtime benchmark contract."""
-    if cost_bps is None:
-        cost_bps = [1.0, 4.0, 7.0, 10.0, 11.0]
-
-    factors = library.list_factors()
-    artifacts = evaluate_factors(factors, dataset, signal_failure_policy="reject")
-    succeeded = [artifact for artifact in artifacts if artifact.succeeded]
-    benchmark_library, benchmark_stats = build_benchmark_library(
-        artifacts,
-        cfg,
-        split_name="train",
-        ic_threshold=cfg.ic_threshold,
-        correlation_threshold=cfg.correlation_threshold,
-    )
-    frozen = select_frozen_top_k(
-        artifacts,
-        benchmark_library,
-        top_k=target_library_size,
-        split_name="train",
-    )
-    payload = evaluate_frozen_set(
-        frozen,
-        dataset,
-        split_name="test",
-        fit_split="train",
-        cost_bps=cost_bps,
-    )
-    payload["benchmark"] = {
-        "admitted": benchmark_stats.get("admitted", 0),
-        "succeeded": benchmark_stats.get("succeeded", 0),
-        "replaced": benchmark_stats.get("replaced", 0),
-        "threshold_rejections": benchmark_stats.get("threshold_rejections", 0),
-        "correlation_rejections": benchmark_stats.get("correlation_rejections", 0),
-        "freeze_library_size": benchmark_library.size,
-        "frozen_top_k": [
-            {
-                "name": artifact.name,
-                "formula": artifact.formula,
-                "category": artifact.category,
-                "train_ic": artifact.split_stats["train"]["ic_abs_mean"],
-                "train_icir": abs(artifact.split_stats["train"]["icir"]),
-            }
-            for artifact in frozen
-        ],
-    }
-    result = _runtime_payload_to_result(
-        method="helix_phase2",
-        payload=payload,
-        benchmark_library_size=benchmark_library.size,
-        benchmark_succeeded=max(int(benchmark_stats.get("succeeded", 0)), 1),
-        elapsed_seconds=0.0,
-        run_id=0,
-    )
-    result.n_factors = benchmark_library.size
-    result.admission_rate = benchmark_library.size / max(benchmark_stats.get("succeeded", 0), 1)
-    result.avg_abs_rho = _compute_avg_abs_rho(frozen)
-    return result, payload, benchmark_library.size, int(benchmark_stats.get("succeeded", 0))
-
-
-class AblatedMethodRunner:
-    """Run one ablation variant through the real HelixLoop benchmark path."""
-
-    def __init__(
-        self,
-        cfg: Dict[str, bool],
-        ic_threshold: float = 0.02,
-        correlation_threshold: float = 0.5,
-        seed: int = 42,
-        llm_provider: Optional[Any] = None,
-        benchmark_mode: str = "paper",
-    ) -> None:
-        self._cfg = dict(cfg)
-        self.ic_threshold = ic_threshold
-        self.correlation_threshold = correlation_threshold
-        self.seed = seed
-        self.llm_provider = llm_provider
-        self.benchmark_mode = benchmark_mode
-
-    def _run_loop(
-        self,
-        *,
-        train_data: dict,
-        n_factors: int,
-    ) -> tuple[HelixLoop, MiningConfig]:
-        """Instantiate and run the real HelixLoop on the training slice."""
-        phase2 = _build_phase2_configs(self._cfg)
-        target_library_size = max(int(n_factors), 1)
-        max_iterations = max(target_library_size * 4, 4)
-        batch_size = max(4, min(target_library_size, 40))
-        loop_dataset = _build_runtime_dataset(train_data)
-        with tempfile.TemporaryDirectory(prefix="factorminer_ablation_") as tmp:
-            mining_cfg = _build_mining_config(
-                output_dir=tmp,
-                target_library_size=target_library_size,
-                batch_size=batch_size,
-                max_iterations=max_iterations,
-                ic_threshold=self.ic_threshold,
-                correlation_threshold=self.correlation_threshold,
-            )
-            mining_cfg.benchmark_mode = self.benchmark_mode
-            if self._cfg.get("memory", True):
-                memory = ExperienceMemory()
-            else:
-                memory = ExperienceMemory()
-
-            loop = HelixLoop(
-                config=mining_cfg,
-                data_tensor=loop_dataset.data_tensor,
-                returns=np.asarray(train_data["forward_returns"], dtype=np.float64),
-                llm_provider=self.llm_provider or MockProvider(),
-                memory=memory,
-                library=FactorLibrary(
-                    correlation_threshold=self.correlation_threshold,
-                    ic_threshold=self.ic_threshold,
-                ),
-                debate_config=phase2["debate_config"],
-                enable_knowledge_graph=False,
-                enable_embeddings=False,
-                enable_auto_inventor=False,
-                auto_invention_interval=10,
-                canonicalize=phase2["canonicalize"],
-                forgetting_lambda=0.95,
-                causal_config=phase2["causal_config"],
-                regime_config=phase2["regime_config"],
-                capacity_config=phase2["capacity_config"],
-                significance_config=phase2["significance_config"],
-                volume=np.asarray(train_data.get("$amt", train_data["forward_returns"]), dtype=np.float64)
-                if "$amt" in train_data
-                else None,
-            )
-            with _patched_memory_hooks(self._cfg.get("memory", True) and self._cfg.get("online_memory", True)):
-                loop.run(
-                    target_size=target_library_size,
-                    max_iterations=max_iterations,
-                    resume=False,
-                )
-            return loop, mining_cfg
-
-    def run(
-        self,
-        data: dict,
-        test_data: dict,
-        n_factors: int = 40,
-    ) -> MethodResult:
-        """Run this ablation variant using the real loop + runtime contract."""
-        t0 = time.perf_counter()
-        train_dataset = _build_split_dataset(data, "train")
-        benchmark_dataset = _build_combined_dataset(data, test_data)
-
-        loop, mining_cfg = self._run_loop(train_data=data, n_factors=n_factors)
-        result, payload, benchmark_library_size, benchmark_succeeded = _evaluate_runtime_library(
-            loop.library,
-            benchmark_dataset,
-            mining_cfg,
-            target_library_size=n_factors,
-        )
-        elapsed = time.perf_counter() - t0
-
-        result.elapsed_seconds = elapsed
-        result.method = "helix_phase2"
-        result.run_id = self.seed
-        result.runtime_payload = {
-            **payload,
-            "train_split": {
-                "train_length": train_dataset.returns.shape[1],
-                "benchmark_library_size": benchmark_library_size,
-                "benchmark_succeeded": benchmark_succeeded,
-            },
-            "ablation": {
-                "name": self._cfg,
-                "seed": self.seed,
-            },
-        }
-        return result
-
-
-class AblationStudy:
-    """Run real-loop ablations and summarize component contribution."""
-
-    def __init__(
-        self,
-        ic_threshold: float = 0.02,
-        correlation_threshold: float = 0.5,
-        seed: int = 42,
-        configs: Optional[Dict[str, Dict[str, bool]]] = None,
-        llm_provider: Optional[Any] = None,
-        benchmark_mode: str = "paper",
-    ) -> None:
-        self.ic_threshold = ic_threshold
-        self.correlation_threshold = correlation_threshold
-        self.seed = seed
-        self.configs = configs or ABLATION_CONFIGS
-        self.llm_provider = llm_provider
-        self.benchmark_mode = benchmark_mode
-
-    def run_ablation(
-        self,
-        data: dict,
-        train_period: Tuple[int, int],
-        test_period: Tuple[int, int],
-        n_factors: int = 40,
-        configs_to_run: Optional[List[str]] = None,
-    ) -> AblationResult:
-        """Run one or more ablation variants on the real loop pipeline."""
-        configs_to_run = configs_to_run or list(self.configs.keys())
-        train_data = _slice_data(data, *train_period)
-        test_data = _slice_data(data, *test_period)
-
-        config_results: Dict[str, MethodResult] = {}
-        for cfg_name in configs_to_run:
-            cfg = self.configs.get(cfg_name)
-            if cfg is None:
-                logger.warning("Unknown ablation config: %s", cfg_name)
-                continue
-
-            label = ABLATION_LABELS.get(cfg_name, cfg_name)
-            logger.info("Running ablation: %s", label)
-            t0 = time.perf_counter()
-            try:
-                runner = AblatedMethodRunner(
-                    cfg=cfg,
-                    ic_threshold=self.ic_threshold,
-                    correlation_threshold=self.correlation_threshold,
-                    seed=self.seed,
-                    llm_provider=self.llm_provider,
-                    benchmark_mode=self.benchmark_mode,
-                )
-                result = runner.run(
-                    data=train_data,
-                    test_data=test_data,
-                    n_factors=n_factors,
-                )
-                result.method = cfg_name
-                config_results[cfg_name] = result
-            except Exception as exc:
-                logger.warning("Ablation %s failed: %s", cfg_name, exc)
-                config_results[cfg_name] = MethodResult(method=cfg_name)
-
-            elapsed = time.perf_counter() - t0
-            ic = config_results[cfg_name].library_ic
-            logger.info("  %s: IC=%.4f  elapsed=%.1fs", cfg_name, ic, elapsed)
-
-        ablation = AblationResult(
-            configs=configs_to_run,
-            results=config_results,
-        )
-        ablation.contributions = self.summarize_contributions(ablation)
-        return ablation
-
-    def summarize_contributions(self, result: AblationResult) -> pd.DataFrame:
-        """Summarize component contributions relative to the full runtime run."""
-        full = result.results.get("full")
-        if full is None:
-            logger.warning("No 'full' config in ablation results; cannot summarize")
-            return pd.DataFrame()
-
-        rows = []
-        component_map = {
-            "no_debate": "debate",
-            "no_causal": "causal",
-            "no_canonicalize": "canonicalize",
-            "no_regime": "regime",
-            "no_online_memory": "online_memory",
-            "no_capacity": "capacity",
-            "no_significance": "significance",
-            "no_memory": "memory",
-        }
-
-        for ablation_key, component in component_map.items():
-            ablated = result.results.get(ablation_key)
-            if ablated is None:
-                continue
-
-            ic_contrib = full.library_ic - ablated.library_ic
-            icir_contrib = full.library_icir - ablated.library_icir
-            adm_delta = full.admission_rate - ablated.admission_rate
-
-            expected_sign = EXPECTED_CONTRIBUTION_SIGN.get(component, +1)
-            actual_sign = np.sign(ic_contrib) if ic_contrib != 0 else 0
-            if abs(ic_contrib) < 0.0005:
-                interpretation = "Negligible"
-            elif actual_sign == expected_sign:
-                pct = abs(ic_contrib) / max(full.library_ic, 1e-6) * 100
-                interpretation = f"Helps (+{pct:.1f}% IC)"
-            else:
-                interpretation = "Hurts (unexpected direction)"
-
-            rows.append({
-                "component": component,
-                "ablation_config": ablation_key,
-                "ic_full": full.library_ic,
-                "ic_ablated": ablated.library_ic,
-                "ic_contribution": ic_contrib,
-                "ic_contribution_pct": ic_contrib / max(full.library_ic, 1e-6) * 100,
-                "icir_full": full.library_icir,
-                "icir_ablated": ablated.library_icir,
-                "icir_contribution": icir_contrib,
-                "admission_rate_delta": adm_delta,
-                "interpretation": interpretation,
-            })
-
-        df = pd.DataFrame(rows)
-        if not df.empty:
-            df = df.sort_values("ic_contribution", ascending=False).reset_index(drop=True)
-        return df
-
-    def to_latex_table(self, result: AblationResult) -> str:
-        """Generate a LaTeX ablation study table."""
-        df = result.contributions
-        if df is None or df.empty:
-            return "% No ablation data available"
-
-        lines = [
-            r"\begin{table}[htbp]",
-            r"\centering",
-            r"\caption{HelixFactor Ablation Study: Component Contributions}",
-            r"\label{tab:ablation}",
-            r"\begin{tabular}{lccccl}",
-            r"\toprule",
-            r"Component & IC (Full) & IC (Ablated) & $\Delta$IC & $\Delta$IC\% & Interpretation \\",
-            r"\midrule",
-        ]
-
-        for _, row in df.iterrows():
-            lines.append(
-                f"{row['component'].replace('_', r' ')} & "
-                f"{row['ic_full']:.4f} & "
-                f"{row['ic_ablated']:.4f} & "
-                f"{row['ic_contribution']:+.4f} & "
-                f"{row['ic_contribution_pct']:+.1f}\\% & "
-                f"{row['interpretation']} \\\\"
-            )
-
-        lines += [r"\bottomrule", r"\end{tabular}", r"\end{table}"]
-        return "\n".join(lines)
-
-    def print_summary(self, result: AblationResult) -> None:
-        """Print a human-readable ablation summary."""
-        df = result.contributions
-        if df is None or df.empty:
-            print("  No ablation summary available.")
-            return
-
-        print("\n" + "=" * 70)
-        print("  Ablation Study: Component Contributions")
-        print("=" * 70)
-
-        full = result.results.get("full")
-        if full:
-            print(f"\n  FULL System: IC={full.library_ic:.4f}  ICIR={full.library_icir:.3f}")
-            print()
-
-        header = (
-            f"  {'Component':<22} {'IC Full':>8} {'IC Ablated':>10} "
-            f"{'Delta IC':>10} {'Delta%':>8}  Interpretation"
-        )
-        print(header)
-        print("  " + "-" * 80)
-
-        for _, row in df.iterrows():
-            comp = row["component"].replace("_", " ")
-            print(
-                f"  {comp:<22} {row['ic_full']:>8.4f} {row['ic_ablated']:>10.4f} "
-                f"{row['ic_contribution']:>+10.4f} {row['ic_contribution_pct']:>+7.1f}%  "
-                f"{row['interpretation']}"
-            )
-
-        print()
-
-
-def run_full_ablation_study(
-    n_assets: int = 100,
-    n_periods: int = 500,
-    n_factors: int = 40,
-    seed: int = 42,
-    configs_to_run: Optional[List[str]] = None,
-    verbose: bool = True,
-) -> AblationResult:
-    """Run the full runtime ablation study on mock data."""
-    if verbose:
-        print("\nGenerating mock data for ablation study...")
-
-    from factorminer.benchmark.helix_benchmark import _build_mock_data_dict
-
-    data = _build_mock_data_dict(n_assets=n_assets, n_periods=n_periods, seed=seed)
-    T = list(data.values())[0].shape[1]
-    train_end = int(T * 0.7)
-
-    if verbose:
-        print(f"  Data: M={n_assets}, T={T}, train=0:{train_end}, test={train_end}:{T}")
-        cfgs = configs_to_run or list(ABLATION_CONFIGS.keys())
-        print(f"  Running {len(cfgs)} ablation configurations through real loops...")
-
-    study = AblationStudy(seed=seed, llm_provider=MockProvider())
-    result = study.run_ablation(
-        data=data,
-        train_period=(0, train_end),
-        test_period=(train_end, T),
-        n_factors=n_factors,
-        configs_to_run=configs_to_run,
-    )
-
-    if verbose:
-        study.print_summary(result)
-
-    return result
diff --git a/src/factorminer/factorminer/benchmark/catalogs.py b/src/factorminer/factorminer/benchmark/catalogs.py
deleted file mode 100644
index 66dc15f..0000000
--- a/src/factorminer/factorminer/benchmark/catalogs.py
+++ /dev/null
@@ -1,236 +0,0 @@
-"""Deterministic baseline formula catalogs for benchmark workflows."""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-import re
-from typing import Iterable
-
-import numpy as np
-
-from src.factorminer.factorminer.core.library_io import PAPER_FACTORS
-
-
-@dataclass(frozen=True)
-class CandidateEntry:
-    """One benchmark candidate formula."""
-
-    name: str
-    formula: str
-    category: str
-
-
-ALPHA101_CLASSIC: tuple[CandidateEntry, ...] = (
-    CandidateEntry(
-        "alpha101_close_return_rank",
-        "Neg(CsRank(Return($close, 5)))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_intraday_position",
-        "CsRank(Div(Sub($close, $open), Add(Sub($high, $low), 1e-8)))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_volume_reversal",
-        "Neg(CsRank(Mul(Return($close, 5), Div($volume, Mean($volume, 20)))))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_vwap_gap",
-        "Neg(CsRank(Div(Sub($close, $vwap), Add($vwap, 1e-8))))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_price_volume_corr",
-        "Neg(CsRank(Corr(CsRank($close), CsRank($volume), 10)))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_range_volatility",
-        "Neg(CsRank(Std(Div(Sub($high, $low), Add($close, 1e-8)), 20)))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_close_vs_mean",
-        "Neg(CsRank(Div(Sub($close, Mean($close, 10)), Add(Std($close, 10), 1e-8))))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_turnover_rank",
-        "Neg(CsRank(Div($amt, Add(Mean($amt, 20), 1e-8))))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_return_skew",
-        "Neg(CsRank(Skew(Return($close, 1), 20)))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_trend_strength",
-        "CsRank(TsRank(Return($close, 1), 20))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_volume_std",
-        "Neg(CsRank(Div(Std($volume, 20), Add(Mean($volume, 20), 1e-8))))",
-        "Alpha101 Classic",
-    ),
-    CandidateEntry(
-        "alpha101_amount_momentum",
-        "CsRank(Mul(Return($close, 10), Div($amt, Add(Mean($amt, 20), 1e-8))))",
-        "Alpha101 Classic",
-    ),
-)
-
-_WINDOW_PATTERN = re.compile(r"\b(5|10|20|30)\b")
-
-
-def build_alpha101_adapted() -> list[CandidateEntry]:
-    """Expand the classic catalog into frequency-adapted window variants."""
-    variants: list[CandidateEntry] = []
-    windows = (3, 6, 12, 24, 48)
-    for entry in ALPHA101_CLASSIC:
-        for window in windows:
-            formula = _WINDOW_PATTERN.sub(str(window), entry.formula)
-            variants.append(
-                CandidateEntry(
-                    name=f"{entry.name}_w{window}",
-                    formula=formula,
-                    category="Alpha101 Adapted",
-                )
-            )
-    return variants
-
-
-def build_random_exploration(seed: int, count: int = 160) -> list[CandidateEntry]:
-    """Generate deterministic random-formula candidates from safe templates."""
-    rng = np.random.RandomState(seed)
-    unary_templates = [
-        "Neg(CsRank(Return({feat}, {w1})))",
-        "CsRank(TsRank({feat}, {w1}))",
-        "Neg(CsRank(Div(Sub({feat}, Mean({feat}, {w1})), Add(Std({feat}, {w1}), 1e-8))))",
-        "CsRank(Div(Std({feat}, {w1}), Add(Mean({feat}, {w2}), 1e-8)))",
-        "Neg(CsRank(Skew({feat}, {w1})))",
-    ]
-    binary_templates = [
-        "Neg(CsRank(Corr(CsRank({feat_a}), CsRank({feat_b}), {w1})))",
-        "CsRank(Div(Sub({feat_a}, {feat_b}), Add(Std({feat_b}, {w1}), 1e-8)))",
-        "Neg(CsRank(Mul(Return({feat_a}, {w1}), Div({feat_b}, Add(Mean({feat_b}, {w2}), 1e-8)))))",
-        "CsRank(Cov({feat_a}, {feat_b}, {w1}))",
-        "Neg(CsRank(Div(Sub(EMA({feat_a}, {w1}), EMA({feat_b}, {w2})), Add(Std({feat_a}, {w1}), 1e-8))))",
-    ]
-    features = ("$open", "$high", "$low", "$close", "$volume", "$amt", "$vwap", "$returns")
-    windows = (3, 5, 10, 20, 30, 48)
-
-    entries: list[CandidateEntry] = []
-    for idx in range(count):
-        use_binary = bool(rng.randint(0, 2))
-        if use_binary:
-            template = binary_templates[rng.randint(0, len(binary_templates))]
-            feat_a, feat_b = rng.choice(features, size=2, replace=False)
-            formula = template.format(
-                feat_a=feat_a,
-                feat_b=feat_b,
-                w1=int(rng.choice(windows)),
-                w2=int(rng.choice(windows)),
-            )
-        else:
-            template = unary_templates[rng.randint(0, len(unary_templates))]
-            formula = template.format(
-                feat=rng.choice(features),
-                w1=int(rng.choice(windows)),
-                w2=int(rng.choice(windows)),
-            )
-        entries.append(
-            CandidateEntry(
-                name=f"random_exploration_{idx:03d}",
-                formula=formula,
-                category="Random Exploration",
-            )
-        )
-    return entries
-
-
-def build_gplearn_style(seed: int, count: int = 160) -> list[CandidateEntry]:
-    """Build deeper deterministic mutation chains that mimic GP search."""
-    base = build_random_exploration(seed + 17, count=max(count, 64))
-    rng = np.random.RandomState(seed + 23)
-    entries: list[CandidateEntry] = []
-    for idx in range(count):
-        left = base[idx % len(base)].formula
-        right = base[rng.randint(0, len(base))].formula
-        if idx % 3 == 0:
-            formula = f"Neg(CsRank(Add({left}, {right})))"
-        elif idx % 3 == 1:
-            formula = f"CsRank(Div(Sub({left}, {right}), Add(Std($close, 10), 1e-8)))"
-        else:
-            formula = f"Neg(CsRank(Mul({left}, {right})))"
-        entries.append(
-            CandidateEntry(
-                name=f"gplearn_style_{idx:03d}",
-                formula=formula,
-                category="GPLearn",
-            )
-        )
-    return entries
-
-
-def build_alphaforge_style() -> list[CandidateEntry]:
-    """Reuse a diverse subset of the paper catalog for dynamic-combine baselines."""
-    entries: list[CandidateEntry] = []
-    for idx, factor in enumerate(PAPER_FACTORS[::2][:80]):
-        entries.append(
-            CandidateEntry(
-                name=f"alphaforge_style_{idx:03d}",
-                formula=factor["formula"],
-                category="AlphaForge-style",
-            )
-        )
-    return entries
-
-
-def build_alphaagent_style() -> list[CandidateEntry]:
-    """Reuse an alternate paper-catalog slice for LLM-style baseline proposals."""
-    entries: list[CandidateEntry] = []
-    for idx, factor in enumerate(PAPER_FACTORS[1::2][:80]):
-        entries.append(
-            CandidateEntry(
-                name=f"alphaagent_style_{idx:03d}",
-                formula=factor["formula"],
-                category="AlphaAgent-style",
-            )
-        )
-    return entries
-
-
-def build_factor_miner_catalog() -> list[CandidateEntry]:
-    """Expose the full paper factor catalog as benchmark candidates."""
-    return [
-        CandidateEntry(
-            name=f"factor_miner_{idx + 1:03d}",
-            formula=factor["formula"],
-            category=factor["category"],
-        )
-        for idx, factor in enumerate(PAPER_FACTORS)
-    ]
-
-
-def entries_from_library(library) -> list[CandidateEntry]:
-    """Convert a saved FactorLibrary into benchmark candidate entries."""
-    return [
-        CandidateEntry(name=factor.name, formula=factor.formula, category=factor.category)
-        for factor in library.list_factors()
-    ]
-
-
-def dedupe_entries(entries: Iterable[CandidateEntry]) -> list[CandidateEntry]:
-    """Remove duplicate formulas while preserving order."""
-    seen: set[str] = set()
-    unique: list[CandidateEntry] = []
-    for entry in entries:
-        if entry.formula in seen:
-            continue
-        seen.add(entry.formula)
-        unique.append(entry)
-    return unique
diff --git a/src/factorminer/factorminer/benchmark/helix_benchmark.py b/src/factorminer/factorminer/benchmark/helix_benchmark.py
deleted file mode 100644
index d92df09..0000000
--- a/src/factorminer/factorminer/benchmark/helix_benchmark.py
+++ /dev/null
@@ -1,2172 +0,0 @@
-"""HelixBenchmark — rigorous comparison of HelixFactor vs FactorMiner.
-
-Provides five inter-operating classes that together form a complete
-benchmarking suite for the HelixFactor vs FactorMiner (Ralph Loop) paper:
-
-  HelixBenchmark          — main comparison class (Table 1 style)
-  StatisticalComparisonTests — DM test, paired t-test, block bootstrap
-  SpeedBenchmark          — operator / factor / pipeline timing
-  BenchmarkResult         — aggregate result container + report generators
-  DMTestResult / MethodResult — individual result containers
-
-CLI usage:
-  python -m factorminer.benchmark.helix_benchmark --mock --n-factors 40 --output results/
-"""
-
-from __future__ import annotations
-
-import copy
-import argparse
-import json
-import logging
-import math
-import sys
-import time
-import warnings
-from dataclasses import dataclass, field, asdict
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-import pandas as pd
-from scipy.stats import ttest_rel, wilcoxon
-
-warnings.filterwarnings("ignore")
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Serialization helpers
-# ---------------------------------------------------------------------------
-
-def _json_safe(value: Any) -> Any:
-    """Recursively convert a structure into JSON-safe primitives."""
-    if isinstance(value, dict):
-        return {str(k): _json_safe(v) for k, v in value.items()}
-    if isinstance(value, list):
-        return [_json_safe(v) for v in value]
-    if isinstance(value, tuple):
-        return [_json_safe(v) for v in value]
-    if isinstance(value, np.generic):
-        value = value.item()
-    if isinstance(value, float) and not math.isfinite(value):
-        return None
-    return value
-
-
-# ---------------------------------------------------------------------------
-# Result containers
-# ---------------------------------------------------------------------------
-
-@dataclass
-class MethodResult:
-    """Metrics for a single method run."""
-
-    method: str
-    library_ic: float = 0.0
-    library_icir: float = 0.0
-    avg_abs_rho: float = 0.0
-    ew_ic: float = 0.0
-    ew_icir: float = 0.0
-    icw_ic: float = 0.0
-    icw_icir: float = 0.0
-    lasso_ic: float = 0.0
-    lasso_icir: float = 0.0
-    xgb_ic: float = 0.0
-    xgb_icir: float = 0.0
-    n_factors: int = 0
-    admission_rate: float = 0.0
-    elapsed_seconds: float = 0.0
-    avg_turnover: float = 0.0
-    # raw IC series for statistical tests (not serialized by default)
-    ic_series: Optional[np.ndarray] = field(default=None, repr=False)
-    run_id: int = 0
-
-    def to_dict(self) -> dict:
-        d = asdict(self)
-        d.pop("ic_series", None)
-        return d
-
-
-@dataclass
-class DMTestResult:
-    """Diebold-Mariano test for forecast accuracy difference."""
-
-    dm_statistic: float
-    p_value: float
-    is_significant: bool
-    direction: str   # "helix_better", "ralph_better", "no_difference"
-    n_obs: int
-
-
-@dataclass
-class AblationResult:
-    """Result of one ablation study."""
-
-    configs: List[str]
-    results: Dict[str, MethodResult]
-    contributions: Optional[pd.DataFrame] = None
-
-    def to_dict(self) -> dict:
-        return {
-            "configs": self.configs,
-            "results": {k: v.to_dict() for k, v in self.results.items()},
-        }
-
-
-@dataclass
-class OperatorSpeedResult:
-    """Timing for individual operators."""
-
-    operator_timings_ms: Dict[str, float]   # operator_name -> ms
-    n_assets: int
-    n_periods: int
-    n_repeats: int
-
-
-@dataclass
-class PipelineSpeedResult:
-    """Timing for end-to-end pipeline."""
-
-    total_seconds: float
-    candidates_per_second: float
-    n_candidates: int
-
-
-@dataclass
-class BenchmarkResult:
-    """Aggregate benchmark results — all methods, all metrics."""
-
-    methods: List[str]
-    factor_library_metrics: pd.DataFrame    # IC, ICIR, Avg|rho| per method
-    combination_metrics: pd.DataFrame       # EW/ICW IC and ICIR
-    selection_metrics: pd.DataFrame         # LASSO, XGBoost
-    speed_metrics: pd.DataFrame
-    statistical_tests: Dict[str, Any]
-    ablation_result: Optional[AblationResult] = None
-    raw_method_results: Dict[str, List[MethodResult]] = field(default_factory=dict)
-    turnover_metrics: pd.DataFrame = field(default_factory=pd.DataFrame)
-    cost_pressure_metrics: pd.DataFrame = field(default_factory=pd.DataFrame)
-    runtime_artifacts: Dict[str, Any] = field(default_factory=dict)
-
-    # ------------------------------------------------------------------
-    # Formatting helpers
-    # ------------------------------------------------------------------
-
-    def to_latex_table(self) -> str:
-        """Generate a LaTeX table matching paper Table 1 style."""
-        lines = [
-            r"\begin{table}[htbp]",
-            r"\centering",
-            r"\caption{HelixFactor vs FactorMiner: Comprehensive Benchmark (Table 1 Style)}",
-            r"\label{tab:benchmark}",
-            r"\small",
-            r"\begin{tabular}{lcccccccc}",
-            r"\toprule",
-            r"Method & \multicolumn{3}{c}{Factor Library} & \multicolumn{2}{c}{EW Combo} & \multicolumn{2}{c}{ICW Combo} & Sel.IC \\",
-            r"\cmidrule(lr){2-4}\cmidrule(lr){5-6}\cmidrule(lr){7-8}",
-            r" & IC(\%) & ICIR & Avg$|\rho|$ & IC(\%) & ICIR & IC(\%) & ICIR & IC(\%) \\",
-            r"\midrule",
-        ]
-
-        for method in self.methods:
-            lib_row = self.factor_library_metrics[
-                self.factor_library_metrics["method"] == method
-            ]
-            comb_row = self.combination_metrics[
-                self.combination_metrics["method"] == method
-            ]
-            sel_row = self.selection_metrics[
-                self.selection_metrics["method"] == method
-            ]
-
-            def _g(df, col, mult=100.0):
-                if df.empty or col not in df.columns:
-                    return 0.0
-                v = df.iloc[0][col]
-                return float(v) * mult if not pd.isna(v) else 0.0
-
-            bold = method in ("helix_phase2",)
-            fmt = lambda x, d=2: f"{x:.{d}f}"
-
-            lib_ic = _g(lib_row, "ic_pct", 1.0)
-            lib_icir = _g(lib_row, "icir", 1.0)
-            lib_rho = _g(lib_row, "avg_abs_rho", 1.0)
-            ew_ic = _g(comb_row, "ew_ic_pct", 1.0)
-            ew_icir = _g(comb_row, "ew_icir", 1.0)
-            icw_ic = _g(comb_row, "icw_ic_pct", 1.0)
-            icw_icir = _g(comb_row, "icw_icir", 1.0)
-            sel_ic = _g(sel_row, "best_ic_pct", 1.0)
-
-            row_parts = [
-                method.replace("_", r"\_"),
-                fmt(lib_ic),
-                fmt(lib_icir),
-                fmt(lib_rho),
-                fmt(ew_ic),
-                fmt(ew_icir),
-                fmt(icw_ic),
-                fmt(icw_icir),
-                fmt(sel_ic),
-            ]
-            if bold:
-                row_parts = [r"\textbf{" + p + r"}" for p in row_parts]
-
-            lines.append(" & ".join(row_parts) + r" \\")
-
-        lines += [
-            r"\bottomrule",
-            r"\end{tabular}",
-            r"\end{table}",
-        ]
-        return "\n".join(lines)
-
-    def to_markdown_table(self) -> str:
-        """Generate a Markdown table for GitHub README."""
-        header = (
-            "| Method | IC (%) | ICIR | Avg|ρ| | EW IC (%) | EW ICIR | "
-            "ICW IC (%) | ICW ICIR | Las IC (%) | XGB IC (%) |\n"
-            "|--------|--------|------|---------|-----------|---------|"
-            "-----------|----------|-----------|------------|\n"
-        )
-        rows = []
-        for method in self.methods:
-            lib_row = self.factor_library_metrics[
-                self.factor_library_metrics["method"] == method
-            ]
-            comb_row = self.combination_metrics[
-                self.combination_metrics["method"] == method
-            ]
-            sel_row = self.selection_metrics[
-                self.selection_metrics["method"] == method
-            ]
-
-            def _g(df, col):
-                if df.empty or col not in df.columns:
-                    return 0.0
-                v = df.iloc[0][col]
-                return float(v) if not pd.isna(v) else 0.0
-
-            tag = " **" if method == "helix_phase2" else ""
-            rows.append(
-                f"| {method}{tag} | "
-                f"{_g(lib_row,'ic_pct'):.2f} | "
-                f"{_g(lib_row,'icir'):.3f} | "
-                f"{_g(lib_row,'avg_abs_rho'):.3f} | "
-                f"{_g(comb_row,'ew_ic_pct'):.2f} | "
-                f"{_g(comb_row,'ew_icir'):.3f} | "
-                f"{_g(comb_row,'icw_ic_pct'):.2f} | "
-                f"{_g(comb_row,'icw_icir'):.3f} | "
-                f"{_g(sel_row,'lasso_ic_pct'):.2f} | "
-                f"{_g(sel_row,'xgb_ic_pct'):.2f} |\n"
-            )
-        return header + "".join(rows)
-
-    def plot_comparison(self, save_path: str) -> None:
-        """Generate bar chart comparison (requires matplotlib)."""
-        try:
-            import matplotlib.pyplot as plt
-            import matplotlib.patches as mpatches
-        except ImportError:
-            logger.warning("matplotlib not available; skipping plot")
-            return
-
-        metrics = ["IC (%)", "ICIR", "EW IC (%)", "ICW IC (%)"]
-        method_colors = {
-            "random_exploration": "#aaaaaa",
-            "alpha101_classic": "#6baed6",
-            "alpha101_adapted": "#3182bd",
-            "ralph_loop": "#fd8d3c",
-            "helix_phase2": "#31a354",
-        }
-
-        fig, axes = plt.subplots(1, len(metrics), figsize=(16, 5))
-        fig.suptitle("HelixFactor vs FactorMiner Benchmark", fontsize=14, fontweight="bold")
-
-        for ax_idx, metric in enumerate(metrics):
-            ax = axes[ax_idx]
-            values = []
-            colors = []
-            labels = []
-
-            for method in self.methods:
-                color = method_colors.get(method, "#888888")
-                if metric == "IC (%)":
-                    row = self.factor_library_metrics[
-                        self.factor_library_metrics["method"] == method
-                    ]
-                    v = float(row["ic_pct"].iloc[0]) if not row.empty else 0.0
-                elif metric == "ICIR":
-                    row = self.factor_library_metrics[
-                        self.factor_library_metrics["method"] == method
-                    ]
-                    v = float(row["icir"].iloc[0]) if not row.empty else 0.0
-                elif metric == "EW IC (%)":
-                    row = self.combination_metrics[
-                        self.combination_metrics["method"] == method
-                    ]
-                    v = float(row["ew_ic_pct"].iloc[0]) if not row.empty else 0.0
-                elif metric == "ICW IC (%)":
-                    row = self.combination_metrics[
-                        self.combination_metrics["method"] == method
-                    ]
-                    v = float(row["icw_ic_pct"].iloc[0]) if not row.empty else 0.0
-                else:
-                    v = 0.0
-
-                values.append(v)
-                colors.append(color)
-                labels.append(method.replace("_", "\n"))
-
-            bars = ax.bar(range(len(values)), values, color=colors, alpha=0.85, edgecolor="white")
-            ax.set_xticks(range(len(labels)))
-            ax.set_xticklabels(labels, fontsize=7, rotation=30, ha="right")
-            ax.set_title(metric, fontsize=10)
-            ax.grid(axis="y", alpha=0.3)
-            ax.spines["top"].set_visible(False)
-            ax.spines["right"].set_visible(False)
-
-        plt.tight_layout()
-        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
-        plt.savefig(save_path, dpi=150, bbox_inches="tight")
-        plt.close()
-        logger.info("Saved comparison plot to %s", save_path)
-
-    def generate_full_report(self, save_path: str) -> None:
-        """Generate a complete HTML report with all results."""
-        html = self._build_html_report()
-        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
-        with open(save_path, "w") as f:
-            f.write(html)
-        logger.info("Saved full HTML report to %s", save_path)
-
-    def _build_html_report(self) -> str:
-        lib_html = self.factor_library_metrics.to_html(index=False, float_format="{:.4f}".format)
-        comb_html = self.combination_metrics.to_html(index=False, float_format="{:.4f}".format)
-        sel_html = self.selection_metrics.to_html(index=False, float_format="{:.4f}".format)
-        speed_html = self.speed_metrics.to_html(index=False, float_format="{:.3f}".format)
-        turnover_html = ""
-        if not self.turnover_metrics.empty:
-            turnover_html = self.turnover_metrics.to_html(index=False, float_format="{:.4f}".format)
-        cost_html = ""
-        if not self.cost_pressure_metrics.empty:
-            cost_html = self.cost_pressure_metrics.to_html(index=False, float_format="{:.4f}".format)
-
-        stat_rows = []
-        for k, v in self.statistical_tests.items():
-            if isinstance(v, dict):
-                for sk, sv in v.items():
-                    stat_rows.append(f"<tr><td>{k}.{sk}</td><td>{sv}</td></tr>")
-            else:
-                stat_rows.append(f"<tr><td>{k}</td><td>{v}</td></tr>")
-        stat_html = (
-            "<table border='1'><tr><th>Test</th><th>Result</th></tr>"
-            + "".join(stat_rows)
-            + "</table>"
-        )
-
-        ablation_html = ""
-        if self.ablation_result is not None and self.ablation_result.contributions is not None:
-            ablation_html = (
-                "<h2>Ablation Study</h2>"
-                + self.ablation_result.contributions.to_html(
-                    index=False, float_format="{:.4f}".format
-                )
-            )
-
-        css = """
-        body { font-family: Arial, sans-serif; margin: 40px; background: #f8f9fa; color: #333; }
-        h1 { color: #1a5276; border-bottom: 3px solid #1a5276; padding-bottom: 8px; }
-        h2 { color: #2c3e50; margin-top: 30px; }
-        table { border-collapse: collapse; width: 100%; margin-bottom: 20px; }
-        th { background: #1a5276; color: white; padding: 8px 12px; text-align: left; }
-        td { padding: 6px 12px; border-bottom: 1px solid #ddd; }
-        tr:nth-child(even) { background: #f2f2f2; }
-        tr:hover { background: #d6eaf8; }
-        .helix-row { background: #d5f5e3 !important; font-weight: bold; }
-        .summary-box { background: #eaf2ff; border-left: 5px solid #1a5276;
-                        padding: 15px; margin: 20px 0; border-radius: 4px; }
-        """
-
-        return f"""<!DOCTYPE html>
-<html><head><meta charset="utf-8"><title>HelixFactor Benchmark Report</title>
-<style>{css}</style></head><body>
-<h1>HelixFactor Benchmark Report</h1>
-<div class="summary-box">
-<strong>Methods evaluated:</strong> {", ".join(self.methods)}<br>
-<strong>Generated:</strong> {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")}
-</div>
-<h2>Factor Library Metrics</h2>{lib_html}
-<h2>Factor Combination Metrics</h2>{comb_html}
-<h2>Factor Selection Metrics</h2>{sel_html}
-<h2>Speed Benchmarks</h2>{speed_html}
-{f'<h2>Turnover</h2>{turnover_html}' if turnover_html else ''}
-{f'<h2>Cost Pressure</h2>{cost_html}' if cost_html else ''}
-<h2>Statistical Tests</h2>{stat_html}
-{ablation_html}
-</body></html>"""
-
-
-# ---------------------------------------------------------------------------
-# Statistical tests
-# ---------------------------------------------------------------------------
-
-class StatisticalComparisonTests:
-    """Rigorous statistical comparison between HelixFactor and FactorMiner.
-
-    Implements four complementary tests:
-      1. Diebold-Mariano (DM) test for forecast accuracy differences
-      2. Paired t-test on IC(Helix) − IC(Ralph) across test period
-      3. Block-bootstrap 95% CI on IC difference
-      4. Wilcoxon signed-rank test (non-parametric)
-    """
-
-    def __init__(self, seed: int = 42) -> None:
-        self._rng = np.random.RandomState(seed)
-
-    @staticmethod
-    def _paired_valid_series(
-        ic_series_1: np.ndarray,
-        ic_series_2: np.ndarray,
-    ) -> Tuple[np.ndarray, np.ndarray]:
-        """Align paired series and drop rows with NaNs in either series."""
-        min_len = min(len(ic_series_1), len(ic_series_2))
-        s1 = np.asarray(ic_series_1[:min_len], dtype=np.float64)
-        s2 = np.asarray(ic_series_2[:min_len], dtype=np.float64)
-        mask = ~np.isnan(s1) & ~np.isnan(s2)
-        return s1[mask], s2[mask]
-
-    # ------------------------------------------------------------------
-    # Diebold-Mariano test
-    # ------------------------------------------------------------------
-
-    def diebold_mariano_test(
-        self,
-        ic_series_1: np.ndarray,
-        ic_series_2: np.ndarray,
-        h: int = 1,
-    ) -> DMTestResult:
-        """Diebold-Mariano test for forecast accuracy differences.
-
-        Tests H0: E[d_t] = 0 where d_t = L(e1_t) - L(e2_t) is the
-        differential loss. Uses the HAC-robust variance estimator with
-        bandwidth h-1 (Andrews, 1991).
-
-        Parameters
-        ----------
-        ic_series_1 : ndarray  (e.g. HelixFactor IC time series)
-        ic_series_2 : ndarray  (e.g. FactorMiner IC time series)
-        h : int
-            Forecast horizon (default 1 for one-step-ahead).
-
-        Returns
-        -------
-        DMTestResult
-        """
-        s1, s2 = self._paired_valid_series(ic_series_1, ic_series_2)
-        min_len = len(s1)
-        if min_len < 5:
-            return DMTestResult(
-                dm_statistic=0.0, p_value=1.0, is_significant=False,
-                direction="no_difference", n_obs=min_len,
-            )
-
-        # Loss differential: squared-error loss on IC as forecast of return
-        d = s1 ** 2 - s2 ** 2
-        T = len(d)
-        if np.allclose(d, 0.0):
-            return DMTestResult(
-                dm_statistic=0.0,
-                p_value=1.0,
-                is_significant=False,
-                direction="no_difference",
-                n_obs=T,
-            )
-        d_bar = np.mean(d)
-
-        # HAC variance of d_bar (Newey-West with bandwidth h-1)
-        bandwidth = max(h - 1, 0)
-        gamma_0 = np.var(d, ddof=0)
-        if gamma_0 <= 0 or np.isnan(gamma_0):
-            return DMTestResult(
-                dm_statistic=0.0,
-                p_value=1.0,
-                is_significant=False,
-                direction="no_difference",
-                n_obs=T,
-            )
-        hac_var = gamma_0
-        for lag in range(1, bandwidth + 1):
-            gamma_k = np.mean(
-                (d[lag:] - d_bar) * (d[:-lag] - d_bar)
-            )
-            hac_var += 2.0 * (1.0 - lag / (bandwidth + 1)) * gamma_k
-
-        if hac_var <= 0 or np.isnan(hac_var):
-            return DMTestResult(
-                dm_statistic=0.0,
-                p_value=1.0,
-                is_significant=False,
-                direction="no_difference",
-                n_obs=T,
-            )
-
-        dm_stat = d_bar / np.sqrt(hac_var / T)
-        if not np.isfinite(dm_stat):
-            return DMTestResult(
-                dm_statistic=0.0,
-                p_value=1.0,
-                is_significant=False,
-                direction="no_difference",
-                n_obs=T,
-            )
-
-        # Two-sided p-value using normal approximation
-        from scipy.stats import norm
-        p_value = 2.0 * (1.0 - float(norm.cdf(abs(dm_stat))))
-        if not np.isfinite(p_value):
-            return DMTestResult(
-                dm_statistic=0.0,
-                p_value=1.0,
-                is_significant=False,
-                direction="no_difference",
-                n_obs=T,
-            )
-
-        if abs(dm_stat) < 1.96:
-            direction = "no_difference"
-        elif d_bar > 0:
-            # series_1 has higher loss, series_2 is better
-            direction = "ralph_better"
-        else:
-            direction = "helix_better"
-
-        return DMTestResult(
-            dm_statistic=float(dm_stat),
-            p_value=float(p_value),
-            is_significant=p_value < 0.05,
-            direction=direction,
-            n_obs=T,
-        )
-
-    # ------------------------------------------------------------------
-    # Paired t-test
-    # ------------------------------------------------------------------
-
-    def paired_t_test(
-        self,
-        ic_series_1: np.ndarray,
-        ic_series_2: np.ndarray,
-    ) -> dict:
-        """Paired t-test on IC difference series."""
-        s1, s2 = self._paired_valid_series(ic_series_1, ic_series_2)
-        n = len(s1)
-        if n < 5:
-            return {"t_stat": 0.0, "p_value": 1.0, "mean_diff": 0.0, "n": n}
-
-        t_stat, p_value = ttest_rel(s1, s2)
-        if not np.isfinite(t_stat) or not np.isfinite(p_value):
-            return {"t_stat": 0.0, "p_value": 1.0, "mean_diff": 0.0, "n": n}
-        return {
-            "t_stat": float(t_stat),
-            "p_value": float(p_value),
-            "mean_diff": float(np.mean(s1 - s2)),
-            "n": n,
-        }
-
-    # ------------------------------------------------------------------
-    # Block bootstrap CI
-    # ------------------------------------------------------------------
-
-    def bootstrap_ic_difference_ci(
-        self,
-        ic_series_1: np.ndarray,
-        ic_series_2: np.ndarray,
-        n_bootstrap: int = 1000,
-        block_size: int = 20,
-    ) -> Tuple[float, float]:
-        """95% block-bootstrap CI on mean IC difference.
-
-        Returns
-        -------
-        (lower_95, upper_95) : tuple of float
-        """
-        s1, s2 = self._paired_valid_series(ic_series_1, ic_series_2)
-        n = len(s1)
-        if n < 5:
-            return (0.0, 0.0)
-        diff = s1 - s2
-
-        # Circular block bootstrap
-        block_size = min(block_size, n // 2)
-        block_size = max(block_size, 1)
-        n_blocks = int(math.ceil(n / block_size))
-        boot_means = np.empty(n_bootstrap)
-
-        for i in range(n_bootstrap):
-            starts = self._rng.randint(0, n - block_size + 1, size=n_blocks)
-            indices = np.concatenate(
-                [np.arange(s, s + block_size) for s in starts]
-            )[:n]
-            boot_means[i] = diff[indices].mean()
-
-        return (
-            float(np.percentile(boot_means, 2.5)),
-            float(np.percentile(boot_means, 97.5)),
-        )
-
-    # ------------------------------------------------------------------
-    # Wilcoxon signed-rank test
-    # ------------------------------------------------------------------
-
-    def wilcoxon_test(
-        self,
-        ic_series_1: np.ndarray,
-        ic_series_2: np.ndarray,
-    ) -> dict:
-        """Wilcoxon signed-rank test (non-parametric) on IC pairs."""
-        s1, s2 = self._paired_valid_series(ic_series_1, ic_series_2)
-        n = len(s1)
-        if n < 5:
-            return {"statistic": 0.0, "p_value": 1.0, "n": n}
-        try:
-            stat, p_value = wilcoxon(s1, s2, alternative="two-sided")
-        except Exception:
-            stat, p_value = 0.0, 1.0
-
-        return {"statistic": float(stat), "p_value": float(p_value), "n": n}
-
-    # ------------------------------------------------------------------
-    # Combined report
-    # ------------------------------------------------------------------
-
-    def run_all_tests(
-        self,
-        ic_helix: np.ndarray,
-        ic_ralph: np.ndarray,
-    ) -> dict:
-        """Run all four statistical tests and return combined results."""
-        dm = self.diebold_mariano_test(ic_helix, ic_ralph)
-        tt = self.paired_t_test(ic_helix, ic_ralph)
-        ci_lo, ci_hi = self.bootstrap_ic_difference_ci(ic_helix, ic_ralph)
-        wil = self.wilcoxon_test(ic_helix, ic_ralph)
-        valid_helix, valid_ralph = self._paired_valid_series(ic_helix, ic_ralph)
-        mean_diff = float(np.mean(valid_helix - valid_ralph)) if len(valid_helix) else 0.0
-        return {
-            "diebold_mariano": {
-                "dm_stat": dm.dm_statistic,
-                "p_value": dm.p_value,
-                "significant": dm.is_significant,
-                "direction": dm.direction,
-                "n_obs": dm.n_obs,
-            },
-            "paired_t_test": tt,
-            "bootstrap_ci_95": {
-                "lower": ci_lo,
-                "upper": ci_hi,
-                "excludes_zero": ci_lo > 0 or ci_hi < 0,
-            },
-            "wilcoxon": wil,
-            "mean_ic_difference": mean_diff,
-            "helix_outperforms": mean_diff > 0,
-        }
-
-
-# ---------------------------------------------------------------------------
-# Speed Benchmark
-# ---------------------------------------------------------------------------
-
-class SpeedBenchmark:
-    """Benchmark factor evaluation speed across operators and pipelines."""
-
-    def __init__(self, seed: int = 42) -> None:
-        self._rng = np.random.RandomState(seed)
-
-    def _time_callable(self, fn, n_repeats: int = 5, warmup: int = 1) -> float:
-        """Return minimum time over n_repeats (ms) after warmup runs."""
-        for _ in range(warmup):
-            try:
-                fn()
-            except Exception:
-                pass
-        timings = []
-        for _ in range(n_repeats):
-            t0 = time.perf_counter()
-            try:
-                fn()
-            except Exception:
-                pass
-            timings.append((time.perf_counter() - t0) * 1000.0)
-        return float(np.min(timings)) if timings else 0.0
-
-    def run_operator_benchmark(
-        self,
-        n_assets: int = 500,
-        n_periods: int = 2000,
-        n_repeats: int = 5,
-    ) -> OperatorSpeedResult:
-        """Benchmark individual operators (numpy backend)."""
-        rng = np.random.RandomState(self._rng.randint(0, 9999))
-        X = rng.randn(n_assets, n_periods).astype(np.float64)
-        Y = rng.randn(n_assets, n_periods).astype(np.float64)
-
-        from scipy.stats import rankdata
-
-        def _ts_rank(mat, window=20):
-            out = np.full_like(mat, np.nan)
-            for t in range(window - 1, mat.shape[1]):
-                slc = mat[:, t - window + 1: t + 1]
-                for i in range(mat.shape[0]):
-                    r = rankdata(slc[i])
-                    out[i, t] = r[-1] / window
-            return out
-
-        def _cs_rank(mat):
-            out = np.full_like(mat, np.nan)
-            for t in range(mat.shape[1]):
-                col = mat[:, t]
-                valid = ~np.isnan(col)
-                if valid.sum() > 0:
-                    out[valid, t] = rankdata(col[valid]) / valid.sum()
-            return out
-
-        def _ts_std(mat, window=20):
-            out = np.full_like(mat, np.nan)
-            for t in range(window - 1, mat.shape[1]):
-                slc = mat[:, t - window + 1: t + 1]
-                out[:, t] = np.std(slc, axis=1, ddof=1)
-            return out
-
-        def _ts_corr(x, y, window=20):
-            out = np.full_like(x, np.nan)
-            for t in range(window - 1, x.shape[1]):
-                sx = x[:, t - window + 1: t + 1]
-                sy = y[:, t - window + 1: t + 1]
-                xs = sx - sx.mean(axis=1, keepdims=True)
-                ys = sy - sy.mean(axis=1, keepdims=True)
-                denom = np.sqrt((xs**2).sum(axis=1) * (ys**2).sum(axis=1))
-                safe = denom > 1e-12
-                out[safe, t] = ((xs * ys).sum(axis=1) / denom)[safe]
-            return out
-
-        # Use small sub-matrix for timing (keep fast)
-        X_s = X[:50, :100]
-        Y_s = Y[:50, :100]
-
-        ops = {
-            "TsRank(w=20)": lambda: _ts_rank(X_s, window=20),
-            "CsRank": lambda: _cs_rank(X_s),
-            "TsStd(w=20)": lambda: _ts_std(X_s, window=20),
-            "TsCorr(w=20)": lambda: _ts_corr(X_s, Y_s, window=20),
-            "TsMean(w=20)": lambda: np.lib.stride_tricks.sliding_window_view(X_s, 20, axis=1).mean(axis=-1),
-            "CsZscore": lambda: (X_s - X_s.mean(axis=0)) / (X_s.std(axis=0) + 1e-8),
-        }
-
-        timings: Dict[str, float] = {}
-        for name, fn in ops.items():
-            timings[name] = self._time_callable(fn, n_repeats=n_repeats)
-
-        return OperatorSpeedResult(
-            operator_timings_ms=timings,
-            n_assets=n_assets,
-            n_periods=n_periods,
-            n_repeats=n_repeats,
-        )
-
-    def run_full_pipeline_benchmark(
-        self,
-        n_candidates: int = 200,
-        data: Optional[dict] = None,
-    ) -> PipelineSpeedResult:
-        """Benchmark end-to-end candidate evaluation pipeline."""
-        if data is None:
-            data = _build_mock_data_dict(n_assets=100, n_periods=200, seed=42)
-
-        from factorminer.benchmark.catalogs import build_random_exploration
-        from factorminer.core.parser import try_parse
-        from factorminer.evaluation.metrics import compute_ic, compute_ic_mean
-
-        entries = build_random_exploration(seed=99, count=n_candidates)
-        returns = data.get("forward_returns", data.get("$close"))
-        if returns is None:
-            returns = np.random.randn(*list(data.values())[0].shape) * 0.01
-
-        t0 = time.perf_counter()
-        succeeded = 0
-        for entry in entries[:n_candidates]:
-            tree = try_parse(entry.formula)
-            if tree is None:
-                continue
-            try:
-                signals = tree.evaluate(data)
-                ic = compute_ic(signals, returns)
-                _ = compute_ic_mean(ic)
-                succeeded += 1
-            except Exception:
-                pass
-        elapsed = time.perf_counter() - t0
-
-        return PipelineSpeedResult(
-            total_seconds=elapsed,
-            candidates_per_second=succeeded / max(elapsed, 1e-6),
-            n_candidates=n_candidates,
-        )
-
-    def generate_speed_table(
-        self,
-        op_result: OperatorSpeedResult,
-        pipeline_result: PipelineSpeedResult,
-    ) -> str:
-        """Generate a LaTeX table of speed results."""
-        lines = [
-            r"\begin{table}[htbp]",
-            r"\centering",
-            r"\caption{Computational Efficiency Benchmark}",
-            r"\begin{tabular}{lrr}",
-            r"\toprule",
-            r"Operator / Task & Time (ms) & Relative \\",
-            r"\midrule",
-        ]
-        timings = op_result.operator_timings_ms
-        baseline = max(timings.values()) if timings else 1.0
-        for op, t in timings.items():
-            rel = t / baseline if baseline > 0 else 1.0
-            lines.append(rf"{op} & {t:.2f} & {rel:.2f}x \\")
-
-        lines.append(r"\midrule")
-        lines.append(
-            rf"Full pipeline ({pipeline_result.n_candidates} candidates) & "
-            rf"{pipeline_result.total_seconds * 1000:.0f} & -- \\"
-        )
-        lines.append(
-            rf"Throughput & {pipeline_result.candidates_per_second:.1f} cand/s & -- \\"
-        )
-        lines += [r"\bottomrule", r"\end{tabular}", r"\end{table}"]
-        return "\n".join(lines)
-
-
-# ---------------------------------------------------------------------------
-# Main HelixBenchmark
-# ---------------------------------------------------------------------------
-
-class HelixBenchmark:
-    """Rigorous comparison of HelixFactor vs FactorMiner (and baselines).
-
-    Baselines:
-      - Random Formula Exploration (RF): random type-correct trees
-      - Alpha101 Classic: original 101 formulaic alphas
-      - Alpha101 Adapted: parameter-tuned for 10-min bars
-      - FactorMiner (Ralph Loop): exact paper reproduction
-      - HelixFactor (Phase 2): full Phase 2 system
-
-    Metrics mirror paper Table 1:
-      - Factor Library: IC (%), ICIR, Avg|rho|
-      - Factor Combination: EW IC, EW ICIR, ICW IC, ICW ICIR
-      - Factor Selection: Lasso IC, XGBoost IC
-    """
-
-    METHOD_LABELS = {
-        "random_exploration": "RF (Rand)",
-        "alpha101_classic": "Alpha101 Classic",
-        "alpha101_adapted": "Alpha101 Adapted",
-        "ralph_loop": "FactorMiner (Ralph)",
-        "helix_phase2": "HelixFactor (Phase 2)",
-    }
-
-    def __init__(
-        self,
-        ic_threshold: float = 0.02,
-        correlation_threshold: float = 0.5,
-        seed: int = 42,
-    ) -> None:
-        self.ic_threshold = ic_threshold
-        self.correlation_threshold = correlation_threshold
-        self.seed = seed
-        self._stat_tests = StatisticalComparisonTests(seed=seed)
-        self._speed_bench = SpeedBenchmark(seed=seed)
-
-    # ------------------------------------------------------------------
-    # Public API
-    # ------------------------------------------------------------------
-
-    def run_comparison(
-        self,
-        data: dict,
-        train_period: Tuple[int, int],
-        test_period: Tuple[int, int],
-        n_target_factors: int = 40,
-        n_runs: int = 1,
-        methods: Optional[List[str]] = None,
-    ) -> BenchmarkResult:
-        """Run the full comparison benchmark.
-
-        Parameters
-        ----------
-        data : dict
-            Data dictionary mapping feature names to (M, T) arrays.
-            Must include ``"forward_returns"``.
-        train_period / test_period : (int, int)
-            Column index range [start, end) for the respective split.
-        n_target_factors : int
-            Number of factors to build each library to.
-        n_runs : int
-            Repetitions per method (for std estimates).
-        methods : list[str], optional
-            Subset of methods to run. Default: all five.
-
-        Returns
-        -------
-        BenchmarkResult
-        """
-        if methods is None:
-            methods = [
-                "random_exploration",
-                "alpha101_classic",
-                "alpha101_adapted",
-                "ralph_loop",
-                "helix_phase2",
-            ]
-
-        # Split data
-        train_data = _slice_data(data, *train_period)
-        test_data = _slice_data(data, *test_period)
-
-        raw_results: Dict[str, List[MethodResult]] = {}
-        for method in methods:
-            logger.info("Running method: %s", method)
-            method_runs: List[MethodResult] = []
-            for run_id in range(n_runs):
-                try:
-                    result = self.run_single_method(
-                        method=method,
-                        data=train_data,
-                        test_data=test_data,
-                        n_factors=n_target_factors,
-                        run_id=run_id,
-                    )
-                    method_runs.append(result)
-                except Exception as exc:
-                    logger.warning("Method %s run %d failed: %s", method, run_id, exc)
-                    method_runs.append(
-                        MethodResult(method=method, run_id=run_id)
-                    )
-            raw_results[method] = method_runs
-
-        # Average across runs
-        averaged = {
-            method: _average_method_results(runs)
-            for method, runs in raw_results.items()
-        }
-
-        # Build metric DataFrames
-        lib_df = _build_library_df(averaged, methods)
-        comb_df = _build_combination_df(averaged, methods)
-        sel_df = _build_selection_df(averaged, methods)
-
-        # Speed benchmark
-        speed_result = self._speed_bench.run_full_pipeline_benchmark(data=train_data)
-        op_result = self._speed_bench.run_operator_benchmark(n_repeats=3)
-        speed_df = _build_speed_df(op_result, speed_result)
-
-        # Statistical tests (Helix vs Ralph)
-        stat_tests = {}
-        helix_results = raw_results.get("helix_phase2", [])
-        ralph_results = raw_results.get("ralph_loop", [])
-
-        if helix_results and ralph_results:
-            h_ic = helix_results[0].ic_series
-            r_ic = ralph_results[0].ic_series
-            if h_ic is not None and r_ic is not None:
-                stat_tests = self._stat_tests.run_all_tests(h_ic, r_ic)
-            else:
-                # Create synthetic IC series from stored metrics
-                h_ic = _synthetic_ic_series(helix_results[0].library_ic, n=100, seed=self.seed)
-                r_ic = _synthetic_ic_series(ralph_results[0].library_ic, n=100, seed=self.seed + 1)
-                stat_tests = self._stat_tests.run_all_tests(h_ic, r_ic)
-
-        return BenchmarkResult(
-            methods=methods,
-            factor_library_metrics=lib_df,
-            combination_metrics=comb_df,
-            selection_metrics=sel_df,
-            speed_metrics=speed_df,
-            statistical_tests=stat_tests,
-            raw_method_results=raw_results,
-        )
-
-    def run_single_method(
-        self,
-        method: str,
-        data: dict,
-        test_data: dict,
-        n_factors: int,
-        run_id: int = 0,
-    ) -> MethodResult:
-        """Run one method and return its MethodResult.
-
-        Parameters
-        ----------
-        method : str
-            One of: 'ralph', 'helix', 'helix_phase2', 'rf',
-            'random_exploration', 'alpha101_classic', 'alpha101_adapted'.
-        """
-        t0 = time.perf_counter()
-
-        # Resolve aliases
-        method_key = {
-            "ralph": "ralph_loop",
-            "helix": "helix_phase2",
-            "rf": "random_exploration",
-            "alpha101": "alpha101_classic",
-        }.get(method, method)
-
-        candidates = self._get_candidates(method_key, n_factors=n_factors * 4)
-        returns = data.get("forward_returns")
-        test_returns = test_data.get("forward_returns")
-
-        if returns is None or test_returns is None:
-            logger.warning("forward_returns not found in data dict for method %s", method)
-            return MethodResult(method=method_key, run_id=run_id)
-
-        # Evaluate all candidates
-        factor_results = self._evaluate_candidates(candidates, data, returns)
-
-        # Build library from best candidates
-        library = self._build_library(factor_results, n_factors)
-
-        if not library:
-            return MethodResult(method=method_key, run_id=run_id)
-
-        # Compute library metrics on test data
-        test_factor_results = self._evaluate_candidates(
-            [(r["name"], r["formula"], r.get("category", "Unknown"))
-             for r in library],
-            test_data,
-            test_returns,
-        )
-
-        lib_ic, lib_icir, avg_rho, ic_series = self._library_metrics(
-            test_factor_results, test_returns
-        )
-
-        # Factor combination
-        ew_ic, ew_icir, icw_ic, icw_icir = self._combination_metrics(
-            test_factor_results, library, test_returns
-        )
-
-        # Factor selection
-        lasso_ic, lasso_icir = self._selection_metrics(
-            factor_results, library, data, returns, test_data, test_returns, "lasso"
-        )
-        xgb_ic, xgb_icir = self._selection_metrics(
-            factor_results, library, data, returns, test_data, test_returns, "xgboost"
-        )
-
-        elapsed = time.perf_counter() - t0
-
-        return MethodResult(
-            method=method_key,
-            library_ic=lib_ic,
-            library_icir=lib_icir,
-            avg_abs_rho=avg_rho,
-            ew_ic=ew_ic,
-            ew_icir=ew_icir,
-            icw_ic=icw_ic,
-            icw_icir=icw_icir,
-            lasso_ic=lasso_ic,
-            lasso_icir=lasso_icir,
-            xgb_ic=xgb_ic,
-            xgb_icir=xgb_icir,
-            n_factors=len(library),
-            admission_rate=len(library) / max(len(candidates), 1),
-            elapsed_seconds=elapsed,
-            ic_series=ic_series,
-            run_id=run_id,
-        )
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    def _get_candidates(self, method: str, n_factors: int) -> List[Tuple[str, str, str]]:
-        """Get candidate (name, formula, category) tuples for a method."""
-        # Import the catalogs module directly to avoid triggering the package
-        # __init__ chain which has an unresolved dependency on
-        # factorminer.agent.specialists.REGIME_SPECIALIST in some environments.
-        import importlib.util as _ilu, pathlib as _pl, sys as _sys
-        _cat_path = _pl.Path(__file__).parent / "catalogs.py"
-        if "src.factorminer.factorminer.benchmark.catalogs" not in _sys.modules:
-            _spec = _ilu.spec_from_file_location("src.factorminer.factorminer.benchmark.catalogs", str(_cat_path))
-            _cat_mod = _ilu.module_from_spec(_spec)
-            _sys.modules["src.factorminer.factorminer.benchmark.catalogs"] = _cat_mod
-            _spec.loader.exec_module(_cat_mod)
-        _cat = _sys.modules["src.factorminer.factorminer.benchmark.catalogs"]
-        ALPHA101_CLASSIC = _cat.ALPHA101_CLASSIC
-        build_alpha101_adapted = _cat.build_alpha101_adapted
-        build_random_exploration = _cat.build_random_exploration
-        build_factor_miner_catalog = _cat.build_factor_miner_catalog
-
-        if method == "random_exploration":
-            entries = build_random_exploration(seed=self.seed, count=max(n_factors, 160))
-        elif method == "alpha101_classic":
-            entries = list(ALPHA101_CLASSIC)
-            while len(entries) < n_factors:
-                entries = entries + list(ALPHA101_CLASSIC)
-            entries = entries[:n_factors]
-        elif method == "alpha101_adapted":
-            entries = build_alpha101_adapted()
-        elif method in ("ralph_loop", "helix_phase2"):
-            # Use the full FactorMiner paper catalog + random extensions
-            entries = build_factor_miner_catalog()
-            if len(entries) < n_factors * 2:
-                extra = build_random_exploration(
-                    seed=self.seed + 7, count=n_factors * 2 - len(entries)
-                )
-                entries = entries + extra
-        else:
-            entries = build_random_exploration(seed=self.seed + 1, count=n_factors * 2)
-
-        return [(e.name, e.formula, e.category) for e in entries]
-
-    def _evaluate_candidates(
-        self,
-        candidates: List[Tuple[str, str, str]],
-        data: dict,
-        returns: np.ndarray,
-    ) -> List[dict]:
-        """Evaluate candidates; returns list of result dicts."""
-        from factorminer.core.parser import try_parse
-        from factorminer.evaluation.metrics import (
-            compute_ic, compute_ic_mean, compute_icir, compute_ic_win_rate
-        )
-
-        results = []
-        for name, formula, category in candidates:
-            tree = try_parse(formula)
-            if tree is None:
-                continue
-            try:
-                signals = tree.evaluate(data)
-                if signals is None or np.all(np.isnan(signals)):
-                    continue
-                ic_series = compute_ic(signals, returns)
-                ic_mean = compute_ic_mean(ic_series)
-                icir = compute_icir(ic_series)
-                win_rate = compute_ic_win_rate(ic_series)
-                results.append({
-                    "name": name,
-                    "formula": formula,
-                    "category": category,
-                    "ic_mean": ic_mean,
-                    "icir": icir,
-                    "ic_win_rate": win_rate,
-                    "signals": signals,
-                    "ic_series": ic_series,
-                })
-            except Exception:
-                pass
-        return results
-
-    def _build_library(
-        self,
-        factor_results: List[dict],
-        n_factors: int,
-    ) -> List[dict]:
-        """Build a diversified factor library with IC and correlation admission."""
-        from factorminer.evaluation.metrics import compute_pairwise_correlation
-
-        # Filter by IC threshold
-        passing = [r for r in factor_results if r["ic_mean"] >= self.ic_threshold]
-        passing.sort(key=lambda x: x["ic_mean"], reverse=True)
-
-        library: List[dict] = []
-        for candidate in passing:
-            if len(library) >= n_factors:
-                break
-            # Correlation check
-            too_correlated = False
-            for existing in library:
-                if (
-                    existing.get("signals") is not None
-                    and candidate.get("signals") is not None
-                ):
-                    corr = abs(
-                        compute_pairwise_correlation(
-                            candidate["signals"], existing["signals"]
-                        )
-                    )
-                    if corr >= self.correlation_threshold:
-                        too_correlated = True
-                        break
-            if not too_correlated:
-                library.append(candidate)
-        return library
-
-    def _library_metrics(
-        self,
-        factor_results: List[dict],
-        returns: np.ndarray,
-    ) -> Tuple[float, float, float, Optional[np.ndarray]]:
-        """Compute library IC, ICIR, avg|rho|. Returns (ic, icir, rho, ic_series)."""
-        from factorminer.evaluation.metrics import (
-            compute_pairwise_correlation, compute_ic_mean, compute_icir
-        )
-
-        if not factor_results:
-            return 0.0, 0.0, 0.0, None
-
-        ics = [r["ic_mean"] for r in factor_results]
-        icirs = [r["icir"] for r in factor_results]
-        lib_ic = float(np.mean(ics)) if ics else 0.0
-        lib_icir = float(np.mean(icirs)) if icirs else 0.0
-
-        # Average pairwise |rho|
-        rhos = []
-        signals_list = [
-            r["signals"] for r in factor_results if r.get("signals") is not None
-        ]
-        for i in range(len(signals_list)):
-            for j in range(i + 1, len(signals_list)):
-                c = abs(compute_pairwise_correlation(signals_list[i], signals_list[j]))
-                rhos.append(c)
-        avg_rho = float(np.mean(rhos)) if rhos else 0.0
-
-        # Combined IC series (average)
-        all_ic_series = [r["ic_series"] for r in factor_results if r.get("ic_series") is not None]
-        if all_ic_series:
-            min_len = min(len(s) for s in all_ic_series)
-            combined = np.nanmean(
-                np.stack([s[:min_len] for s in all_ic_series], axis=0), axis=0
-            )
-        else:
-            combined = None
-
-        return lib_ic, lib_icir, avg_rho, combined
-
-    def _combination_metrics(
-        self,
-        test_factor_results: List[dict],
-        library: List[dict],
-        test_returns: np.ndarray,
-    ) -> Tuple[float, float, float, float]:
-        """Compute EW/ICW combination metrics on test data."""
-        from factorminer.evaluation.combination import FactorCombiner
-        from factorminer.evaluation.metrics import compute_ic, compute_ic_mean, compute_icir
-
-        if not test_factor_results:
-            return 0.0, 0.0, 0.0, 0.0
-
-        factor_signals = {
-            i: r["signals"].T for i, r in enumerate(test_factor_results)
-            if r.get("signals") is not None
-        }
-        ic_values = {
-            i: r["ic_mean"] for i, r in enumerate(test_factor_results)
-        }
-
-        if not factor_signals:
-            return 0.0, 0.0, 0.0, 0.0
-
-        combiner = FactorCombiner()
-        try:
-            ew_composite = combiner.equal_weight(factor_signals)
-            ew_ic_series = compute_ic(ew_composite.T, test_returns)
-            ew_ic = compute_ic_mean(ew_ic_series)
-            ew_icir = compute_icir(ew_ic_series)
-        except Exception:
-            ew_ic, ew_icir = 0.0, 0.0
-
-        try:
-            icw_composite = combiner.ic_weighted(factor_signals, ic_values)
-            icw_ic_series = compute_ic(icw_composite.T, test_returns)
-            icw_ic = compute_ic_mean(icw_ic_series)
-            icw_icir = compute_icir(icw_ic_series)
-        except Exception:
-            icw_ic, icw_icir = 0.0, 0.0
-
-        return ew_ic, ew_icir, icw_ic, icw_icir
-
-    def _selection_metrics(
-        self,
-        train_factor_results: List[dict],
-        library: List[dict],
-        train_data: dict,
-        train_returns: np.ndarray,
-        test_data: dict,
-        test_returns: np.ndarray,
-        selector_type: str,
-    ) -> Tuple[float, float]:
-        """Compute Lasso/XGBoost selection IC on test data."""
-        from factorminer.evaluation.selection import FactorSelector
-        from factorminer.evaluation.metrics import compute_ic, compute_ic_mean, compute_icir
-
-        if len(train_factor_results) < 3:
-            return 0.0, 0.0
-
-        fit_signals = {
-            i: r["signals"].T for i, r in enumerate(train_factor_results)
-            if r.get("signals") is not None
-        }
-        if not fit_signals:
-            return 0.0, 0.0
-
-        # Re-evaluate on test data
-        test_results = self._evaluate_candidates(
-            [(r["name"], r["formula"], r.get("category", "Unknown"))
-             for r in train_factor_results],
-            test_data,
-            test_returns,
-        )
-        eval_signals = {
-            i: r["signals"].T for i, r in enumerate(test_results)
-            if r.get("signals") is not None and i < len(test_results)
-        }
-
-        if not eval_signals:
-            return 0.0, 0.0
-
-        selector = FactorSelector()
-        try:
-            fit_ret = train_returns.T
-            if selector_type == "lasso":
-                ranking = selector.lasso_selection(fit_signals, fit_ret)
-            else:
-                ranking = selector.xgboost_selection(fit_signals, fit_ret)
-
-            if not ranking:
-                return 0.0, 0.0
-
-            selected_ids = [fid for fid, _ in ranking if fid in eval_signals]
-            if not selected_ids:
-                return 0.0, 0.0
-
-            # Simple equal-weight composite of selected factors
-            composite = np.nanmean(
-                np.stack([eval_signals[fid] for fid in selected_ids], axis=0),
-                axis=0,
-            )
-            ic_series = compute_ic(composite.T, test_returns)
-            return compute_ic_mean(ic_series), compute_icir(ic_series)
-        except Exception as exc:
-            logger.debug("Selection metrics failed for %s: %s", selector_type, exc)
-            return 0.0, 0.0
-
-    def _clone_cfg(self, cfg):
-        cloned = copy.deepcopy(cfg)
-        cloned._raw = copy.deepcopy(getattr(cfg, "_raw", {}))
-        return cloned
-
-    def _build_runtime_provider(self, cfg, mock: bool):
-        from factorminer.agent.llm_interface import MockProvider, create_provider
-
-        if mock:
-            return MockProvider()
-
-        provider_name = getattr(cfg.llm, "provider", "mock")
-        model_name = getattr(cfg.llm, "model", "mock")
-        api_key = None
-        if hasattr(cfg, "_raw"):
-            api_key = getattr(cfg, "_raw", {}).get("llm", {}).get("api_key")
-        if provider_name == "mock" or not api_key:
-            return MockProvider()
-
-        try:
-            return create_provider(
-                {
-                    "provider": provider_name,
-                    "model": model_name,
-                    "api_key": api_key,
-                }
-            )
-        except Exception as exc:  # pragma: no cover - defensive fallback
-            logger.warning("Falling back to MockProvider: %s", exc)
-            return MockProvider()
-
-    def _build_runtime_mining_config(self, cfg, output_dir: Path, mock: bool):
-        from factorminer.core.config import MiningConfig as RuntimeMiningConfig
-
-        signal_failure_policy = "synthetic" if mock else cfg.evaluation.signal_failure_policy
-
-        runtime_cfg = RuntimeMiningConfig(
-            target_library_size=cfg.mining.target_library_size,
-            batch_size=cfg.mining.batch_size,
-            max_iterations=cfg.mining.max_iterations,
-            ic_threshold=cfg.mining.ic_threshold,
-            icir_threshold=cfg.mining.icir_threshold,
-            correlation_threshold=cfg.mining.correlation_threshold,
-            replacement_ic_min=cfg.mining.replacement_ic_min,
-            replacement_ic_ratio=cfg.mining.replacement_ic_ratio,
-            fast_screen_assets=cfg.evaluation.fast_screen_assets,
-            num_workers=cfg.evaluation.num_workers,
-            output_dir=str(output_dir),
-            backend=cfg.evaluation.backend,
-            gpu_device=cfg.evaluation.gpu_device,
-            signal_failure_policy=signal_failure_policy,
-        )
-        runtime_cfg.benchmark_mode = getattr(cfg.benchmark, "mode", "paper")
-        runtime_cfg.target_panels = None
-        runtime_cfg.target_horizons = None
-        runtime_cfg.research = getattr(cfg, "research", None)
-        return runtime_cfg
-
-    def _build_debate_config(self, cfg):
-        if not cfg.phase2.debate.enabled:
-            return None
-
-        from factorminer.agent.debate import DebateConfig as RuntimeDebateConfig
-        from factorminer.agent.specialists import DEFAULT_SPECIALISTS
-
-        specialist_count = min(
-            int(cfg.phase2.debate.num_specialists), len(DEFAULT_SPECIALISTS)
-        )
-        return RuntimeDebateConfig(
-            specialists=list(DEFAULT_SPECIALISTS[:specialist_count]),
-            enable_critic=cfg.phase2.debate.enable_critic,
-            candidates_per_specialist=cfg.phase2.debate.candidates_per_specialist,
-            top_k_after_critic=cfg.phase2.debate.top_k_after_critic,
-            critic_temperature=cfg.phase2.debate.critic_temperature,
-        )
-
-    def _runtime_phase2_kwargs(self, cfg, loop_kind: str, runtime_dataset):
-        if loop_kind != "helix_phase2":
-            return {}
-
-        from factorminer.evaluation.causal import CausalConfig as RuntimeCausalConfig
-        from factorminer.evaluation.capacity import CapacityConfig as RuntimeCapacityConfig
-        from factorminer.evaluation.regime import RegimeConfig as RuntimeRegimeConfig
-        from factorminer.evaluation.significance import (
-            SignificanceConfig as RuntimeSignificanceConfig,
-        )
-
-        def _clone_section(source, target_cls):
-            target_fields = {field.name for field in target_cls.__dataclass_fields__.values()}
-            payload = {
-                name: getattr(source, name)
-                for name in target_fields
-                if hasattr(source, name)
-            }
-            return target_cls(**payload)
-
-        return {
-            "debate_config": self._build_debate_config(cfg),
-            "enable_knowledge_graph": cfg.phase2.helix.enable_knowledge_graph,
-            "enable_embeddings": cfg.phase2.helix.enable_embeddings,
-            "enable_auto_inventor": cfg.phase2.auto_inventor.enabled,
-            "auto_invention_interval": cfg.phase2.auto_inventor.invention_interval,
-            "canonicalize": cfg.phase2.helix.enable_canonicalization,
-            "forgetting_lambda": cfg.phase2.helix.forgetting_lambda,
-            "causal_config": _clone_section(cfg.phase2.causal, RuntimeCausalConfig)
-            if cfg.phase2.causal.enabled
-            else None,
-            "regime_config": _clone_section(cfg.phase2.regime, RuntimeRegimeConfig)
-            if cfg.phase2.regime.enabled
-            else None,
-            "capacity_config": _clone_section(cfg.phase2.capacity, RuntimeCapacityConfig)
-            if cfg.phase2.capacity.enabled
-            else None,
-            "significance_config": _clone_section(
-                cfg.phase2.significance, RuntimeSignificanceConfig
-            )
-            if cfg.phase2.significance.enabled
-            else None,
-            "volume": runtime_dataset.data_dict.get(
-                "$amt", runtime_dataset.data_dict.get("$volume")
-            ),
-        }
-
-    def _execute_runtime_loop(
-        self,
-        *,
-        cfg,
-        loop_kind: str,
-        runtime_dataset,
-        output_dir: Path,
-        n_target_factors: int,
-        run_id: int,
-        mock: bool,
-    ) -> tuple[MethodResult, dict[str, Any]]:
-        from factorminer.core.helix_loop import HelixLoop
-        from factorminer.core.library_io import load_library
-        from factorminer.core.ralph_loop import RalphLoop
-        from factorminer.core.session import MiningSession
-        from factorminer.benchmark.runtime import evaluate_frozen_set, select_frozen_top_k
-        from factorminer.evaluation.runtime import evaluate_factors
-
-        output_dir.mkdir(parents=True, exist_ok=True)
-        runtime_cfg = self._build_runtime_mining_config(cfg, output_dir, mock=mock)
-        provider = self._build_runtime_provider(cfg, mock=mock)
-
-        runtime_kwargs = {
-            "config": runtime_cfg,
-            "data_tensor": runtime_dataset.data_tensor,
-            "returns": runtime_dataset.returns,
-            "llm_provider": provider,
-        }
-        if loop_kind == "helix_phase2":
-            runtime_kwargs.update(
-                self._runtime_phase2_kwargs(cfg, loop_kind, runtime_dataset)
-            )
-            loop = HelixLoop(**runtime_kwargs)
-        else:
-            loop = RalphLoop(**runtime_kwargs)
-
-        library = loop.run(
-            target_size=n_target_factors,
-            max_iterations=runtime_cfg.max_iterations,
-        )
-        library_dir = output_dir / "factor_library"
-        loaded_library = load_library(library_dir)
-        session = MiningSession.load(output_dir / "session.json")
-        session_summary = session.get_summary()
-        run_manifest = {}
-        run_manifest_path = output_dir / "run_manifest.json"
-        if run_manifest_path.exists():
-            with open(run_manifest_path) as f:
-                run_manifest = json.load(f)
-
-        artifacts = evaluate_factors(
-            loaded_library.list_factors(),
-            runtime_dataset,
-            signal_failure_policy=runtime_cfg.signal_failure_policy,
-        )
-        selected = select_frozen_top_k(
-            artifacts,
-            loaded_library,
-            top_k=n_target_factors,
-            split_name="train",
-        )
-        runtime_eval = evaluate_frozen_set(
-            selected,
-            runtime_dataset,
-            split_name="test",
-            fit_split="train",
-            cost_bps=list(getattr(cfg.benchmark, "cost_bps", [])),
-        )
-
-        selected_formulas = {artifact.formula for artifact in selected}
-        selected_artifacts = [
-            artifact
-            for artifact in artifacts
-            if artifact.succeeded and artifact.formula in selected_formulas
-        ]
-        ic_series = None
-        if selected_artifacts:
-            series_list = [
-                artifact.split_stats["test"].get("ic_series")
-                for artifact in selected_artifacts
-                if artifact.split_stats.get("test", {}).get("ic_series") is not None
-            ]
-            if series_list:
-                min_len = min(len(series) for series in series_list)
-                ic_series = np.nanmean(
-                    np.stack([series[:min_len] for series in series_list], axis=0),
-                    axis=0,
-                )
-
-        library_turnover = float(
-            np.mean(
-                [
-                    artifact.split_stats["test"].get("turnover", 0.0)
-                    for artifact in selected_artifacts
-                ]
-            )
-        ) if selected_artifacts else 0.0
-
-        combination_turnover = {
-            name: float(metrics.get("turnover", 0.0))
-            for name, metrics in runtime_eval.get("combinations", {}).items()
-        }
-        cost_pressure = {
-            name: metrics.get("cost_pressure", {})
-            for name, metrics in runtime_eval.get("combinations", {}).items()
-        }
-
-        result = MethodResult(
-            method=loop_kind,
-            library_ic=float(runtime_eval["library"]["ic"]),
-            library_icir=float(runtime_eval["library"]["icir"]),
-            avg_abs_rho=float(runtime_eval["library"]["avg_abs_rho"]),
-            ew_ic=float(runtime_eval["combinations"].get("equal_weight", {}).get("ic", 0.0)),
-            ew_icir=float(runtime_eval["combinations"].get("equal_weight", {}).get("icir", 0.0)),
-            icw_ic=float(runtime_eval["combinations"].get("ic_weighted", {}).get("ic", 0.0)),
-            icw_icir=float(runtime_eval["combinations"].get("ic_weighted", {}).get("icir", 0.0)),
-            lasso_ic=float(runtime_eval["selections"].get("lasso", {}).get("ic", 0.0)),
-            lasso_icir=float(runtime_eval["selections"].get("lasso", {}).get("icir", 0.0)),
-            xgb_ic=float(runtime_eval["selections"].get("xgboost", {}).get("ic", 0.0)),
-            xgb_icir=float(runtime_eval["selections"].get("xgboost", {}).get("icir", 0.0)),
-            n_factors=loaded_library.size,
-            admission_rate=session_summary.get("overall_yield_rate", 0.0),
-            elapsed_seconds=session_summary.get("elapsed_seconds", 0.0),
-            ic_series=ic_series,
-        )
-        result.avg_turnover = library_turnover  # type: ignore[attr-defined]
-
-        artifact_paths = {
-            "output_dir": str(output_dir.resolve()),
-            "run_manifest": str(run_manifest_path.resolve()),
-            "session": str((output_dir / "session.json").resolve()),
-            "session_log": str((output_dir / "session_log.json").resolve()),
-            "library": str((output_dir / "factor_library.json").resolve()),
-            "checkpoint_dir": str((output_dir / "checkpoint").resolve()),
-            "checkpoint_run_manifest": str((output_dir / "checkpoint" / "run_manifest.json").resolve()),
-        }
-        payload = {
-            "loop_kind": loop_kind,
-            "method": loop_kind,
-            "run_id": run_id,
-            "output_dir": str(output_dir.resolve()),
-            "session_summary": session_summary,
-            "run_manifest": run_manifest,
-            "artifact_paths": artifact_paths,
-            "frozen_top_k": [
-                {
-                    "name": artifact.name,
-                    "formula": artifact.formula,
-                    "category": artifact.category,
-                    "train_ic": artifact.split_stats["train"]["ic_abs_mean"],
-                    "train_icir": abs(artifact.split_stats["train"]["icir"]),
-                }
-                for artifact in selected
-            ],
-            "library": runtime_eval["library"],
-            "combinations": runtime_eval["combinations"],
-            "selections": runtime_eval["selections"],
-            "turnover": {
-                "library": library_turnover,
-                **combination_turnover,
-            },
-            "cost_pressure": cost_pressure,
-            "library_size": loaded_library.size,
-            "candidate_count": session_summary.get("total_candidates", 0),
-            "selected_formulas": sorted(selected_formulas),
-        }
-        return result, payload
-
-    def _runtime_method_frames(
-        self,
-        runtime_payloads: Dict[str, List[dict[str, Any]]],
-        methods: List[str],
-    ) -> tuple[pd.DataFrame, pd.DataFrame]:
-        turnover_rows: list[dict[str, Any]] = []
-        cost_rows: list[dict[str, Any]] = []
-
-        for method in methods:
-            for payload in runtime_payloads.get(method, []):
-                turnover = payload.get("turnover", {})
-                if turnover:
-                    turnover_rows.append(
-                        {
-                            "method": method,
-                            "run_id": payload.get("run_id", 0),
-                            "library_turnover": turnover.get("library", 0.0),
-                            "equal_weight_turnover": turnover.get("equal_weight", 0.0),
-                            "ic_weighted_turnover": turnover.get("ic_weighted", 0.0),
-                            "orthogonal_turnover": turnover.get("orthogonal", 0.0),
-                        }
-                    )
-                for combo_name, cost_map in payload.get("cost_pressure", {}).items():
-                    for cost_bps, stats in cost_map.items():
-                        cost_rows.append(
-                            {
-                                "method": method,
-                                "run_id": payload.get("run_id", 0),
-                                "combination": combo_name,
-                                "cost_bps": float(cost_bps),
-                                "ic": stats.get("ic", 0.0),
-                                "icir": stats.get("icir", 0.0),
-                                "turnover": stats.get("turnover", 0.0),
-                                "long_short": stats.get("long_short", 0.0),
-                                "monotonicity": stats.get("monotonicity", 0.0),
-                            }
-                        )
-
-        return pd.DataFrame(turnover_rows), pd.DataFrame(cost_rows)
-
-    def run_runtime_comparison(
-        self,
-        cfg,
-        output_dir: Path,
-        *,
-        data_path: Optional[str] = None,
-        raw_df: Optional[pd.DataFrame] = None,
-        mock: bool = False,
-        baseline_methods: Optional[List[str]] = None,
-        n_target_factors: int = 40,
-        n_runs: int = 1,
-    ) -> tuple[BenchmarkResult, dict[str, Any]]:
-        """Run a benchmark with real Ralph/Helix executions for Phase 2."""
-        from factorminer.benchmark.runtime import load_benchmark_dataset
-
-        methods = baseline_methods or [
-            "random_exploration",
-            "alpha101_classic",
-            "alpha101_adapted",
-            "ralph_loop",
-            "helix_phase2",
-        ]
-        runtime_methods = {"ralph_loop", "helix_phase2"}
-
-        runtime_dataset, dataset_hash = load_benchmark_dataset(
-            cfg,
-            data_path=data_path,
-            raw_df=raw_df,
-            mock=mock,
-        )
-        static_data = dict(runtime_dataset.data_dict)
-        static_data["forward_returns"] = runtime_dataset.returns
-
-        train_indices = runtime_dataset.splits["train"].indices
-        test_indices = runtime_dataset.splits["test"].indices
-
-        def _slice_by_indices(data: dict, indices: np.ndarray) -> dict:
-            return {key: value[:, indices] for key, value in data.items()}
-
-        train_data = _slice_by_indices(static_data, train_indices)
-        test_data = _slice_by_indices(static_data, test_indices)
-
-        raw_results: Dict[str, List[MethodResult]] = {}
-        runtime_payloads: Dict[str, List[dict[str, Any]]] = {}
-        runtime_root = output_dir / "runtime_runs"
-        runtime_root.mkdir(parents=True, exist_ok=True)
-
-        for method in methods:
-            method_runs: List[MethodResult] = []
-            for run_id in range(n_runs):
-                if method in runtime_methods:
-                    result, payload = self._execute_runtime_loop(
-                        cfg=cfg,
-                        loop_kind=method,
-                        runtime_dataset=runtime_dataset,
-                        output_dir=runtime_root / method / f"run_{run_id}",
-                        n_target_factors=n_target_factors,
-                        run_id=run_id,
-                        mock=mock,
-                    )
-                    method_runs.append(result)
-                    runtime_payloads.setdefault(method, []).append(payload)
-                else:
-                    result = self.run_single_method(
-                        method=method,
-                        data=train_data,
-                        test_data=test_data,
-                        n_factors=n_target_factors,
-                        run_id=run_id,
-                    )
-                    method_runs.append(result)
-            raw_results[method] = method_runs
-
-        averaged = {
-            method: _average_method_results(runs)
-            for method, runs in raw_results.items()
-        }
-
-        lib_df = _build_library_df(averaged, methods)
-        comb_df = _build_combination_df(averaged, methods)
-        sel_df = _build_selection_df(averaged, methods)
-        speed_result = self._speed_bench.run_full_pipeline_benchmark(data=train_data)
-        op_result = self._speed_bench.run_operator_benchmark(n_repeats=3)
-        speed_df = _build_speed_df(op_result, speed_result)
-        turnover_df, cost_df = self._runtime_method_frames(runtime_payloads, methods)
-
-        stat_tests = {}
-        helix_results = raw_results.get("helix_phase2", [])
-        ralph_results = raw_results.get("ralph_loop", [])
-        if helix_results and ralph_results:
-            h_ic = helix_results[0].ic_series
-            r_ic = ralph_results[0].ic_series
-            if h_ic is not None and r_ic is not None:
-                stat_tests = self._stat_tests.run_all_tests(h_ic, r_ic)
-            else:
-                h_ic = _synthetic_ic_series(
-                    helix_results[0].library_ic, n=100, seed=self.seed
-                )
-                r_ic = _synthetic_ic_series(
-                    ralph_results[0].library_ic, n=100, seed=self.seed + 1
-                )
-                stat_tests = self._stat_tests.run_all_tests(h_ic, r_ic)
-
-        runtime_artifacts = {
-            "dataset_hash": dataset_hash,
-            "runtime_root": str(runtime_root.resolve()),
-            "runtime_payloads": runtime_payloads,
-        }
-
-        return (
-            BenchmarkResult(
-                methods=methods,
-                factor_library_metrics=lib_df,
-                combination_metrics=comb_df,
-                selection_metrics=sel_df,
-                speed_metrics=speed_df,
-                statistical_tests=stat_tests,
-                raw_method_results=raw_results,
-                turnover_metrics=turnover_df,
-                cost_pressure_metrics=cost_df,
-                runtime_artifacts=runtime_artifacts,
-            ),
-            runtime_artifacts,
-        )
-
-    def run_runtime_ablation_study(
-        self,
-        cfg,
-        output_dir: Path,
-        *,
-        data_path: Optional[str] = None,
-        raw_df: Optional[pd.DataFrame] = None,
-        mock: bool = False,
-        configs_to_run: Optional[List[str]] = None,
-        n_target_factors: int = 40,
-        n_runs: int = 1,
-    ) -> AblationResult:
-        """Run a runtime-backed ablation study using real loop executions."""
-        from factorminer.benchmark.runtime import load_benchmark_dataset
-
-        runtime_dataset, _ = load_benchmark_dataset(
-            cfg,
-            data_path=data_path,
-            raw_df=raw_df,
-            mock=mock,
-        )
-
-        configs = configs_to_run or [
-            "full",
-            "no_debate",
-            "no_causal",
-            "no_canonicalize",
-            "no_regime",
-            "no_capacity",
-            "no_significance",
-            "no_memory",
-        ]
-
-        results: Dict[str, MethodResult] = {}
-        runtime_root = output_dir / "runtime_ablation"
-        runtime_root.mkdir(parents=True, exist_ok=True)
-
-        for config_name in configs:
-            variant_cfg = self._clone_cfg(cfg)
-            method_kind = "helix_phase2"
-            if config_name == "no_debate":
-                variant_cfg.phase2.debate.enabled = False
-            elif config_name == "no_causal":
-                variant_cfg.phase2.causal.enabled = False
-            elif config_name == "no_canonicalize":
-                variant_cfg.phase2.helix.enable_canonicalization = False
-            elif config_name == "no_regime":
-                variant_cfg.phase2.regime.enabled = False
-            elif config_name == "no_capacity":
-                variant_cfg.phase2.capacity.enabled = False
-            elif config_name == "no_significance":
-                variant_cfg.phase2.significance.enabled = False
-            elif config_name == "no_memory":
-                method_kind = "ralph_loop"
-            elif config_name == "full":
-                pass
-            else:
-                logger.warning("Unknown runtime ablation config: %s", config_name)
-                continue
-
-            run_dir = runtime_root / config_name / "run_0"
-            result, _payload = self._execute_runtime_loop(
-                cfg=variant_cfg,
-                loop_kind=method_kind,
-                runtime_dataset=runtime_dataset,
-                output_dir=run_dir,
-                n_target_factors=n_target_factors,
-                run_id=0,
-                mock=mock,
-            )
-            results[config_name] = result
-
-        baseline = results.get("full")
-        rows: list[dict[str, Any]] = []
-        if baseline is not None:
-            for name, result in results.items():
-                if name == "full":
-                    continue
-                rows.append(
-                    {
-                        "config": name,
-                        "method": result.method,
-                        "delta_library_ic": result.library_ic - baseline.library_ic,
-                        "delta_library_icir": result.library_icir - baseline.library_icir,
-                        "delta_ew_ic": result.ew_ic - baseline.ew_ic,
-                        "delta_icw_ic": result.icw_ic - baseline.icw_ic,
-                        "delta_lasso_ic": result.lasso_ic - baseline.lasso_ic,
-                        "delta_xgb_ic": result.xgb_ic - baseline.xgb_ic,
-                        "delta_turnover": getattr(result, "avg_turnover", 0.0)
-                        - getattr(baseline, "avg_turnover", 0.0),
-                    }
-                )
-        contributions = pd.DataFrame(rows)
-        return AblationResult(configs=configs, results=results, contributions=contributions)
-
-
-# ---------------------------------------------------------------------------
-# Helper functions
-# ---------------------------------------------------------------------------
-
-def _build_mock_data_dict(
-    n_assets: int = 100,
-    n_periods: int = 500,
-    seed: int = 42,
-) -> dict:
-    """Build a minimal data dict from MockConfig (no raw_df needed)."""
-    from factorminer.data.mock_data import MockConfig, generate_mock_data
-    from factorminer.data.preprocessor import preprocess
-
-    cfg = MockConfig(
-        num_assets=n_assets,
-        num_periods=n_periods,
-        frequency="10min",
-        plant_alpha=True,
-        alpha_strength=0.04,
-        alpha_assets_frac=0.4,
-        seed=seed,
-    )
-    raw = generate_mock_data(cfg)
-    processed = preprocess(raw)
-
-    assets = sorted(processed["asset_id"].unique())
-    T = processed.groupby("asset_id").size().min()
-
-    feature_map = {
-        "$open": "open", "$high": "high", "$low": "low", "$close": "close",
-        "$volume": "volume", "$amt": "amount", "$vwap": "vwap",
-        "$returns": "returns",
-    }
-    data_dict: dict = {}
-    for feat_name, col_name in feature_map.items():
-        if col_name in processed.columns:
-            pivot = processed.pivot(
-                index="asset_id", columns="datetime", values=col_name
-            )
-            pivot = pivot.loc[assets].iloc[:, :T]
-            data_dict[feat_name] = pivot.values.astype(np.float64)
-
-    close = data_dict["$close"]
-    forward_returns = np.roll(close, -1, axis=1) / close - 1
-    forward_returns[:, -1] = np.nan
-    data_dict["forward_returns"] = forward_returns
-    return data_dict
-
-
-def _slice_data(data: dict, start: int, end: int) -> dict:
-    """Slice all (M, T) arrays to columns [start, end)."""
-    return {k: v[:, start:end] for k, v in data.items()}
-
-
-def _average_method_results(runs: List[MethodResult]) -> MethodResult:
-    """Average numeric fields across multiple runs."""
-    if not runs:
-        return MethodResult(method="unknown")
-    if len(runs) == 1:
-        return runs[0]
-
-    fields = [
-        "library_ic", "library_icir", "avg_abs_rho",
-        "ew_ic", "ew_icir", "icw_ic", "icw_icir",
-        "lasso_ic", "lasso_icir", "xgb_ic", "xgb_icir",
-        "n_factors", "admission_rate", "elapsed_seconds", "avg_turnover",
-    ]
-    avg = MethodResult(method=runs[0].method)
-    for f in fields:
-        vals = [getattr(r, f) for r in runs if getattr(r, f) is not None]
-        if vals:
-            setattr(avg, f, float(np.mean(vals)))
-    return avg
-
-
-def _build_library_df(
-    averaged: Dict[str, MethodResult], methods: List[str]
-) -> pd.DataFrame:
-    rows = []
-    for method in methods:
-        r = averaged.get(method, MethodResult(method=method))
-        rows.append({
-            "method": method,
-            "ic_pct": r.library_ic * 100,
-            "icir": r.library_icir,
-            "avg_abs_rho": r.avg_abs_rho,
-            "n_factors": r.n_factors,
-            "avg_turnover": r.avg_turnover,
-        })
-    return pd.DataFrame(rows)
-
-
-def _build_combination_df(
-    averaged: Dict[str, MethodResult], methods: List[str]
-) -> pd.DataFrame:
-    rows = []
-    for method in methods:
-        r = averaged.get(method, MethodResult(method=method))
-        rows.append({
-            "method": method,
-            "ew_ic_pct": r.ew_ic * 100,
-            "ew_icir": r.ew_icir,
-            "icw_ic_pct": r.icw_ic * 100,
-            "icw_icir": r.icw_icir,
-        })
-    return pd.DataFrame(rows)
-
-
-def _build_selection_df(
-    averaged: Dict[str, MethodResult], methods: List[str]
-) -> pd.DataFrame:
-    rows = []
-    for method in methods:
-        r = averaged.get(method, MethodResult(method=method))
-        rows.append({
-            "method": method,
-            "lasso_ic_pct": r.lasso_ic * 100,
-            "lasso_icir": r.lasso_icir,
-            "xgb_ic_pct": r.xgb_ic * 100,
-            "xgb_icir": r.xgb_icir,
-            "best_ic_pct": max(r.lasso_ic, r.xgb_ic) * 100,
-        })
-    return pd.DataFrame(rows)
-
-
-def _build_speed_df(
-    op_result: OperatorSpeedResult,
-    pipeline_result: PipelineSpeedResult,
-) -> pd.DataFrame:
-    rows = []
-    for op, ms in op_result.operator_timings_ms.items():
-        rows.append({"name": op, "time_ms": ms, "type": "operator"})
-    rows.append({
-        "name": f"Pipeline ({pipeline_result.n_candidates} cands)",
-        "time_ms": pipeline_result.total_seconds * 1000,
-        "type": "pipeline",
-    })
-    rows.append({
-        "name": "Throughput (cands/s)",
-        "time_ms": pipeline_result.candidates_per_second,
-        "type": "throughput",
-    })
-    return pd.DataFrame(rows)
-
-
-def _synthetic_ic_series(
-    target_mean: float,
-    n: int = 100,
-    seed: int = 42,
-) -> np.ndarray:
-    """Generate a synthetic IC series with given mean for stat tests."""
-    rng = np.random.RandomState(seed)
-    noise = rng.randn(n) * 0.03
-    base = target_mean + noise
-    return base.astype(np.float64)
-
-
-# ---------------------------------------------------------------------------
-# CLI entry point
-# ---------------------------------------------------------------------------
-
-def _parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="HelixFactor vs FactorMiner Benchmark Suite"
-    )
-    parser.add_argument("--mock", action="store_true", help="Use mock data")
-    parser.add_argument("--n-factors", type=int, default=40, help="Target library size")
-    parser.add_argument("--n-assets", type=int, default=100, help="Mock data assets")
-    parser.add_argument("--n-periods", type=int, default=500, help="Mock data periods")
-    parser.add_argument("--output", type=str, default="results/", help="Output directory")
-    parser.add_argument("--methods", nargs="*", default=None, help="Methods to run")
-    parser.add_argument("--seed", type=int, default=42, help="Random seed")
-    parser.add_argument(
-        "--log-level", type=str, default="WARNING", help="Logging level"
-    )
-    return parser.parse_args()
-
-
-def main() -> None:
-    args = _parse_args()
-    logging.basicConfig(
-        level=getattr(logging, args.log_level.upper(), logging.WARNING),
-        format="%(levelname)s %(name)s: %(message)s",
-    )
-
-    output_dir = Path(args.output)
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    print("=" * 70)
-    print("  HelixFactor Benchmark Suite")
-    print("=" * 70)
-
-    # Build mock data
-    print(f"\n[1/4] Generating mock data ({args.n_assets} assets, {args.n_periods} periods)...")
-    t0 = time.perf_counter()
-    data = _build_mock_data_dict(
-        n_assets=args.n_assets,
-        n_periods=args.n_periods,
-        seed=args.seed,
-    )
-    T = list(data.values())[0].shape[1]
-    train_end = int(T * 0.7)
-    print(f"    Done in {time.perf_counter()-t0:.1f}s  (T={T}, train=0:{train_end}, test={train_end}:{T})")
-
-    # Run comparison
-    print(f"\n[2/4] Running method comparison (n_factors={args.n_factors})...")
-    bench = HelixBenchmark(seed=args.seed)
-    t0 = time.perf_counter()
-    result = bench.run_comparison(
-        data=data,
-        train_period=(0, train_end),
-        test_period=(train_end, T),
-        n_target_factors=args.n_factors,
-        n_runs=1,
-        methods=args.methods,
-    )
-    elapsed = time.perf_counter() - t0
-    print(f"    Done in {elapsed:.1f}s")
-
-    # Print results table
-    print("\n[3/4] Results Summary:")
-    print("\n--- Factor Library Metrics ---")
-    print(result.factor_library_metrics.to_string(index=False, float_format="{:.4f}".format))
-    print("\n--- Factor Combination Metrics ---")
-    print(result.combination_metrics.to_string(index=False, float_format="{:.4f}".format))
-    print("\n--- Factor Selection Metrics ---")
-    print(result.selection_metrics.to_string(index=False, float_format="{:.4f}".format))
-    print("\n--- Speed Metrics ---")
-    print(result.speed_metrics.to_string(index=False, float_format="{:.3f}".format))
-
-    if result.statistical_tests:
-        dm = result.statistical_tests.get("diebold_mariano", {})
-        print(f"\n--- Statistical Tests (Helix vs Ralph) ---")
-        print(f"    DM stat: {dm.get('dm_stat', 0):.3f}  p={dm.get('p_value', 1):.4f}  dir={dm.get('direction','?')}")
-        ci = result.statistical_tests.get("bootstrap_ci_95", {})
-        print(f"    Bootstrap 95% CI on IC diff: [{ci.get('lower', 0):.4f}, {ci.get('upper', 0):.4f}]")
-        print(f"    Helix outperforms: {result.statistical_tests.get('helix_outperforms', False)}")
-
-    # Save outputs
-    print(f"\n[4/4] Saving outputs to {output_dir}...")
-    result.generate_full_report(str(output_dir / "benchmark_report.html"))
-    with open(output_dir / "library_metrics.csv", "w") as f:
-        result.factor_library_metrics.to_csv(f, index=False)
-    with open(output_dir / "combination_metrics.csv", "w") as f:
-        result.combination_metrics.to_csv(f, index=False)
-    with open(output_dir / "selection_metrics.csv", "w") as f:
-        result.selection_metrics.to_csv(f, index=False)
-    with open(output_dir / "statistical_tests.json", "w") as f:
-        json.dump(_json_safe(result.statistical_tests), f, indent=2, allow_nan=False)
-    with open(output_dir / "latex_table.tex", "w") as f:
-        f.write(result.to_latex_table())
-    with open(output_dir / "benchmark_report.md", "w") as f:
-        f.write(result.to_markdown_table())
-    with open(output_dir / "readme_table.md", "w") as f:
-        f.write(result.to_markdown_table())
-
-    try:
-        result.plot_comparison(str(output_dir / "comparison_plot.png"))
-    except Exception as exc:
-        logger.debug("Plot generation failed: %s", exc)
-
-    print(f"    Reports saved to {output_dir}")
-    print(f"\nDone. Total runtime: {time.perf_counter() - t0:.1f}s")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/factorminer/factorminer/benchmark/runtime.py b/src/factorminer/factorminer/benchmark/runtime.py
deleted file mode 100644
index f8ffbc0..0000000
--- a/src/factorminer/factorminer/benchmark/runtime.py
+++ /dev/null
@@ -1,1498 +0,0 @@
-"""Strict paper/research benchmark runners built on runtime recomputation."""
-
-from __future__ import annotations
-
-from dataclasses import asdict, dataclass, field
-import copy
-import hashlib
-import json
-import logging
-import time
-from pathlib import Path
-from types import SimpleNamespace
-from typing import Any, Iterable, Optional
-
-import numpy as np
-import pandas as pd
-
-from src.factorminer.factorminer.benchmark.catalogs import (
-    CandidateEntry,
-    build_alpha101_adapted,
-    build_alphaagent_style,
-    build_alphaforge_style,
-    build_factor_miner_catalog,
-    build_gplearn_style,
-    build_random_exploration,
-    dedupe_entries,
-    entries_from_library,
-    ALPHA101_CLASSIC,
-)
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.core.library_io import load_library
-from src.factorminer.factorminer.core.session import MiningSession
-from src.factorminer.factorminer.evaluation.runtime import (
-    EvaluationDataset,
-    FactorEvaluationArtifact,
-    compute_correlation_matrix,
-    evaluate_factors,
-    load_runtime_dataset,
-)
-
-logger = logging.getLogger(__name__)
-
-RUNTIME_LOOP_BASELINES = {
-    "ralph_loop",
-    "helix_phase2",
-    "helix_no_memory",
-    "helix_no_debate",
-    "helix_no_significance",
-    "helix_no_capacity",
-    "helix_no_regime",
-}
-
-
-@dataclass
-class BenchmarkManifest:
-    """Serializable description of one benchmark run."""
-
-    benchmark_name: str
-    mode: str
-    seed: int
-    baseline: str
-    freeze_universe: str
-    report_universes: list[str]
-    train_period: list[str]
-    test_period: list[str]
-    freeze_top_k: int
-    signal_failure_policy: str
-    default_target: str
-    target_stack: list[str]
-    primary_objective: str
-    dataset_hashes: dict[str, str]
-    artifact_paths: dict[str, str]
-    runtime_contract: dict[str, Any] = field(default_factory=dict)
-    baseline_provenance: dict[str, dict[str, Any]] = field(default_factory=dict)
-    warnings: list[str] = field(default_factory=list)
-
-
-def _clone_cfg(cfg):
-    cloned = copy.deepcopy(cfg)
-    cloned._raw = copy.deepcopy(getattr(cfg, "_raw", {}))
-    return cloned
-
-
-def _cfg_with_overrides(cfg, universe: str, mode: Optional[str] = None):
-    cloned = _clone_cfg(cfg)
-    cloned.data.universe = universe
-    if mode is not None:
-        cloned.benchmark.mode = mode
-    if cloned.benchmark.mode == "paper":
-        cloned.evaluation.signal_failure_policy = "reject"
-        cloned.research.enabled = False
-        cloned.phase2.causal.enabled = False
-        cloned.phase2.regime.enabled = False
-        cloned.phase2.capacity.enabled = False
-        cloned.phase2.significance.enabled = False
-        cloned.phase2.debate.enabled = False
-        cloned.phase2.auto_inventor.enabled = False
-        cloned.phase2.helix.enabled = False
-    else:
-        cloned.research.enabled = True
-    return cloned
-
-
-def _data_hash(df: pd.DataFrame) -> str:
-    sample = df.sort_values(["datetime", "asset_id"]).reset_index(drop=True)
-    digest = hashlib.sha256()
-    digest.update(pd.util.hash_pandas_object(sample, index=True).values.tobytes())
-    return digest.hexdigest()
-
-
-def _json_safe(value: Any) -> Any:
-    """Recursively convert NaN/inf values into JSON-safe nulls."""
-    if isinstance(value, np.generic):
-        return _json_safe(value.item())
-    if isinstance(value, float):
-        if np.isnan(value) or np.isinf(value):
-            return None
-        return value
-    if isinstance(value, dict):
-        return {str(key): _json_safe(val) for key, val in value.items()}
-    if isinstance(value, (list, tuple)):
-        return [_json_safe(item) for item in value]
-    return value
-
-
-def _file_sha256(path: Path) -> str:
-    digest = hashlib.sha256()
-    with open(path, "rb") as fp:
-        for chunk in iter(lambda: fp.read(1024 * 1024), b""):
-            digest.update(chunk)
-    return digest.hexdigest()
-
-
-def _json_summary(path: Path) -> dict[str, Any] | None:
-    if not path.exists():
-        return None
-    try:
-        with open(path) as fp:
-            payload = json.load(fp)
-    except Exception as exc:  # pragma: no cover - defensive provenance capture
-        return {"path": str(path), "load_error": str(exc)}
-
-    if isinstance(payload, dict):
-        return payload
-    return {"path": str(path), "payload_type": type(payload).__name__}
-
-
-def _session_summary(path: Path) -> dict[str, Any] | None:
-    if not path.exists():
-        return None
-    try:
-        return MiningSession.load(path).get_summary()
-    except Exception as exc:  # pragma: no cover - defensive provenance capture
-        return {"path": str(path), "load_error": str(exc)}
-
-
-def _catalog_provenance(baseline: str, candidate_count: int, seed: int) -> dict[str, Any]:
-    return {
-        "kind": "catalog",
-        "source": baseline,
-        "candidate_count": candidate_count,
-        "seed": seed,
-    }
-
-
-def _saved_library_provenance(
-    requested_path: str,
-    baseline: str,
-) -> dict[str, Any]:
-    base_path = Path(_base_path(requested_path)).expanduser()
-    resolved_base = base_path.resolve() if base_path.exists() else base_path
-    library_json = resolved_base.with_suffix(".json")
-    signal_cache = Path(str(resolved_base) + "_signals.npz")
-    parent = resolved_base.parent
-
-    source_files: dict[str, dict[str, str]] = {}
-    for label, path in {
-        "library_json": library_json,
-        "signal_cache": signal_cache,
-        "session_json": parent / "session.json",
-        "session_log_json": parent / "session_log.json",
-        "checkpoint_session_json": parent / "checkpoint" / "session.json",
-        "checkpoint_loop_state_json": parent / "checkpoint" / "loop_state.json",
-        "checkpoint_memory_json": parent / "checkpoint" / "memory.json",
-    }.items():
-        if path.exists():
-            source_files[label] = {
-                "path": str(path),
-                "sha256": _file_sha256(path),
-            }
-
-    provenance: dict[str, Any] = {
-        "kind": "saved_library",
-        "source": baseline,
-        "requested_path": str(Path(requested_path)),
-        "resolved_base_path": str(resolved_base),
-        "source_files": source_files,
-        "library_summary": {},
-        "session_summary": _session_summary(parent / "session.json"),
-        "session_log_summary": _json_summary(parent / "session_log.json"),
-    }
-
-    if library_json.exists():
-        try:
-            library = load_library(resolved_base)
-        except Exception as exc:  # pragma: no cover - defensive provenance capture
-            provenance["library_summary"] = {
-                "path": str(library_json),
-                "load_error": str(exc),
-            }
-        else:
-            provenance["library_summary"] = {
-                "path": str(library_json),
-                "factor_count": library.size,
-                "diagnostics": library.get_diagnostics(),
-            }
-
-    return provenance
-
-
-def _baseline_provenance(
-    baseline: str,
-    *,
-    factor_miner_library_path: Optional[str] = None,
-    factor_miner_no_memory_library_path: Optional[str] = None,
-    candidate_count: int = 0,
-    seed: int = 0,
-) -> dict[str, Any]:
-    if baseline == "factor_miner" and factor_miner_library_path:
-        return _saved_library_provenance(factor_miner_library_path, baseline)
-    if baseline == "factor_miner_no_memory" and factor_miner_no_memory_library_path:
-        return _saved_library_provenance(
-            factor_miner_no_memory_library_path,
-            baseline,
-        )
-    return _catalog_provenance(baseline, candidate_count, seed)
-
-
-def _runtime_manifest_value(
-    runtime_manifests: Optional[dict[str, dict[str, Any]]],
-    baseline: str,
-) -> dict[str, Any]:
-    """Return the runtime manifest for one baseline if supplied."""
-    if not runtime_manifests:
-        return {}
-    value = runtime_manifests.get(baseline, {})
-    return dict(value) if isinstance(value, dict) else {}
-
-
-def _build_runtime_provider(cfg, *, mock: bool):
-    """Create the benchmark-time LLM provider."""
-    from factorminer.agent.llm_interface import MockProvider, create_provider
-
-    if mock or getattr(cfg.llm, "provider", "mock") == "mock":
-        return MockProvider()
-
-    provider_cfg = {
-        "provider": cfg.llm.provider,
-        "model": cfg.llm.model,
-    }
-    raw_llm_cfg = getattr(cfg, "_raw", {}).get("llm", {})
-    if raw_llm_cfg.get("api_key"):
-        provider_cfg["api_key"] = raw_llm_cfg["api_key"]
-    return create_provider(provider_cfg)
-
-
-def _filter_dataclass_kwargs(source, target_cls):
-    """Copy shared dataclass fields from one config object to another."""
-    from dataclasses import fields
-
-    target_fields = {f.name for f in fields(target_cls)}
-    source_fields = getattr(source, "__dataclass_fields__", {})
-    return {
-        name: getattr(source, name)
-        for name in source_fields
-        if name in target_fields
-    }
-
-
-def _build_phase2_runtime_kwargs(cfg) -> dict[str, Any]:
-    """Build runtime Phase 2 configs from the hierarchical benchmark config."""
-    from factorminer.evaluation.causal import CausalConfig as RuntimeCausalConfig
-    from factorminer.evaluation.capacity import CapacityConfig as RuntimeCapacityConfig
-    from factorminer.evaluation.regime import RegimeConfig as RuntimeRegimeConfig
-    from factorminer.evaluation.significance import (
-        SignificanceConfig as RuntimeSignificanceConfig,
-    )
-    from factorminer.agent.debate import DebateConfig as RuntimeDebateConfig
-    from factorminer.agent.specialists import DEFAULT_SPECIALISTS
-
-    debate_config = None
-    if cfg.phase2.debate.enabled:
-        requested = cfg.phase2.debate.num_specialists
-        selected = list(DEFAULT_SPECIALISTS[:requested])
-        if requested > len(DEFAULT_SPECIALISTS):
-            selected = list(DEFAULT_SPECIALISTS)
-        debate_config = RuntimeDebateConfig(
-            specialists=selected,
-            enable_critic=cfg.phase2.debate.enable_critic,
-            candidates_per_specialist=cfg.phase2.debate.candidates_per_specialist,
-            top_k_after_critic=cfg.phase2.debate.top_k_after_critic,
-            critic_temperature=cfg.phase2.debate.critic_temperature,
-        )
-
-    causal_config = None
-    if cfg.phase2.causal.enabled:
-        causal_config = RuntimeCausalConfig(
-            **_filter_dataclass_kwargs(cfg.phase2.causal, RuntimeCausalConfig)
-        )
-
-    regime_config = None
-    if cfg.phase2.regime.enabled:
-        regime_config = RuntimeRegimeConfig(
-            **_filter_dataclass_kwargs(cfg.phase2.regime, RuntimeRegimeConfig)
-        )
-
-    capacity_config = None
-    if cfg.phase2.capacity.enabled:
-        capacity_config = RuntimeCapacityConfig(
-            **_filter_dataclass_kwargs(cfg.phase2.capacity, RuntimeCapacityConfig)
-        )
-
-    significance_config = None
-    if cfg.phase2.significance.enabled:
-        significance_config = RuntimeSignificanceConfig(
-            **_filter_dataclass_kwargs(cfg.phase2.significance, RuntimeSignificanceConfig)
-        )
-
-    return {
-        "debate_config": debate_config,
-        "causal_config": causal_config,
-        "regime_config": regime_config,
-        "capacity_config": capacity_config,
-        "significance_config": significance_config,
-        "enable_knowledge_graph": bool(cfg.phase2.helix.enable_knowledge_graph),
-        "enable_embeddings": bool(cfg.phase2.helix.enable_embeddings),
-        "enable_auto_inventor": bool(cfg.phase2.auto_inventor.enabled),
-        "auto_invention_interval": int(cfg.phase2.auto_inventor.invention_interval),
-        "canonicalize": bool(cfg.phase2.helix.enable_canonicalization),
-        "forgetting_lambda": float(cfg.phase2.helix.forgetting_lambda),
-    }
-
-
-def _extract_volume_panel(dataset: EvaluationDataset) -> Optional[np.ndarray]:
-    """Best-effort extraction of a dollar-volume panel for Helix capacity checks."""
-    for key in ("$amt", "$volume"):
-        panel = dataset.data_dict.get(key)
-        if panel is not None and np.any(np.isfinite(panel)):
-            return np.asarray(panel, dtype=np.float64)
-    return None
-
-
-def _build_runtime_loop_config(
-    cfg,
-    *,
-    output_dir: Path,
-    dataset: EvaluationDataset,
-    mock: bool,
-    runtime_manifest: dict[str, Any],
-):
-    """Build the flat loop config consumed by RalphLoop/HelixLoop."""
-    from factorminer.core.config import MiningConfig as LoopMiningConfig
-
-    target_library_size = int(
-        runtime_manifest.get(
-            "target_library_size",
-            getattr(cfg.mining, "target_library_size", 110),
-        )
-    )
-    max_iterations = int(
-        runtime_manifest.get(
-            "max_iterations",
-            getattr(cfg.mining, "max_iterations", 200),
-        )
-    )
-    ic_threshold = float(
-        runtime_manifest.get(
-            "ic_threshold",
-            getattr(cfg.mining, "ic_threshold", 0.04),
-        )
-    )
-    icir_threshold = float(
-        runtime_manifest.get(
-            "icir_threshold",
-            getattr(cfg.mining, "icir_threshold", 0.5),
-        )
-    )
-    correlation_threshold = float(
-        runtime_manifest.get(
-            "correlation_threshold",
-            getattr(cfg.mining, "correlation_threshold", 0.5),
-        )
-    )
-    replacement_ic_min = float(
-        runtime_manifest.get(
-            "replacement_ic_min",
-            getattr(cfg.mining, "replacement_ic_min", 0.10),
-        )
-    )
-    replacement_ic_ratio = float(
-        runtime_manifest.get(
-            "replacement_ic_ratio",
-            getattr(cfg.mining, "replacement_ic_ratio", 1.3),
-        )
-    )
-
-    if runtime_manifest.get("relax_thresholds", mock):
-        ic_threshold = min(ic_threshold, 0.0)
-        icir_threshold = min(icir_threshold, -1.0)
-        correlation_threshold = max(correlation_threshold, 1.1)
-
-    loop_cfg = LoopMiningConfig(
-        target_library_size=target_library_size,
-        batch_size=int(
-            runtime_manifest.get("batch_size", getattr(cfg.mining, "batch_size", 40))
-        ),
-        max_iterations=max_iterations,
-        ic_threshold=ic_threshold,
-        icir_threshold=icir_threshold,
-        correlation_threshold=correlation_threshold,
-        replacement_ic_min=replacement_ic_min,
-        replacement_ic_ratio=replacement_ic_ratio,
-        fast_screen_assets=int(
-            runtime_manifest.get(
-                "fast_screen_assets",
-                getattr(cfg.evaluation, "fast_screen_assets", 100),
-            )
-        ),
-        num_workers=int(
-            runtime_manifest.get(
-                "num_workers", getattr(cfg.evaluation, "num_workers", 1)
-            )
-        ),
-        output_dir=str(output_dir),
-        backend=str(
-            runtime_manifest.get(
-                "backend", getattr(cfg.evaluation, "backend", "numpy")
-            )
-        ),
-        gpu_device=str(
-            runtime_manifest.get(
-                "gpu_device", getattr(cfg.evaluation, "gpu_device", "cuda:0")
-            )
-        ),
-        signal_failure_policy=str(
-            runtime_manifest.get(
-                "signal_failure_policy",
-                "synthetic" if mock else getattr(cfg.evaluation, "signal_failure_policy", "reject"),
-            )
-        ),
-    )
-
-    loop_cfg.research = cfg.research
-    loop_cfg.benchmark_mode = str(getattr(cfg.benchmark, "mode", "paper"))
-    loop_cfg.target_panels = dataset.target_panels
-    loop_cfg.target_horizons = {
-        name: max(int(spec.holding_bars), 1)
-        for name, spec in dataset.target_specs.items()
-    }
-    return loop_cfg
-
-
-def _cfg_for_runtime_baseline(cfg, baseline: str):
-    """Project the hierarchical config into one runtime benchmark variant."""
-    runtime_cfg = _clone_cfg(cfg)
-
-    # Start from a clean phase-2 surface so variants are explicit.
-    runtime_cfg.phase2.causal.enabled = False
-    runtime_cfg.phase2.regime.enabled = False
-    runtime_cfg.phase2.capacity.enabled = False
-    runtime_cfg.phase2.significance.enabled = False
-    runtime_cfg.phase2.debate.enabled = False
-    runtime_cfg.phase2.auto_inventor.enabled = False
-    runtime_cfg.phase2.helix.enabled = False
-    runtime_cfg.phase2.helix.enable_knowledge_graph = False
-    runtime_cfg.phase2.helix.enable_embeddings = False
-    runtime_cfg.phase2.helix.enable_canonicalization = False
-
-    if baseline in {"ralph_loop", "factor_miner", "factor_miner_no_memory"}:
-        runtime_cfg.benchmark.mode = "paper"
-        return runtime_cfg
-
-    runtime_cfg.benchmark.mode = "research"
-    runtime_cfg.phase2.helix.enabled = True
-    runtime_cfg.phase2.helix.enable_canonicalization = True
-    runtime_cfg.phase2.helix.enable_knowledge_graph = True
-    runtime_cfg.phase2.helix.enable_embeddings = True
-    runtime_cfg.phase2.debate.enabled = True
-    runtime_cfg.phase2.regime.enabled = True
-    runtime_cfg.phase2.capacity.enabled = True
-    runtime_cfg.phase2.significance.enabled = True
-
-    if baseline == "helix_no_memory":
-        runtime_cfg.phase2.helix.enable_knowledge_graph = False
-        runtime_cfg.phase2.helix.enable_embeddings = False
-    elif baseline == "helix_no_debate":
-        runtime_cfg.phase2.debate.enabled = False
-    elif baseline == "helix_no_significance":
-        runtime_cfg.phase2.significance.enabled = False
-    elif baseline == "helix_no_capacity":
-        runtime_cfg.phase2.capacity.enabled = False
-    elif baseline == "helix_no_regime":
-        runtime_cfg.phase2.regime.enabled = False
-
-    return runtime_cfg
-
-
-def _real_mining_loop_type(baseline: str, runtime_manifest: dict[str, Any]) -> str:
-    """Resolve the loop type for a runtime mining request."""
-    loop_type = str(runtime_manifest.get("loop_type", "")).strip().lower()
-    if loop_type in {"ralph", "helix"}:
-        return loop_type
-    if baseline in {"helix_phase2", "helix_no_memory", "helix_no_debate", "helix_no_significance", "helix_no_capacity", "helix_no_regime"}:
-        return "helix"
-    if baseline in {"factor_miner", "factor_miner_no_memory", "ralph_loop"}:
-        return "ralph"
-    return "ralph"
-
-
-def _runtime_loop_provenance(
-    *,
-    baseline: str,
-    loop_type: str,
-    runtime_manifest: dict[str, Any],
-    runtime_output_dir: Path,
-) -> dict[str, Any]:
-    """Summarize the real mining run used to source benchmark factors."""
-    library_json = runtime_output_dir / "factor_library.json"
-    run_manifest = runtime_output_dir / "run_manifest.json"
-    session_json = runtime_output_dir / "session.json"
-    session_log_json = runtime_output_dir / "session_log.json"
-    checkpoint_dir = runtime_output_dir / "checkpoint"
-
-    source_files: dict[str, dict[str, str]] = {}
-    for label, path in {
-        "library_json": library_json,
-        "run_manifest_json": run_manifest,
-        "session_json": session_json,
-        "session_log_json": session_log_json,
-        "checkpoint_library_json": checkpoint_dir / "library.json",
-        "checkpoint_run_manifest_json": checkpoint_dir / "run_manifest.json",
-        "checkpoint_session_json": checkpoint_dir / "session.json",
-        "checkpoint_loop_state_json": checkpoint_dir / "loop_state.json",
-    }.items():
-        if path.exists():
-            source_files[label] = {
-                "path": str(path),
-                "sha256": _file_sha256(path),
-            }
-
-    provenance: dict[str, Any] = {
-        "kind": "runtime_loop",
-        "source": baseline,
-        "loop_type": loop_type,
-        "requested_runtime_manifest": _json_safe(runtime_manifest),
-        "runtime_output_dir": str(runtime_output_dir),
-        "source_files": source_files,
-        "run_manifest_summary": _json_summary(run_manifest),
-        "session_summary": _session_summary(session_json),
-        "session_log_summary": _json_summary(session_log_json),
-        "library_summary": {},
-    }
-
-    if library_json.exists():
-        try:
-            library = load_library(runtime_output_dir / "factor_library")
-        except Exception as exc:  # pragma: no cover - defensive provenance capture
-            provenance["library_summary"] = {
-                "path": str(library_json),
-                "load_error": str(exc),
-            }
-        else:
-            provenance["library_summary"] = {
-                "path": str(library_json),
-                "factor_count": library.size,
-                "diagnostics": library.get_diagnostics(),
-            }
-
-    return provenance
-
-
-def _run_runtime_mining_loop(
-    cfg,
-    *,
-    baseline: str,
-    dataset: EvaluationDataset,
-    output_dir: Path,
-    runtime_manifest: Optional[dict[str, Any]] = None,
-    mock: bool = False,
-) -> dict[str, Any]:
-    """Run a real RalphLoop/HelixLoop and return its factor library."""
-    runtime_manifest = dict(runtime_manifest or {})
-    loop_type = _real_mining_loop_type(baseline, runtime_manifest)
-    runtime_output_dir = _ensure_dir(output_dir / "benchmark" / "table1" / baseline / "runtime")
-    runtime_cfg = _cfg_for_runtime_baseline(cfg, baseline)
-    loop_cfg = _build_runtime_loop_config(
-        runtime_cfg,
-        output_dir=runtime_output_dir,
-        dataset=dataset,
-        mock=mock or bool(runtime_manifest.get("mock", False)),
-        runtime_manifest=runtime_manifest,
-    )
-    provider = _build_runtime_provider(runtime_cfg, mock=mock or bool(runtime_manifest.get("mock", False)))
-
-    if loop_type == "helix":
-        from factorminer.core.helix_loop import HelixLoop
-
-        phase2_kwargs = _build_phase2_runtime_kwargs(runtime_cfg)
-        loop = HelixLoop(
-            config=loop_cfg,
-            data_tensor=dataset.data_tensor,
-            returns=dataset.returns,
-            llm_provider=provider,
-            volume=_extract_volume_panel(dataset),
-            **phase2_kwargs,
-        )
-    else:
-        from factorminer.core.ralph_loop import RalphLoop
-
-        loop = RalphLoop(
-            config=loop_cfg,
-            data_tensor=dataset.data_tensor,
-            returns=dataset.returns,
-            llm_provider=provider,
-        )
-
-    checkpoint_interval = int(runtime_manifest.get("checkpoint_interval", 0 if mock else 1))
-    loop.checkpoint_interval = checkpoint_interval
-
-    if runtime_manifest.get("checkpoint_path"):
-        loop.load_session(str(runtime_manifest["checkpoint_path"]))
-
-    target_size = int(runtime_manifest.get("target_library_size", loop_cfg.target_library_size))
-    max_iterations = int(runtime_manifest.get("max_iterations", loop_cfg.max_iterations))
-    library = loop.run(target_size=target_size, max_iterations=max_iterations)
-    provenance = _runtime_loop_provenance(
-        baseline=baseline,
-        loop_type=loop_type,
-        runtime_manifest={**runtime_manifest, "target_library_size": target_size, "max_iterations": max_iterations},
-        runtime_output_dir=runtime_output_dir,
-    )
-    return {
-        "baseline": baseline,
-        "loop_type": loop_type,
-        "library": library,
-        "provenance": provenance,
-        "runtime_output_dir": str(runtime_output_dir),
-        "target_library_size": target_size,
-        "max_iterations": max_iterations,
-    }
-
-
-def load_benchmark_dataset(
-    cfg,
-    *,
-    data_path: Optional[str] = None,
-    raw_df: Optional[pd.DataFrame] = None,
-    universe: Optional[str] = None,
-    mock: bool = False,
-) -> tuple[EvaluationDataset, str]:
-    """Load one universe into the canonical runtime dataset."""
-    if universe is None:
-        universe = cfg.data.universe
-
-    if raw_df is None:
-        if mock:
-            from factorminer.data.mock_data import MockConfig, generate_mock_data
-
-            mock_cfg = MockConfig(
-                num_assets=64 if universe.lower() == "binance" else 80,
-                num_periods=12_200,
-                frequency="10min",
-                start_date="2024-01-02 09:30:00",
-                universe=universe,
-                plant_alpha=True,
-                seed=cfg.benchmark.seed,
-            )
-            raw_df = generate_mock_data(mock_cfg)
-        else:
-            path = data_path
-            if path is None:
-                path = getattr(cfg, "_raw", {}).get("data_path")
-            if path is None:
-                raise ValueError("No data path specified for benchmark run")
-            from factorminer.data.loader import load_market_data
-
-            raw_df = load_market_data(path, universe=universe)
-
-    dataset_cfg = _cfg_with_overrides(cfg, universe)
-    return load_runtime_dataset(raw_df, dataset_cfg), _data_hash(raw_df)
-
-
-def _factors_from_entries(entries: Iterable[CandidateEntry]) -> list[Factor]:
-    return [
-        Factor(
-            id=idx + 1,
-            name=entry.name,
-            formula=entry.formula,
-            category=entry.category,
-            ic_mean=0.0,
-            icir=0.0,
-            ic_win_rate=0.0,
-            max_correlation=0.0,
-            batch_number=0,
-        )
-        for idx, entry in enumerate(entries)
-    ]
-
-
-def _get_baseline_entries(
-    baseline: str,
-    seed: int,
-    *,
-    factor_miner_library_path: Optional[str] = None,
-    factor_miner_no_memory_library_path: Optional[str] = None,
-) -> list[CandidateEntry]:
-    if baseline == "alpha101_classic":
-        return dedupe_entries(ALPHA101_CLASSIC)
-    if baseline == "alpha101_adapted":
-        return dedupe_entries(build_alpha101_adapted())
-    if baseline == "random_exploration":
-        return dedupe_entries(build_random_exploration(seed))
-    if baseline == "gplearn":
-        return dedupe_entries(build_gplearn_style(seed))
-    if baseline == "alphaforge_style":
-        return dedupe_entries(build_alphaforge_style())
-    if baseline == "alphaagent_style":
-        return dedupe_entries(build_alphaagent_style())
-    if baseline == "factor_miner":
-        if factor_miner_library_path:
-            return dedupe_entries(entries_from_library(load_library(_base_path(factor_miner_library_path))))
-        return dedupe_entries(build_factor_miner_catalog())
-    if baseline == "factor_miner_no_memory":
-        if factor_miner_no_memory_library_path:
-            return dedupe_entries(entries_from_library(load_library(_base_path(factor_miner_no_memory_library_path))))
-        return dedupe_entries(build_random_exploration(seed + 101, count=200))
-    raise KeyError(f"Unknown benchmark baseline: {baseline}")
-
-
-def _base_path(path: str) -> str:
-    p = Path(path)
-    return str(p.with_suffix("")) if p.suffix == ".json" else str(p)
-
-
-def build_benchmark_library(
-    artifacts: Iterable[FactorEvaluationArtifact],
-    cfg,
-    *,
-    split_name: str = "train",
-    ic_threshold: Optional[float] = None,
-    correlation_threshold: Optional[float] = None,
-) -> tuple[FactorLibrary, dict[str, int]]:
-    """Build a library from candidate artifacts under the paper admission rules."""
-    ic_threshold = cfg.mining.ic_threshold if ic_threshold is None else ic_threshold
-    correlation_threshold = (
-        cfg.mining.correlation_threshold
-        if correlation_threshold is None
-        else correlation_threshold
-    )
-    library = FactorLibrary(
-        correlation_threshold=correlation_threshold,
-        ic_threshold=ic_threshold,
-    )
-
-    stats = {
-        "succeeded": 0,
-        "admitted": 0,
-        "replaced": 0,
-        "threshold_rejections": 0,
-        "correlation_rejections": 0,
-    }
-
-    ordered = [artifact for artifact in artifacts if artifact.succeeded]
-    ordered.sort(
-        key=lambda artifact: artifact.split_stats[split_name]["ic_abs_mean"],
-        reverse=True,
-    )
-    stats["succeeded"] = len(ordered)
-
-    for artifact in ordered:
-        split_stats = artifact.split_stats[split_name]
-        candidate_ic = float(split_stats["ic_abs_mean"])
-        candidate_signals = artifact.split_signals[split_name]
-        if candidate_ic < ic_threshold:
-            stats["threshold_rejections"] += 1
-            continue
-
-        max_corr = (
-            library._max_correlation_with_library(candidate_signals)  # noqa: SLF001
-            if library.size
-            else 0.0
-        )
-        factor = Factor(
-            id=0,
-            name=artifact.name,
-            formula=artifact.formula,
-            category=artifact.category,
-            ic_mean=candidate_ic,
-            icir=abs(float(split_stats["icir"])),
-            ic_win_rate=float(split_stats["ic_win_rate"]),
-            max_correlation=max_corr,
-            batch_number=0,
-            signals=candidate_signals,
-        )
-        admitted, _ = library.check_admission(candidate_ic, candidate_signals)
-        if admitted:
-            library.admit_factor(factor)
-            stats["admitted"] += 1
-            continue
-
-        replace, replace_id, _ = library.check_replacement(
-            candidate_ic,
-            candidate_signals,
-            ic_min=cfg.mining.replacement_ic_min,
-            ic_ratio=cfg.mining.replacement_ic_ratio,
-        )
-        if replace and replace_id is not None:
-            library.replace_factor(replace_id, factor)
-            stats["replaced"] += 1
-            continue
-
-        stats["correlation_rejections"] += 1
-
-    return library, stats
-
-
-def select_frozen_top_k(
-    artifacts: Iterable[FactorEvaluationArtifact],
-    library: FactorLibrary,
-    *,
-    top_k: int,
-    split_name: str = "train",
-    min_ic: float = 0.05,
-    min_icir: float = 0.5,
-) -> list[FactorEvaluationArtifact]:
-    """Freeze the paper Top-K set from train-split recomputed metrics."""
-    admitted_formulas = {factor.formula for factor in library.list_factors()}
-    succeeded = [artifact for artifact in artifacts if artifact.succeeded]
-    admitted = [
-        artifact
-        for artifact in succeeded
-        if artifact.formula in admitted_formulas
-        and artifact.split_stats[split_name]["ic_abs_mean"] >= min_ic
-        and abs(artifact.split_stats[split_name]["icir"]) >= min_icir
-    ]
-    admitted.sort(
-        key=lambda artifact: artifact.split_stats[split_name]["ic_abs_mean"],
-        reverse=True,
-    )
-    selected: list[FactorEvaluationArtifact] = admitted[:top_k]
-    selected_formulas = {artifact.formula for artifact in selected}
-
-    if len(selected) < top_k:
-        remainder = [
-            artifact
-            for artifact in succeeded
-            if artifact.formula not in selected_formulas
-        ]
-        remainder.sort(
-            key=lambda artifact: artifact.split_stats[split_name]["ic_abs_mean"],
-            reverse=True,
-        )
-        selected.extend(remainder[: top_k - len(selected)])
-
-    return selected
-
-
-def _abs_icir_from_series(ic_series: np.ndarray) -> float:
-    valid = ic_series[np.isfinite(ic_series)]
-    if len(valid) < 3:
-        return 0.0
-    std = float(np.std(valid, ddof=1))
-    if std < 1e-12:
-        return 0.0
-    return abs(float(np.mean(valid))) / std
-
-
-def _normalize_backtest_stats(stats: dict) -> dict[str, float]:
-    ic_series = np.asarray(stats.get("ic_series", []), dtype=np.float64)
-    return {
-        "ic": abs(float(stats.get("ic_mean", 0.0))),
-        "icir": _abs_icir_from_series(ic_series),
-        "ic_win_rate": float(stats.get("ic_win_rate", 0.0)),
-        "long_short": float(stats.get("ls_return", 0.0)),
-        "monotonicity": float(stats.get("monotonicity", 0.0)),
-        "turnover": float(stats.get("avg_turnover", 0.0)),
-    }
-
-
-def _avg_abs_rho(artifacts: list[FactorEvaluationArtifact], split_name: str) -> float:
-    if len(artifacts) < 2:
-        return 0.0
-    corr = np.abs(compute_correlation_matrix(artifacts, split_name))
-    upper = corr[np.triu_indices_from(corr, k=1)]
-    return float(np.mean(upper)) if upper.size else 0.0
-
-
-def _weighted_composite(
-    factor_signals: dict[int, np.ndarray],
-    weights: dict[int, float],
-) -> np.ndarray:
-    ordered = [(fid, factor_signals[fid], weights.get(fid, 0.0)) for fid in factor_signals]
-    if not ordered:
-        raise ValueError("Cannot build weighted composite from zero factors")
-    total = sum(abs(weight) for _, _, weight in ordered)
-    if total < 1e-12:
-        total = float(len(ordered))
-        ordered = [(fid, signal, 1.0) for fid, signal, _ in ordered]
-    composite = np.zeros_like(ordered[0][1], dtype=np.float64)
-    for _, signal, weight in ordered:
-        composite += signal * (weight / total)
-    return composite
-
-
-def evaluate_frozen_set(
-    frozen: list[FactorEvaluationArtifact],
-    dataset: EvaluationDataset,
-    *,
-    split_name: str = "test",
-    fit_split: str = "train",
-    cost_bps: Optional[list[float]] = None,
-) -> dict:
-    """Evaluate one frozen factor set on one universe."""
-    if cost_bps is None:
-        cost_bps = [1.0, 4.0, 7.0, 10.0, 11.0]
-
-    factors = _factors_from_entries(
-        CandidateEntry(
-            name=artifact.name,
-            formula=artifact.formula,
-            category=artifact.category,
-        )
-        for artifact in frozen
-    )
-    artifacts = evaluate_factors(factors, dataset, signal_failure_policy="reject")
-    succeeded = [artifact for artifact in artifacts if artifact.succeeded]
-
-    result = {
-        "factor_count": len(succeeded),
-        "library": {
-            "ic": 0.0,
-            "icir": 0.0,
-            "avg_abs_rho": 0.0,
-        },
-        "combinations": {},
-        "selections": {},
-        "warnings": [],
-    }
-    if not succeeded:
-        result["warnings"].append("No frozen factors recomputed successfully on this universe")
-        return result
-
-    result["library"] = {
-        "ic": float(np.mean([artifact.split_stats[split_name]["ic_abs_mean"] for artifact in succeeded])),
-        "icir": float(np.mean([abs(artifact.split_stats[split_name]["icir"]) for artifact in succeeded])),
-        "avg_abs_rho": _avg_abs_rho(succeeded, split_name),
-    }
-
-    artifact_map = {artifact.factor_id: artifact for artifact in succeeded}
-    fit_signals = {artifact.factor_id: artifact.split_signals[fit_split].T for artifact in succeeded}
-    eval_signals = {artifact.factor_id: artifact.split_signals[split_name].T for artifact in succeeded}
-    fit_returns = dataset.get_split(fit_split).returns.T
-    eval_returns = dataset.get_split(split_name).returns.T
-
-    from factorminer.evaluation.combination import FactorCombiner
-    from factorminer.evaluation.portfolio import PortfolioBacktester
-    from factorminer.evaluation.selection import FactorSelector
-
-    combiner = FactorCombiner()
-    backtester = PortfolioBacktester()
-    selector = FactorSelector()
-
-    fit_ic_values = {
-        artifact.factor_id: artifact.split_stats[fit_split]["ic_mean"]
-        for artifact in succeeded
-    }
-
-    combos = {
-        "equal_weight": combiner.equal_weight(eval_signals),
-        "ic_weighted": combiner.ic_weighted(eval_signals, fit_ic_values),
-        "orthogonal": combiner.orthogonal(eval_signals),
-    }
-    for name, composite in combos.items():
-        stats = backtester.quintile_backtest(composite, eval_returns)
-        result["combinations"][name] = _normalize_backtest_stats(stats)
-        result["combinations"][name]["ic_series"] = _json_safe(
-            np.asarray(stats.get("ic_series", []), dtype=np.float64).tolist()
-        )
-        result["combinations"][name]["turnover_series"] = _json_safe(
-            np.asarray(stats.get("turnover_series", []), dtype=np.float64).tolist()
-        )
-        result["combinations"][name]["cost_pressure"] = {
-            str(cost): _normalize_backtest_stats(
-                backtester.quintile_backtest(
-                    composite, eval_returns, transaction_cost_bps=float(cost)
-                )
-            )
-            for cost in cost_bps
-        }
-
-    selection_specs = {}
-    try:
-        selection_specs["lasso"] = selector.lasso_selection(fit_signals, fit_returns)
-    except Exception as exc:
-        result["warnings"].append(f"lasso unavailable: {exc}")
-    try:
-        selection_specs["forward_stepwise"] = selector.forward_stepwise(fit_signals, fit_returns)
-    except Exception as exc:
-        result["warnings"].append(f"forward_stepwise unavailable: {exc}")
-    try:
-        selection_specs["xgboost"] = selector.xgboost_selection(fit_signals, fit_returns)
-    except Exception as exc:
-        result["warnings"].append(f"xgboost unavailable: {exc}")
-
-    for name, ranking in selection_specs.items():
-        if not ranking:
-            result["selections"][name] = {"factor_count": 0}
-            continue
-        selected_ids = [factor_id for factor_id, _ in ranking]
-        selected_eval = {factor_id: eval_signals[factor_id] for factor_id in selected_ids}
-        if name == "lasso":
-            weights = {factor_id: score for factor_id, score in ranking}
-            composite = _weighted_composite(selected_eval, weights)
-        elif name == "xgboost":
-            weights = {
-                factor_id: score * np.sign(artifact_map[factor_id].split_stats[fit_split]["ic_mean"] or 1.0)
-                for factor_id, score in ranking
-            }
-            composite = _weighted_composite(selected_eval, weights)
-        else:
-            signs = {
-                factor_id: np.sign(artifact_map[factor_id].split_stats[fit_split]["ic_mean"] or 1.0)
-                for factor_id in selected_ids
-            }
-            composite = _weighted_composite(selected_eval, signs)
-        stats = backtester.quintile_backtest(composite, eval_returns)
-        result["selections"][name] = {
-            "factor_count": len(selected_ids),
-            **_normalize_backtest_stats(stats),
-            "ic_series": _json_safe(
-                np.asarray(stats.get("ic_series", []), dtype=np.float64).tolist()
-            ),
-            "turnover_series": _json_safe(
-                np.asarray(stats.get("turnover_series", []), dtype=np.float64).tolist()
-            ),
-        }
-
-    return result
-
-
-def _ensure_dir(path: Path) -> Path:
-    path.mkdir(parents=True, exist_ok=True)
-    return path
-
-
-def _write_json(path: Path, payload: dict) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    with open(path, "w") as fp:
-        json.dump(_json_safe(payload), fp, indent=2, sort_keys=False, allow_nan=False)
-
-
-def _save_manifest(path: Path, manifest: BenchmarkManifest) -> None:
-    _write_json(path, asdict(manifest))
-
-
-def run_table1_benchmark(
-    cfg,
-    output_dir: Path,
-    *,
-    data_path: Optional[str] = None,
-    raw_df: Optional[pd.DataFrame] = None,
-    mock: bool = False,
-    baseline_names: Optional[list[str]] = None,
-    factor_miner_library_path: Optional[str] = None,
-    factor_miner_no_memory_library_path: Optional[str] = None,
-    runtime_manifests: Optional[dict[str, dict[str, Any]]] = None,
-    use_runtime_loops: bool = False,
-) -> dict:
-    """Run the strict Top-K freeze benchmark across all configured universes."""
-    if runtime_manifests is None:
-        runtime_manifests = getattr(cfg.benchmark, "runtime_manifests", None)
-    use_runtime_loops = bool(
-        use_runtime_loops
-        or getattr(cfg.benchmark, "runtime_loops", False)
-        or runtime_manifests
-    )
-    benchmark_dir = _ensure_dir(output_dir / "benchmark" / "table1")
-    baseline_names = baseline_names or list(cfg.benchmark.baselines)
-    freeze_cfg = _cfg_with_overrides(cfg, cfg.benchmark.freeze_universe)
-    freeze_dataset, freeze_hash = load_benchmark_dataset(
-        freeze_cfg,
-        data_path=data_path,
-        raw_df=raw_df,
-        universe=cfg.benchmark.freeze_universe,
-        mock=mock,
-    )
-
-    summary: dict[str, dict] = {}
-    for baseline in baseline_names:
-        runtime_manifest = _runtime_manifest_value(runtime_manifests, baseline)
-        runtime_baseline = bool(runtime_manifest) or (
-            use_runtime_loops
-            and baseline in (RUNTIME_LOOP_BASELINES | {"factor_miner", "factor_miner_no_memory"})
-        )
-
-        if runtime_baseline:
-            runtime_result = _run_runtime_mining_loop(
-                cfg,
-                baseline=baseline,
-                dataset=freeze_dataset,
-                output_dir=output_dir,
-                runtime_manifest=runtime_manifest,
-                mock=mock,
-            )
-            factors = list(runtime_result["library"].list_factors())
-            provenance = runtime_result["provenance"]
-            candidate_count = len(factors)
-        else:
-            entries = _get_baseline_entries(
-                baseline,
-                cfg.benchmark.seed,
-                factor_miner_library_path=factor_miner_library_path,
-                factor_miner_no_memory_library_path=factor_miner_no_memory_library_path,
-            )
-            factors = _factors_from_entries(entries)
-            provenance = _baseline_provenance(
-                baseline,
-                factor_miner_library_path=factor_miner_library_path,
-                factor_miner_no_memory_library_path=factor_miner_no_memory_library_path,
-                candidate_count=len(entries),
-                seed=cfg.benchmark.seed,
-            )
-            candidate_count = len(entries)
-
-        artifacts = evaluate_factors(
-            factors,
-            freeze_dataset,
-            signal_failure_policy="reject",
-        )
-
-        library_cfg = _cfg_with_overrides(cfg, cfg.benchmark.freeze_universe)
-        if baseline == "factor_miner_no_memory":
-            library_cfg.mining.ic_threshold = 0.02
-            library_cfg.mining.correlation_threshold = 0.85
-        library, library_stats = build_benchmark_library(
-            artifacts,
-            library_cfg,
-            split_name="train",
-            ic_threshold=library_cfg.mining.ic_threshold,
-            correlation_threshold=library_cfg.mining.correlation_threshold,
-        )
-        frozen = select_frozen_top_k(
-            artifacts,
-            library,
-            top_k=cfg.benchmark.freeze_top_k,
-            split_name="train",
-        )
-
-        baseline_result = {
-            "baseline": baseline,
-            "mode": cfg.benchmark.mode,
-            "freeze_universe": cfg.benchmark.freeze_universe,
-            "candidate_count": candidate_count,
-            "freeze_library_size": library.size,
-            "freeze_stats": library_stats,
-            "frozen_top_k": [
-                {
-                    "name": artifact.name,
-                    "formula": artifact.formula,
-                    "category": artifact.category,
-                    "train_ic": artifact.split_stats["train"]["ic_abs_mean"],
-                    "train_icir": abs(artifact.split_stats["train"]["icir"]),
-                }
-                for artifact in frozen
-            ],
-            "universes": {},
-        }
-
-        dataset_hashes = {cfg.benchmark.freeze_universe: freeze_hash}
-        for universe in cfg.benchmark.report_universes:
-            universe_cfg = _cfg_with_overrides(cfg, universe)
-            dataset, dataset_hash = load_benchmark_dataset(
-                universe_cfg,
-                data_path=data_path,
-                raw_df=raw_df,
-                universe=universe,
-                mock=mock,
-            )
-            dataset_hashes[universe] = dataset_hash
-            baseline_result["universes"][universe] = evaluate_frozen_set(
-                frozen,
-                dataset,
-                split_name="test",
-                fit_split="train",
-                cost_bps=list(cfg.benchmark.cost_bps),
-            )
-
-        result_path = benchmark_dir / f"{baseline}.json"
-        manifest_path = benchmark_dir / f"{baseline}_manifest.json"
-        baseline_result["provenance"] = provenance
-        _write_json(result_path, baseline_result)
-        manifest = BenchmarkManifest(
-            benchmark_name="table1",
-            mode=cfg.benchmark.mode,
-            seed=cfg.benchmark.seed,
-            baseline=baseline,
-            freeze_universe=cfg.benchmark.freeze_universe,
-            report_universes=list(cfg.benchmark.report_universes),
-            train_period=list(cfg.data.train_period),
-            test_period=list(cfg.data.test_period),
-            freeze_top_k=cfg.benchmark.freeze_top_k,
-            signal_failure_policy="reject",
-            default_target=cfg.data.default_target,
-            target_stack=[target.get("name", "") for target in cfg.data.targets],
-            primary_objective=cfg.research.primary_objective,
-            dataset_hashes=dataset_hashes,
-            artifact_paths={
-                "result": str(result_path),
-                "manifest": str(manifest_path),
-            },
-            runtime_contract=runtime_manifest,
-            baseline_provenance={baseline: provenance},
-            warnings=[],
-        )
-        _save_manifest(manifest_path, manifest)
-        summary[baseline] = baseline_result
-
-    return summary
-
-
-def run_ablation_memory_benchmark(
-    cfg,
-    output_dir: Path,
-    *,
-    data_path: Optional[str] = None,
-    raw_df: Optional[pd.DataFrame] = None,
-    mock: bool = False,
-    factor_miner_library_path: Optional[str] = None,
-    factor_miner_no_memory_library_path: Optional[str] = None,
-    runtime_manifests: Optional[dict[str, dict[str, Any]]] = None,
-) -> dict:
-    """Compare the default FactorMiner lane to the relaxed no-memory lane."""
-    use_runtime_loops = bool(
-        runtime_manifests or getattr(cfg.benchmark, "runtime_loops", False)
-    )
-    comparison = run_table1_benchmark(
-        cfg,
-        output_dir,
-        data_path=data_path,
-        raw_df=raw_df,
-        mock=mock,
-        baseline_names=["factor_miner", "factor_miner_no_memory"],
-        factor_miner_library_path=factor_miner_library_path,
-        factor_miner_no_memory_library_path=factor_miner_no_memory_library_path,
-        runtime_manifests=runtime_manifests,
-        use_runtime_loops=use_runtime_loops,
-    )
-    result = {}
-    for baseline, payload in comparison.items():
-        freeze_stats = payload["freeze_stats"]
-        succeeded = max(freeze_stats.get("succeeded", 0), 1)
-        result[baseline] = {
-            "library_size": payload["freeze_library_size"],
-            "high_quality_yield": freeze_stats.get("admitted", 0) / succeeded,
-            "redundancy_rejection_rate": freeze_stats.get("correlation_rejections", 0) / succeeded,
-            "replacements": freeze_stats.get("replaced", 0),
-        }
-    out_path = _ensure_dir(output_dir / "benchmark" / "ablation") / "memory_ablation.json"
-    _write_json(out_path, result)
-    return result
-
-
-def run_cost_pressure_benchmark(
-    cfg,
-    output_dir: Path,
-    *,
-    baseline: str = "factor_miner",
-    data_path: Optional[str] = None,
-    raw_df: Optional[pd.DataFrame] = None,
-    mock: bool = False,
-    factor_miner_library_path: Optional[str] = None,
-    runtime_manifests: Optional[dict[str, dict[str, Any]]] = None,
-) -> dict:
-    """Run cost-pressure analysis for one baseline on the configured universes."""
-    use_runtime_loops = bool(
-        runtime_manifests or getattr(cfg.benchmark, "runtime_loops", False)
-    )
-    payload = run_table1_benchmark(
-        cfg,
-        output_dir,
-        data_path=data_path,
-        raw_df=raw_df,
-        mock=mock,
-        baseline_names=[baseline],
-        factor_miner_library_path=factor_miner_library_path,
-        runtime_manifests=runtime_manifests,
-        use_runtime_loops=use_runtime_loops,
-    )[baseline]
-    result = {
-        universe: {
-            "combinations": {
-                name: metrics.get("cost_pressure", {})
-                for name, metrics in universe_payload["combinations"].items()
-            }
-        }
-        for universe, universe_payload in payload["universes"].items()
-    }
-    out_path = _ensure_dir(output_dir / "benchmark" / "cost_pressure") / f"{baseline}.json"
-    _write_json(out_path, result)
-    return result
-
-
-def _time_callable(fn, repeats: int = 3) -> float:
-    timings: list[float] = []
-    for _ in range(repeats):
-        start = time.perf_counter()
-        fn()
-        timings.append(time.perf_counter() - start)
-    return min(timings) * 1000.0
-
-
-def run_efficiency_benchmark(cfg, output_dir: Path) -> dict:
-    """Benchmark operator-level and factor-level compute time."""
-    periods, assets = cfg.benchmark.efficiency_panel_shape
-    matrix = np.random.RandomState(cfg.benchmark.seed).randn(assets, periods).astype(np.float64)
-    other = np.random.RandomState(cfg.benchmark.seed + 1).randn(assets, periods).astype(np.float64)
-
-    from factorminer.operators import torch_available
-    from factorminer.operators.gpu_backend import to_tensor
-    from factorminer.operators.registry import execute_operator
-    from factorminer.utils.visualization import plot_efficiency_benchmark
-
-    operator_bench: dict[str, dict[str, float | None]] = {"numpy": {}, "c": {}, "gpu": {}}
-    def _backend_inputs(backend: str):
-        if backend == "gpu":
-            return to_tensor(matrix), to_tensor(other)
-        return matrix, other
-
-    operators = {
-        "Add": lambda backend: execute_operator("Add", *_backend_inputs(backend), backend=backend),
-        "Mean": lambda backend: execute_operator("Mean", _backend_inputs(backend)[0], params={"window": 20}, backend=backend),
-        "Delta": lambda backend: execute_operator("Delta", _backend_inputs(backend)[0], params={"window": 5}, backend=backend),
-        "TsRank": lambda backend: execute_operator("TsRank", _backend_inputs(backend)[0], params={"window": 20}, backend=backend),
-        "Corr": lambda backend: execute_operator("Corr", *_backend_inputs(backend), params={"window": 20}, backend=backend),
-        "CsRank": lambda backend: execute_operator("CsRank", _backend_inputs(backend)[0], backend=backend),
-    }
-    for op_name, runner in operators.items():
-        operator_bench["numpy"][op_name] = _time_callable(lambda r=runner: r("numpy"))
-        operator_bench["c"][op_name] = None
-        if torch_available():
-            operator_bench["gpu"][op_name] = _time_callable(lambda r=runner: r("gpu"))
-        else:
-            operator_bench["gpu"][op_name] = None
-
-    factor_bench: dict[str, dict[str, float | None]] = {"numpy": {}, "c": {}, "gpu": {}}
-    factor_specs = {
-        "momentum_volume": lambda backend: execute_operator(
-            "CsRank",
-            execute_operator(
-                "Mul",
-                execute_operator("Return", _backend_inputs(backend)[0], params={"window": 5}, backend=backend),
-                execute_operator(
-                    "Div",
-                    _backend_inputs(backend)[1],
-                    execute_operator("Mean", _backend_inputs(backend)[1], params={"window": 20}, backend=backend),
-                    backend=backend,
-                ),
-                backend=backend,
-            ),
-            backend=backend,
-        ),
-        "vwap_gap": lambda backend: execute_operator(
-            "Neg",
-            execute_operator(
-                "CsRank",
-                execute_operator(
-                    "Div",
-                    execute_operator("Sub", *_backend_inputs(backend), backend=backend),
-                    execute_operator(
-                        "Add",
-                        _backend_inputs(backend)[1],
-                        to_tensor(np.full_like(other, 1e-8)) if backend == "gpu" else np.full_like(other, 1e-8),
-                        backend=backend,
-                    ),
-                    backend=backend,
-                ),
-                backend=backend,
-            ),
-            backend=backend,
-        ),
-    }
-    for formula_name, runner in factor_specs.items():
-        factor_bench["numpy"][formula_name] = _time_callable(lambda r=runner: r("numpy"))
-        factor_bench["c"][formula_name] = None
-        if torch_available():
-            factor_bench["gpu"][formula_name] = _time_callable(lambda r=runner: r("gpu"))
-        else:
-            factor_bench["gpu"][formula_name] = None
-
-    bench_dir = _ensure_dir(output_dir / "benchmark" / "efficiency")
-    plot_efficiency_benchmark(
-        {backend: {k: v for k, v in values.items() if v is not None} for backend, values in operator_bench.items()},
-        save_path=str(bench_dir / "operator_efficiency.png"),
-    )
-    plot_efficiency_benchmark(
-        {backend: {k: v for k, v in values.items() if v is not None} for backend, values in factor_bench.items()},
-        save_path=str(bench_dir / "factor_efficiency.png"),
-    )
-    result = {
-        "panel_shape": {"periods": periods, "assets": assets},
-        "operator_level_ms": operator_bench,
-        "factor_level_ms": factor_bench,
-        "available_backends": {
-            "numpy": True,
-            "c": False,
-            "gpu": torch_available(),
-        },
-    }
-    _write_json(bench_dir / "efficiency.json", result)
-    return result
-
-
-def run_benchmark_suite(
-    cfg,
-    output_dir: Path,
-    *,
-    data_path: Optional[str] = None,
-    raw_df: Optional[pd.DataFrame] = None,
-    mock: bool = False,
-    factor_miner_library_path: Optional[str] = None,
-    factor_miner_no_memory_library_path: Optional[str] = None,
-    runtime_manifests: Optional[dict[str, dict[str, Any]]] = None,
-) -> dict:
-    """Run the benchmark suite and return the artifact index."""
-    if runtime_manifests is None:
-        runtime_manifests = getattr(cfg.benchmark, "runtime_manifests", None)
-    use_runtime_loops = bool(
-        runtime_manifests or getattr(cfg.benchmark, "runtime_loops", False)
-    )
-    results = {
-        "table1": run_table1_benchmark(
-            cfg,
-            output_dir,
-            data_path=data_path,
-            raw_df=raw_df,
-            mock=mock,
-            factor_miner_library_path=factor_miner_library_path,
-            factor_miner_no_memory_library_path=factor_miner_no_memory_library_path,
-            runtime_manifests=runtime_manifests,
-            use_runtime_loops=use_runtime_loops,
-        ),
-        "ablation_memory": run_ablation_memory_benchmark(
-            cfg,
-            output_dir,
-            data_path=data_path,
-            raw_df=raw_df,
-            mock=mock,
-            factor_miner_library_path=factor_miner_library_path,
-            factor_miner_no_memory_library_path=factor_miner_no_memory_library_path,
-            runtime_manifests=runtime_manifests,
-        ),
-        "cost_pressure": run_cost_pressure_benchmark(
-            cfg,
-            output_dir,
-            data_path=data_path,
-            raw_df=raw_df,
-            mock=mock,
-            factor_miner_library_path=factor_miner_library_path,
-            runtime_manifests=runtime_manifests,
-        ),
-        "efficiency": run_efficiency_benchmark(cfg, output_dir),
-    }
-    _write_json(_ensure_dir(output_dir / "benchmark") / "suite.json", results)
-    return results
-
-
-def run_runtime_mining_benchmark(
-    cfg,
-    output_dir: Path,
-    *,
-    data_path: Optional[str] = None,
-    raw_df: Optional[pd.DataFrame] = None,
-    mock: bool = False,
-    factor_miner_library_path: Optional[str] = None,
-    factor_miner_no_memory_library_path: Optional[str] = None,
-    runtime_manifests: Optional[dict[str, dict[str, Any]]] = None,
-) -> dict:
-    """Run the benchmark suite with explicit real-loop manifests when provided."""
-    return run_benchmark_suite(
-        cfg,
-        output_dir,
-        data_path=data_path,
-        raw_df=raw_df,
-        mock=mock,
-        factor_miner_library_path=factor_miner_library_path,
-        factor_miner_no_memory_library_path=factor_miner_no_memory_library_path,
-        runtime_manifests=runtime_manifests,
-    )
diff --git a/src/factorminer/factorminer/cli.py b/src/factorminer/factorminer/cli.py
deleted file mode 100644
index cbade96..0000000
--- a/src/factorminer/factorminer/cli.py
+++ /dev/null
@@ -1,1566 +0,0 @@
-"""Click-based CLI for FactorMiner."""
-
-from __future__ import annotations
-
-import logging
-import sys
-from dataclasses import fields
-import json
-from pathlib import Path
-
-import click
-import numpy as np
-
-from src.factorminer.factorminer.utils.config import load_config
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-def _setup_logging(verbose: bool) -> None:
-    """Configure root logger for CLI output."""
-    level = logging.DEBUG if verbose else logging.INFO
-    logging.basicConfig(
-        level=level,
-        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-
-
-def _load_data(cfg, data_path: str | None, mock: bool):
-    """Load market data from file or generate mock data.
-
-    Returns
-    -------
-    pd.DataFrame
-        Market data with columns: datetime, asset_id, open, high, low,
-        close, volume, amount.
-    """
-    raw_cfg = getattr(cfg, "_raw", {})
-    configured_path = raw_cfg.get("data_path")
-
-    if mock:
-        click.echo("Generating mock market data...")
-        from factorminer.data.mock_data import MockConfig, generate_mock_data
-
-        mock_cfg = MockConfig(
-            num_assets=50,
-            num_periods=500,
-            frequency="1d",
-            plant_alpha=True,
-        )
-        return generate_mock_data(mock_cfg)
-
-    # Try data_path argument, then config top-level data_path
-    path = data_path
-    if path is None:
-        path = configured_path
-
-    if path is None:
-        click.echo("No data path specified. Use --data or --mock flag.")
-        raise click.Abort()
-
-    click.echo(f"Loading market data from: {path}")
-    from factorminer.data.loader import load_market_data
-
-    return load_market_data(path)
-
-
-def _prepare_data_arrays(df):
-    """Convert a market DataFrame to numpy arrays for the mining loop.
-
-    Returns
-    -------
-    data_tensor : np.ndarray, shape (M, T, F)
-        Market data tensor.
-    returns : np.ndarray, shape (M, T)
-        Forward returns.
-    """
-    asset_ids = sorted(df["asset_id"].unique())
-    dates = sorted(df["datetime"].unique())
-    M = len(asset_ids)
-    T = len(dates)
-
-    feature_cols = [
-        "open",
-        "high",
-        "low",
-        "close",
-        "volume",
-        "amount",
-        "vwap",
-        "returns",
-    ]
-    F = len(feature_cols)
-
-    data_tensor = np.full((M, T, F), np.nan, dtype=np.float64)
-    returns = np.full((M, T), np.nan, dtype=np.float64)
-
-    asset_to_idx = {a: i for i, a in enumerate(asset_ids)}
-    date_to_idx = {d: i for i, d in enumerate(dates)}
-
-    for _, row in df.iterrows():
-        ai = asset_to_idx[row["asset_id"]]
-        ti = date_to_idx[row["datetime"]]
-        for fi, col in enumerate(feature_cols[:6]):
-            data_tensor[ai, ti, fi] = row[col]
-
-        if "vwap" in row.index and not np.isnan(row["vwap"]):
-            data_tensor[ai, ti, 6] = row["vwap"]
-        elif (
-            not np.isnan(row["volume"])
-            and abs(row["volume"]) > 1e-12
-            and not np.isnan(row["amount"])
-        ):
-            data_tensor[ai, ti, 6] = row["amount"] / row["volume"]
-
-        if "returns" in row.index and not np.isnan(row["returns"]):
-            data_tensor[ai, ti, 7] = row["returns"]
-
-    close_idx = feature_cols.index("close")
-    amount_idx = feature_cols.index("amount")
-    vwap_idx = feature_cols.index("vwap")
-    feature_returns_idx = feature_cols.index("returns")
-
-    # Fill derived VWAP where the source file did not provide it.
-    volume = data_tensor[:, :, feature_cols.index("volume")]
-    amount = data_tensor[:, :, amount_idx]
-    derived_vwap = np.divide(
-        amount,
-        volume,
-        out=np.full_like(amount, np.nan),
-        where=np.abs(volume) > 1e-12,
-    )
-    missing_vwap = np.isnan(data_tensor[:, :, vwap_idx])
-    data_tensor[:, :, vwap_idx] = np.where(
-        missing_vwap,
-        np.where(np.isnan(derived_vwap), data_tensor[:, :, close_idx], derived_vwap),
-        data_tensor[:, :, vwap_idx],
-    )
-
-    # Compute bar returns feature from close prices where missing.
-    for i in range(M):
-        close = data_tensor[i, :, close_idx]
-        asset_returns = np.full(T, np.nan, dtype=np.float64)
-        asset_returns[1:] = (close[1:] - close[:-1]) / np.where(
-            close[:-1] == 0, np.nan, close[:-1]
-        )
-        missing_feature_returns = np.isnan(data_tensor[i, :, feature_returns_idx])
-        data_tensor[i, :, feature_returns_idx] = np.where(
-            missing_feature_returns,
-            asset_returns,
-            data_tensor[i, :, feature_returns_idx],
-        )
-
-        # Simple 1-period forward return target.
-        returns[i, :-1] = (close[1:] - close[:-1]) / np.where(
-            close[:-1] == 0, np.nan, close[:-1]
-        )
-
-    return data_tensor, returns
-
-
-def _create_llm_provider(cfg, mock: bool):
-    """Create an LLM provider from config or use mock."""
-    from factorminer.agent.llm_interface import MockProvider, create_provider
-
-    if mock:
-        click.echo("Using mock LLM provider (no API calls).")
-        return MockProvider()
-
-    llm_config = {
-        "provider": cfg.llm.provider,
-        "model": cfg.llm.model,
-    }
-    # Use api_key from config if set
-    if hasattr(cfg, "_raw") and cfg._raw.get("llm", {}).get("api_key"):
-        llm_config["api_key"] = cfg._raw["llm"]["api_key"]
-
-    click.echo(f"Using LLM provider: {cfg.llm.provider}/{cfg.llm.model}")
-    return create_provider(llm_config)
-
-
-def _build_core_mining_config(cfg, output_dir: Path, mock: bool = False):
-    """Create the flat mining config expected by RalphLoop/HelixLoop."""
-    from factorminer.core.config import MiningConfig as CoreMiningConfig
-
-    signal_failure_policy = (
-        "synthetic" if mock else cfg.evaluation.signal_failure_policy
-    )
-
-    mining_cfg = CoreMiningConfig(
-        target_library_size=cfg.mining.target_library_size,
-        batch_size=cfg.mining.batch_size,
-        max_iterations=cfg.mining.max_iterations,
-        ic_threshold=cfg.mining.ic_threshold,
-        icir_threshold=cfg.mining.icir_threshold,
-        correlation_threshold=cfg.mining.correlation_threshold,
-        replacement_ic_min=cfg.mining.replacement_ic_min,
-        replacement_ic_ratio=cfg.mining.replacement_ic_ratio,
-        fast_screen_assets=cfg.evaluation.fast_screen_assets,
-        num_workers=cfg.evaluation.num_workers,
-        output_dir=str(output_dir),
-        backend=cfg.evaluation.backend,
-        gpu_device=cfg.evaluation.gpu_device,
-        signal_failure_policy=signal_failure_policy,
-    )
-    mining_cfg.research = getattr(cfg, "research", None)
-    benchmark_cfg = getattr(cfg, "benchmark", None)
-    mining_cfg.benchmark_mode = getattr(benchmark_cfg, "mode", "paper")
-    mining_cfg.target_panels = None
-    mining_cfg.target_horizons = None
-    return mining_cfg
-
-
-def _attach_runtime_targets(mining_config, dataset) -> None:
-    """Attach multi-horizon runtime metadata for research-mode mining."""
-    mining_config.target_panels = dataset.target_panels
-    mining_config.target_horizons = {
-        name: max(getattr(spec, "holding_bars", 1), 1)
-        for name, spec in dataset.target_specs.items()
-    }
-
-
-def _save_result_library(library, output_dir: Path) -> Path:
-    """Persist a factor library to the standard output location."""
-    from factorminer.core.library_io import save_library
-
-    output_dir.mkdir(parents=True, exist_ok=True)
-    lib_path = output_dir / "factor_library"
-    save_library(library, lib_path)
-    return lib_path.with_suffix(".json")
-
-
-def _filter_dataclass_kwargs(source, target_cls):
-    """Copy shared dataclass fields from one config object to another."""
-    target_fields = {f.name for f in fields(target_cls)}
-    source_fields = getattr(source, "__dataclass_fields__", {})
-    return {
-        name: getattr(source, name)
-        for name in source_fields
-        if name in target_fields
-    }
-
-
-def _build_debate_config(cfg):
-    """Build the runtime debate config from YAML config settings."""
-    if not cfg.phase2.debate.enabled:
-        return None
-
-    from factorminer.agent.debate import DebateConfig as RuntimeDebateConfig
-    from factorminer.agent.specialists import DEFAULT_SPECIALISTS
-
-    available = len(DEFAULT_SPECIALISTS)
-    requested = cfg.phase2.debate.num_specialists
-    selected = list(DEFAULT_SPECIALISTS[:requested])
-    if requested > available:
-        logger.warning(
-            "Requested %d specialists but only %d are available; using all defaults.",
-            requested,
-            available,
-        )
-
-    return RuntimeDebateConfig(
-        specialists=selected,
-        enable_critic=cfg.phase2.debate.enable_critic,
-        candidates_per_specialist=cfg.phase2.debate.candidates_per_specialist,
-        top_k_after_critic=cfg.phase2.debate.top_k_after_critic,
-        critic_temperature=cfg.phase2.debate.critic_temperature,
-    )
-
-
-def _build_phase2_runtime_configs(cfg):
-    """Instantiate evaluation/runtime configs for the Helix loop."""
-    from factorminer.evaluation.causal import CausalConfig as RuntimeCausalConfig
-    from factorminer.evaluation.capacity import CapacityConfig as RuntimeCapacityConfig
-    from factorminer.evaluation.regime import RegimeConfig as RuntimeRegimeConfig
-    from factorminer.evaluation.significance import (
-        SignificanceConfig as RuntimeSignificanceConfig,
-    )
-
-    causal_config = None
-    if cfg.phase2.causal.enabled:
-        causal_config = RuntimeCausalConfig(
-            **_filter_dataclass_kwargs(cfg.phase2.causal, RuntimeCausalConfig)
-        )
-
-    regime_config = None
-    if cfg.phase2.regime.enabled:
-        regime_config = RuntimeRegimeConfig(
-            **_filter_dataclass_kwargs(cfg.phase2.regime, RuntimeRegimeConfig)
-        )
-
-    capacity_config = None
-    if cfg.phase2.capacity.enabled:
-        capacity_config = RuntimeCapacityConfig(
-            **_filter_dataclass_kwargs(cfg.phase2.capacity, RuntimeCapacityConfig)
-        )
-
-    significance_config = None
-    if cfg.phase2.significance.enabled:
-        significance_config = RuntimeSignificanceConfig(
-            **_filter_dataclass_kwargs(cfg.phase2.significance, RuntimeSignificanceConfig)
-        )
-
-    return {
-        "debate_config": _build_debate_config(cfg),
-        "causal_config": causal_config,
-        "regime_config": regime_config,
-        "capacity_config": capacity_config,
-        "significance_config": significance_config,
-    }
-
-
-def _extract_capacity_volume(data_tensor: np.ndarray) -> np.ndarray | None:
-    """Prefer dollar volume (`amount`) and fall back to raw volume if needed."""
-    if data_tensor.ndim != 3 or data_tensor.shape[2] == 0:
-        return None
-
-    amount_idx = 5
-    volume_idx = 4
-
-    if data_tensor.shape[2] > amount_idx:
-        amount = data_tensor[:, :, amount_idx]
-        if not np.all(np.isnan(amount)):
-            return amount
-
-    if data_tensor.shape[2] > volume_idx:
-        volume = data_tensor[:, :, volume_idx]
-        if not np.all(np.isnan(volume)):
-            return volume
-
-    return None
-
-
-def _active_phase2_features(cfg) -> list[str]:
-    """Describe the effective Helix feature set for CLI output."""
-    features: list[str] = []
-
-    if cfg.phase2.causal.enabled:
-        features.append("causal")
-    if cfg.phase2.regime.enabled:
-        features.append("regime")
-    if cfg.phase2.capacity.enabled:
-        features.append("capacity")
-    if cfg.phase2.significance.enabled:
-        features.append("significance")
-    if cfg.phase2.debate.enabled:
-        features.append("debate")
-    if cfg.phase2.auto_inventor.enabled:
-        features.append("auto-inventor")
-    if cfg.phase2.helix.enabled and cfg.phase2.helix.enable_canonicalization:
-        features.append("canonicalization")
-    if cfg.phase2.helix.enabled and cfg.phase2.helix.enable_knowledge_graph:
-        features.append("knowledge-graph")
-    if cfg.phase2.helix.enabled and cfg.phase2.helix.enable_embeddings:
-        features.append("embeddings")
-
-    return features
-
-
-def _load_runtime_dataset_for_analysis(cfg, data_path: str | None, mock: bool):
-    """Load, preprocess, split, and tensorize data for analysis commands."""
-    from factorminer.evaluation.runtime import load_runtime_dataset
-
-    raw_df = _load_data(cfg, data_path, mock)
-    return load_runtime_dataset(raw_df, cfg)
-
-
-def _recompute_analysis_artifacts(library, dataset, signal_failure_policy: str):
-    """Recompute library factors on the canonical analysis dataset."""
-    from factorminer.evaluation.runtime import evaluate_factors
-
-    return evaluate_factors(
-        library.list_factors(),
-        dataset,
-        signal_failure_policy=signal_failure_policy,
-    )
-
-
-def _report_artifact_failures(artifacts, header: str) -> list[str]:
-    """Print a concise recomputation failure summary and return failure texts."""
-    from factorminer.evaluation.runtime import summarize_failures
-
-    failures = summarize_failures(artifacts)
-    if not failures:
-        return []
-
-    click.echo(f"{header}: {len(failures)} factor(s) failed to recompute.")
-    for failure in failures[:10]:
-        click.echo(f"  - {failure}")
-    if len(failures) > 10:
-        click.echo(f"  ... and {len(failures) - 10} more")
-
-    return failures
-
-
-def _artifact_map_by_id(artifacts):
-    return {artifact.factor_id: artifact for artifact in artifacts}
-
-
-def _select_artifacts_for_ids(artifacts, factor_ids: tuple[int, ...]):
-    if not factor_ids:
-        return [artifact for artifact in artifacts if artifact.succeeded]
-
-    artifact_map = _artifact_map_by_id(artifacts)
-    selected = []
-    failed = []
-    missing = []
-    for factor_id in factor_ids:
-        artifact = artifact_map.get(factor_id)
-        if artifact is None:
-            missing.append(str(factor_id))
-        elif not artifact.succeeded:
-            failed.append(artifact)
-        else:
-            selected.append(artifact)
-
-    if missing:
-        click.echo(f"Missing recomputed factors for ids: {', '.join(missing)}")
-        raise click.Abort()
-    if failed:
-        click.echo("Requested factors failed to recompute:")
-        for artifact in failed:
-            click.echo(f"  - {artifact.factor_id}: {artifact.name} ({artifact.error})")
-        raise click.Abort()
-
-    return selected
-
-
-def _analysis_output_path(output_dir: Path, stem: str, split_name: str, fmt: str) -> str:
-    return str(output_dir / f"{stem}_{split_name}.{fmt}")
-
-
-def _print_benchmark_summary(title: str, payload: dict) -> None:
-    """Emit a concise benchmark summary for CLI runs."""
-    click.echo("=" * 60)
-    click.echo(title)
-    click.echo("=" * 60)
-    if not payload:
-        click.echo("No benchmark results produced.")
-        return
-
-    if all(isinstance(value, dict) and "universes" in value for value in payload.values()):
-        for baseline, result in payload.items():
-            click.echo(f"Baseline: {baseline}")
-            click.echo(
-                f"  Freeze library: {result.get('freeze_library_size', 0)} "
-                f"| Frozen Top-K: {len(result.get('frozen_top_k', []))}"
-            )
-            for universe, metrics in result.get("universes", {}).items():
-                library = metrics.get("library", {})
-                click.echo(
-                    f"  {universe}: library IC={library.get('ic', 0.0):.4f}, "
-                    f"ICIR={library.get('icir', 0.0):.4f}, "
-                    f"Avg|rho|={library.get('avg_abs_rho', 0.0):.4f}"
-                )
-    else:
-        click.echo(json.dumps(payload, indent=2))
-
-
-def _print_recomputed_factor_table(artifacts, split_name: str) -> None:
-    click.echo(
-        f"{'ID':>4s}  {'Name':<35s}  {'IC Mean':>8s}  {'|IC|':>8s}  "
-        f"{'ICIR':>7s}  {'Win%':>6s}  {'Turn':>6s}"
-    )
-    click.echo("-" * 90)
-
-    for artifact in artifacts:
-        stats = artifact.split_stats[split_name]
-        click.echo(
-            f"{artifact.factor_id:4d}  {artifact.name:<35s}  "
-            f"{stats['ic_mean']:8.4f}  {stats['ic_abs_mean']:8.4f}  "
-            f"{stats['icir']:7.3f}  {stats['ic_win_rate'] * 100:5.1f}%  "
-            f"{stats['turnover']:6.3f}"
-        )
-
-
-def _print_split_summary(artifacts, split_name: str) -> None:
-    if not artifacts:
-        click.echo("  No successful factor recomputations.")
-        return
-
-    ic_values = [artifact.split_stats[split_name]["ic_mean"] for artifact in artifacts]
-    abs_ic_values = [artifact.split_stats[split_name]["ic_abs_mean"] for artifact in artifacts]
-    icir_values = [artifact.split_stats[split_name]["icir"] for artifact in artifacts]
-    click.echo("-" * 90)
-    click.echo(f"  Total factors:    {len(artifacts)}")
-    click.echo(f"  Mean IC:          {np.mean(ic_values):.4f}")
-    click.echo(f"  Mean |IC|:        {np.mean(abs_ic_values):.4f}")
-    click.echo(f"  Mean ICIR:        {np.mean(icir_values):.3f}")
-    click.echo(f"  Max |IC|:         {max(abs_ic_values):.4f}")
-    click.echo(f"  Min |IC|:         {min(abs_ic_values):.4f}")
-
-
-def _load_library_from_path(library_path: str):
-    """Load a factor library, handling both .json extension and base path.
-
-    Returns
-    -------
-    FactorLibrary
-    """
-    from factorminer.core.library_io import load_library
-
-    path = Path(library_path)
-    # load_library expects the base path (without .json extension)
-    # but also works with .json since it calls path.with_suffix(".json")
-    if path.suffix == ".json":
-        base_path = path.with_suffix("")
-    else:
-        base_path = path
-
-    try:
-        library = load_library(base_path)
-        click.echo(f"Loaded factor library: {library.size} factors")
-        return library
-    except FileNotFoundError:
-        click.echo(f"Error: Factor library not found at {library_path}")
-        click.echo("  Tried: {}.json".format(base_path))
-        raise click.Abort()
-    except Exception as e:
-        click.echo(f"Error loading library: {e}")
-        raise click.Abort()
-
-
-# ---------------------------------------------------------------------------
-# Global options
-# ---------------------------------------------------------------------------
-
-@click.group()
-@click.option(
-    "--config", "-c",
-    type=click.Path(exists=True, dir_okay=False),
-    default=None,
-    help="Path to a YAML config file (merges with defaults).",
-)
-@click.option("--gpu/--cpu", default=True, help="Enable or disable GPU evaluation backend.")
-@click.option("--verbose", "-v", is_flag=True, help="Enable debug-level logging.")
-@click.option(
-    "--output-dir", "-o",
-    type=click.Path(file_okay=False),
-    default="output",
-    help="Directory for all output artifacts.",
-)
-@click.version_option(package_name="factorminer")
-@click.pass_context
-def main(ctx: click.Context, config: str | None, gpu: bool, verbose: bool, output_dir: str) -> None:
-    """FactorMiner -- LLM-powered quantitative factor mining."""
-    _setup_logging(verbose)
-
-    overrides: dict = {}
-    if not gpu:
-        overrides.setdefault("evaluation", {})["backend"] = "numpy"
-
-    try:
-        cfg = load_config(config_path=config, overrides=overrides if overrides else None)
-    except Exception as e:
-        click.echo(f"Error loading config: {e}")
-        raise click.Abort()
-
-    # Stash the raw YAML data for access to top-level fields like data_path
-    try:
-        import yaml
-        from factorminer.configs import DEFAULT_CONFIG_PATH
-        raw = {}
-        if DEFAULT_CONFIG_PATH.exists():
-            with open(DEFAULT_CONFIG_PATH) as f:
-                raw = yaml.safe_load(f) or {}
-        if config:
-            with open(config) as f:
-                user_raw = yaml.safe_load(f) or {}
-            raw.update(user_raw)
-        cfg._raw = raw
-    except Exception:
-        cfg._raw = {}
-
-    if output_dir == "output":
-        output_dir = cfg._raw.get("output_dir", output_dir)
-
-    ctx.ensure_object(dict)
-    ctx.obj["config"] = cfg
-    ctx.obj["verbose"] = verbose
-    ctx.obj["output_dir"] = Path(output_dir)
-
-
-# ---------------------------------------------------------------------------
-# mine
-# ---------------------------------------------------------------------------
-
-@main.command()
-@click.option("--iterations", "-n", type=int, default=None, help="Override max_iterations.")
-@click.option("--batch-size", "-b", type=int, default=None, help="Override batch_size.")
-@click.option("--target", "-t", type=int, default=None, help="Override target_library_size.")
-@click.option("--resume", type=click.Path(exists=True), default=None, help="Resume from a saved library.")
-@click.option("--mock", is_flag=True, help="Use mock data and mock LLM provider (for testing).")
-@click.option("--data", "data_path", type=click.Path(exists=True), default=None, help="Path to market data file.")
-@click.pass_context
-def mine(
-    ctx: click.Context,
-    iterations: int | None,
-    batch_size: int | None,
-    target: int | None,
-    resume: str | None,
-    mock: bool,
-    data_path: str | None,
-) -> None:
-    """Run a factor mining session."""
-    cfg = ctx.obj["config"]
-    output_dir = ctx.obj["output_dir"]
-
-    if iterations is not None:
-        cfg.mining.max_iterations = iterations
-    if batch_size is not None:
-        cfg.mining.batch_size = batch_size
-    if target is not None:
-        cfg.mining.target_library_size = target
-
-    try:
-        cfg.validate()
-    except ValueError as e:
-        click.echo(f"Configuration error: {e}")
-        raise click.Abort()
-
-    click.echo("=" * 60)
-    click.echo("FactorMiner -- Mining Session")
-    click.echo("=" * 60)
-    click.echo(f"  Target library size: {cfg.mining.target_library_size}")
-    click.echo(f"  Batch size:          {cfg.mining.batch_size}")
-    click.echo(f"  Max iterations:      {cfg.mining.max_iterations}")
-    click.echo(f"  IC threshold:        {cfg.mining.ic_threshold}")
-    click.echo(f"  Correlation limit:   {cfg.mining.correlation_threshold}")
-    click.echo(f"  Output directory:    {output_dir}")
-    click.echo("-" * 60)
-
-    # Load data
-    try:
-        dataset = _load_runtime_dataset_for_analysis(cfg, data_path, mock)
-    except Exception as e:
-        click.echo(f"Error loading data: {e}")
-        raise click.Abort()
-
-    click.echo(
-        f"  Data loaded: {len(dataset.asset_ids)} assets x "
-        f"{len(dataset.timestamps)} periods"
-    )
-    click.echo("  Preparing data tensors...")
-    data_tensor = dataset.data_tensor
-    returns = dataset.returns
-
-    # Create LLM provider
-    llm_provider = _create_llm_provider(cfg, mock)
-
-    # Load existing library for resume
-    library = None
-    if resume:
-        click.echo(f"  Resuming from: {resume}")
-        library = _load_library_from_path(resume)
-
-    # Create and configure MiningConfig for the RalphLoop
-    mining_config = _build_core_mining_config(cfg, output_dir, mock=mock)
-    _attach_runtime_targets(mining_config, dataset)
-
-    # Create and run the Ralph Loop
-    from factorminer.core.ralph_loop import RalphLoop
-
-    click.echo("-" * 60)
-    click.echo("Starting Ralph Loop...")
-
-    def _progress_callback(iteration: int, stats: dict) -> None:
-        """Print progress after each iteration."""
-        lib_size = stats.get("library_size", 0)
-        admitted = stats.get("admitted", 0)
-        yield_rate = stats.get("yield_rate", 0) * 100
-        click.echo(
-            f"  Iteration {iteration:3d}: "
-            f"Library={lib_size}, "
-            f"Admitted={admitted}, "
-            f"Yield={yield_rate:.1f}%"
-        )
-
-    try:
-        loop = RalphLoop(
-            config=mining_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=llm_provider,
-            library=library,
-        )
-        result_library = loop.run(callback=_progress_callback)
-    except KeyboardInterrupt:
-        click.echo("\nMining interrupted by user.")
-        return
-    except Exception as e:
-        click.echo(f"Mining error: {e}")
-        logger.exception("Mining failed")
-        raise click.Abort()
-
-    # Save results
-    lib_path = _save_result_library(result_library, output_dir)
-
-    click.echo("=" * 60)
-    click.echo(f"Mining complete! Library size: {result_library.size}")
-    click.echo(f"Library saved to: {lib_path}")
-    click.echo("=" * 60)
-
-
-# ---------------------------------------------------------------------------
-# evaluate
-# ---------------------------------------------------------------------------
-
-@main.command()
-@click.argument("library_path", type=click.Path(exists=True))
-@click.option("--data", "data_path", type=click.Path(exists=True), default=None, help="Path to market data file.")
-@click.option("--mock", is_flag=True, help="Use mock data for evaluation.")
-@click.option("--period", type=click.Choice(["train", "test", "both"]), default="test", help="Evaluation period.")
-@click.option("--top-k", type=int, default=None, help="Evaluate only the top-K factors by IC.")
-@click.pass_context
-def evaluate(
-    ctx: click.Context,
-    library_path: str,
-    data_path: str | None,
-    mock: bool,
-    period: str,
-    top_k: int | None,
-) -> None:
-    """Evaluate a factor library on historical data."""
-    cfg = ctx.obj["config"]
-    signal_failure_policy = cfg.evaluation.signal_failure_policy
-
-    click.echo("=" * 60)
-    click.echo("FactorMiner -- Factor Evaluation")
-    click.echo("=" * 60)
-
-    # Load library
-    library = _load_library_from_path(library_path)
-
-    try:
-        dataset = _load_runtime_dataset_for_analysis(cfg, data_path, mock)
-    except Exception as e:
-        click.echo(f"Error loading data: {e}")
-        raise click.Abort()
-
-    click.echo(f"  Period: {period} | Backend: {cfg.evaluation.backend}")
-    click.echo(
-        f"  Data: {len(dataset.asset_ids)} assets x {len(dataset.timestamps)} periods"
-    )
-
-    artifacts = _recompute_analysis_artifacts(library, dataset, signal_failure_policy)
-    failures = _report_artifact_failures(artifacts, header="Evaluation warnings")
-
-    from factorminer.evaluation.runtime import analysis_split_names, select_top_k
-
-    split_names = analysis_split_names(period)
-    selection_split = "train" if period == "both" else split_names[0]
-    selected = select_top_k(artifacts, selection_split, top_k)
-    if not selected:
-        click.echo("No factors successfully recomputed for evaluation.")
-        if signal_failure_policy == "reject" and failures:
-            raise click.Abort()
-        raise click.Abort()
-
-    if top_k is not None and top_k < len([a for a in artifacts if a.succeeded]):
-        if period == "both":
-            click.echo(f"  Evaluating top {top_k} factors by train |IC| for train/test comparison")
-        else:
-            click.echo(f"  Evaluating top {top_k} factors by {selection_split} |IC|")
-
-    for split_name in split_names:
-        click.echo("-" * 60)
-        click.echo(f"Split: {split_name}")
-        _print_recomputed_factor_table(selected, split_name)
-        _print_split_summary(selected, split_name)
-
-    if period == "both" and selected:
-        click.echo("-" * 60)
-        click.echo("Decay summary (train -> test)")
-        click.echo(f"{'ID':>4s}  {'Name':<35s}  {'Train |IC|':>10s}  {'Test |IC|':>9s}  {'Delta':>8s}")
-        click.echo("-" * 80)
-        for artifact in selected:
-            train_ic = artifact.split_stats["train"]["ic_abs_mean"]
-            test_ic = artifact.split_stats["test"]["ic_abs_mean"]
-            click.echo(
-                f"{artifact.factor_id:4d}  {artifact.name:<35s}  "
-                f"{train_ic:10.4f}  {test_ic:9.4f}  {test_ic - train_ic:8.4f}"
-            )
-
-    click.echo("=" * 60)
-
-
-# ---------------------------------------------------------------------------
-# combine
-# ---------------------------------------------------------------------------
-
-@main.command()
-@click.argument("library_path", type=click.Path(exists=True))
-@click.option("--data", "data_path", type=click.Path(exists=True), default=None, help="Path to market data file.")
-@click.option("--mock", is_flag=True, help="Use mock data for combination.")
-@click.option(
-    "--fit-period",
-    type=click.Choice(["train", "test", "both"]),
-    default="train",
-    help="Split used for top-k selection and model/weight fitting.",
-)
-@click.option(
-    "--eval-period",
-    type=click.Choice(["train", "test", "both"]),
-    default="test",
-    help="Split used to evaluate the combined signal.",
-)
-@click.option(
-    "--method", "-m",
-    type=click.Choice(["equal-weight", "ic-weighted", "orthogonal", "all"]),
-    default="all",
-    help="Factor combination method.",
-)
-@click.option(
-    "--selection", "-s",
-    type=click.Choice(["lasso", "stepwise", "xgboost", "none"]),
-    default="none",
-    help="Factor selection method to run before combination.",
-)
-@click.option("--top-k", type=int, default=None, help="Select top-K factors before combining.")
-@click.pass_context
-def combine(
-    ctx: click.Context,
-    library_path: str,
-    data_path: str | None,
-    mock: bool,
-    fit_period: str,
-    eval_period: str,
-    method: str,
-    selection: str,
-    top_k: int | None,
-) -> None:
-    """Run factor combination and selection methods."""
-    cfg = ctx.obj["config"]
-    output_dir = ctx.obj["output_dir"]
-
-    click.echo("=" * 60)
-    click.echo("FactorMiner -- Factor Combination")
-    click.echo("=" * 60)
-
-    # Load library
-    library = _load_library_from_path(library_path)
-
-    from factorminer.evaluation.runtime import (
-        resolve_split_for_fit_eval,
-        select_top_k,
-    )
-
-    try:
-        dataset = _load_runtime_dataset_for_analysis(cfg, data_path, mock)
-    except Exception as e:
-        click.echo(f"Error loading data: {e}")
-        raise click.Abort()
-
-    artifacts = _recompute_analysis_artifacts(
-        library,
-        dataset,
-        cfg.evaluation.signal_failure_policy,
-    )
-    failures = _report_artifact_failures(artifacts, header="Combination warnings")
-
-    fit_split = resolve_split_for_fit_eval(fit_period)
-    eval_split = resolve_split_for_fit_eval(eval_period)
-
-    selected_artifacts = select_top_k(artifacts, fit_split, top_k)
-    if not selected_artifacts:
-        click.echo("No factors successfully recomputed for combination.")
-        if cfg.evaluation.signal_failure_policy == "reject" and failures:
-            raise click.Abort()
-        raise click.Abort()
-
-    if top_k is not None and top_k < len([a for a in artifacts if a.succeeded]):
-        click.echo(f"  Pre-selected top {len(selected_artifacts)} factors by {fit_split} |IC|")
-
-    click.echo(f"  Fit split:  {fit_split}")
-    click.echo(f"  Eval split: {eval_split}")
-    click.echo(f"  Combining {len(selected_artifacts)} factors")
-    click.echo("-" * 60)
-
-    # Run selection if requested
-    selected_ids = [artifact.factor_id for artifact in selected_artifacts]
-    fit_returns_tn = dataset.get_split(fit_split).returns.T
-    fit_factor_signals = {
-        artifact.factor_id: artifact.split_signals[fit_split].T
-        for artifact in selected_artifacts
-    }
-
-    if selection != "none":
-        click.echo(f"\n  Running {selection} selection...")
-        from factorminer.evaluation.selection import FactorSelector
-
-        selector = FactorSelector()
-
-        try:
-            if selection == "lasso":
-                results = selector.lasso_selection(fit_factor_signals, fit_returns_tn)
-            elif selection == "stepwise":
-                results = selector.forward_stepwise(fit_factor_signals, fit_returns_tn)
-            elif selection == "xgboost":
-                results = selector.xgboost_selection(fit_factor_signals, fit_returns_tn)
-            else:
-                results = []
-
-            if results:
-                selected_ids = [factor_id for factor_id, _ in results]
-                click.echo(f"\n  {selection.capitalize()} selection results:")
-                click.echo(f"  {'Factor ID':>10s}  {'Score':>10s}")
-                click.echo("  " + "-" * 25)
-                for fid, score in results[:20]:  # Show top 20
-                    click.echo(f"  {fid:10d}  {score:10.4f}")
-                click.echo(f"  Total selected: {len(selected_ids)}")
-            else:
-                click.echo(f"  {selection} selection returned no factors.")
-        except ImportError as e:
-            click.echo(f"  Selection method '{selection}' requires additional packages: {e}")
-        except Exception as e:
-            click.echo(f"  Selection error: {e}")
-            logger.exception("Selection failed")
-
-    # Run combination methods
-    from factorminer.evaluation.combination import FactorCombiner
-    from factorminer.evaluation.portfolio import PortfolioBacktester
-
-    combiner = FactorCombiner()
-    backtester = PortfolioBacktester()
-    artifact_map = _artifact_map_by_id(selected_artifacts)
-    eval_factor_signals = {
-        factor_id: artifact_map[factor_id].split_signals[eval_split].T
-        for factor_id in selected_ids
-        if factor_id in artifact_map
-    }
-    ic_values = {
-        factor_id: artifact_map[factor_id].split_stats[fit_split]["ic_mean"]
-        for factor_id in eval_factor_signals
-    }
-    eval_returns_tn = dataset.get_split(eval_split).returns.T
-
-    methods_to_run = []
-    if method == "all":
-        methods_to_run = ["equal-weight", "ic-weighted", "orthogonal"]
-    else:
-        methods_to_run = [method]
-
-    for m in methods_to_run:
-        click.echo(f"\n  {m.upper()} combination:")
-        try:
-            if m == "equal-weight":
-                composite = combiner.equal_weight(eval_factor_signals)
-            elif m == "ic-weighted":
-                composite = combiner.ic_weighted(eval_factor_signals, ic_values)
-            elif m == "orthogonal":
-                composite = combiner.orthogonal(eval_factor_signals)
-            else:
-                continue
-
-            stats = backtester.quintile_backtest(composite, eval_returns_tn)
-            click.echo(f"    IC Mean:      {stats['ic_mean']:.4f}")
-            click.echo(f"    ICIR:         {stats['icir']:.4f}")
-            click.echo(f"    Long-Short:   {stats['ls_return']:.4f}")
-            click.echo(f"    Monotonicity: {stats['monotonicity']:.2f}")
-            click.echo(f"    Avg Turnover: {stats['avg_turnover']:.4f}")
-        except Exception as e:
-            click.echo(f"    Error: {e}")
-            logger.exception("Combination method %s failed", m)
-
-    if cfg.research.enabled and cfg.benchmark.mode == "research":
-        click.echo("\n  Research model suite:")
-        try:
-            from factorminer.evaluation.research import run_research_model_suite
-
-            research_reports = run_research_model_suite(
-                eval_factor_signals,
-                eval_returns_tn,
-                cfg.research,
-            )
-            research_path = output_dir / "research_model_suite.json"
-            research_path.write_text(json.dumps(research_reports, indent=2))
-            for model_name, report in research_reports.items():
-                if not report.get("available", True):
-                    click.echo(f"    {model_name}: unavailable ({report.get('error', 'unknown error')})")
-                    continue
-                click.echo(
-                    f"    {model_name}: "
-                    f"net IR={report.get('mean_test_net_ir', 0.0):.4f}, "
-                    f"ICIR={report.get('mean_test_icir', 0.0):.4f}, "
-                    f"stability={report.get('selection_stability', 0.0):.3f}"
-                )
-            click.echo(f"    Saved: {research_path}")
-        except Exception as e:
-            click.echo(f"    Research suite error: {e}")
-            logger.exception("Research model suite failed")
-
-    click.echo("\n" + "=" * 60)
-
-
-# ---------------------------------------------------------------------------
-# visualize
-# ---------------------------------------------------------------------------
-
-@main.command()
-@click.argument("library_path", type=click.Path(exists=True))
-@click.option("--data", "data_path", type=click.Path(exists=True), default=None, help="Path to market data file.")
-@click.option("--mock", is_flag=True, help="Use mock data for visualization.")
-@click.option("--period", type=click.Choice(["train", "test", "both"]), default="test", help="Evaluation split to visualize.")
-@click.option("--factor-id", "factor_ids", type=int, multiple=True, help="Specific factor ID(s) to visualize.")
-@click.option("--top-k", type=int, default=None, help="Top-K factors by split |IC| for set-level plots.")
-@click.option("--tearsheet", is_flag=True, help="Generate a full factor tear sheet.")
-@click.option("--correlation", is_flag=True, help="Plot factor correlation heatmap.")
-@click.option("--ic-timeseries", is_flag=True, help="Plot IC time series.")
-@click.option("--quintile", is_flag=True, help="Plot quintile returns.")
-@click.option("--format", "fmt", type=click.Choice(["png", "pdf", "svg"]), default="png", help="Output format.")
-@click.pass_context
-def visualize(
-    ctx: click.Context,
-    library_path: str,
-    data_path: str | None,
-    mock: bool,
-    period: str,
-    factor_ids: tuple[int, ...],
-    top_k: int | None,
-    tearsheet: bool,
-    correlation: bool,
-    ic_timeseries: bool,
-    quintile: bool,
-    fmt: str,
-) -> None:
-    """Generate plots and tear sheets for a factor library."""
-    output_dir = ctx.obj["output_dir"]
-    cfg = ctx.obj["config"]
-
-    click.echo("=" * 60)
-    click.echo("FactorMiner -- Visualization")
-    click.echo("=" * 60)
-
-    # Load library
-    library = _load_library_from_path(library_path)
-
-    # Determine what to plot
-    plot_all = not (tearsheet or correlation or ic_timeseries or quintile)
-    if plot_all:
-        click.echo("No specific plots requested; generating all available.")
-        correlation = True
-        ic_timeseries = True
-        quintile = True
-
-    output_dir.mkdir(parents=True, exist_ok=True)
-    click.echo(f"  Output format: {fmt}")
-    click.echo(f"  Output dir:    {output_dir}")
-    click.echo(f"  Period:        {period}")
-    click.echo("-" * 60)
-
-    try:
-        dataset = _load_runtime_dataset_for_analysis(cfg, data_path, mock)
-    except Exception as e:
-        click.echo(f"Error loading data: {e}")
-        raise click.Abort()
-
-    artifacts = _recompute_analysis_artifacts(
-        library,
-        dataset,
-        cfg.evaluation.signal_failure_policy,
-    )
-    failures = _report_artifact_failures(artifacts, header="Visualization warnings")
-
-    from factorminer.evaluation.runtime import (
-        analysis_split_names,
-        compute_correlation_matrix,
-        select_top_k,
-    )
-    from factorminer.utils.tearsheet import FactorTearSheet
-    from factorminer.utils.visualization import (
-        plot_correlation_heatmap,
-        plot_ic_timeseries,
-        plot_quintile_returns,
-    )
-
-    split_names = analysis_split_names(period)
-    explicit_artifacts = _select_artifacts_for_ids(artifacts, factor_ids)
-    if not explicit_artifacts and factor_ids:
-        if cfg.evaluation.signal_failure_policy == "reject" and failures:
-            raise click.Abort()
-        raise click.Abort()
-
-    for split_name in split_names:
-        split = dataset.get_split(split_name)
-        click.echo(f"  Split: {split_name}")
-
-        if correlation:
-            if factor_ids:
-                corr_artifacts = explicit_artifacts
-            else:
-                corr_artifacts = select_top_k(artifacts, split_name, top_k)
-
-            if corr_artifacts:
-                click.echo("    Generating correlation heatmap...")
-                corr_matrix = compute_correlation_matrix(corr_artifacts, split_name)
-                save_path = _analysis_output_path(output_dir, "correlation_heatmap", split_name, fmt)
-                plot_correlation_heatmap(
-                    corr_matrix,
-                    [artifact.name[:20] for artifact in corr_artifacts],
-                    title=f"Factor Correlation Heatmap ({split_name})",
-                    save_path=save_path,
-                )
-                click.echo(f"      Saved: {save_path}")
-            else:
-                click.echo("    Skipped: no successfully recomputed factors for correlation heatmap.")
-
-        factor_artifacts = explicit_artifacts
-        if not factor_ids and (ic_timeseries or quintile or tearsheet):
-            factor_artifacts = select_top_k(artifacts, split_name, 1)
-            if factor_artifacts:
-                click.echo(
-                    f"    Defaulted to factor #{factor_artifacts[0].factor_id} "
-                    f"{factor_artifacts[0].name} for factor-specific plots."
-                )
-
-        if ic_timeseries:
-            click.echo("    Generating IC time series plot(s)...")
-            for artifact in factor_artifacts:
-                stats = artifact.split_stats[split_name]
-                dates = [str(ts)[:10] for ts in split.timestamps]
-                save_path = _analysis_output_path(
-                    output_dir,
-                    f"ic_timeseries_factor_{artifact.factor_id}",
-                    split_name,
-                    fmt,
-                )
-                plot_ic_timeseries(
-                    stats["ic_series"],
-                    dates,
-                    title=f"{artifact.name} IC Time Series ({split_name})",
-                    save_path=save_path,
-                )
-                click.echo(f"      Saved: {save_path}")
-
-        if quintile:
-            click.echo("    Generating quintile return plot(s)...")
-            for artifact in factor_artifacts:
-                stats = artifact.split_stats[split_name]
-                save_path = _analysis_output_path(
-                    output_dir,
-                    f"quintile_returns_factor_{artifact.factor_id}",
-                    split_name,
-                    fmt,
-                )
-                plot_quintile_returns(
-                    {
-                        f"Q{i}": stats[f"Q{i}"] for i in range(1, 6)
-                    }
-                    | {
-                        "long_short": stats["long_short"],
-                        "monotonicity": stats["monotonicity"],
-                    },
-                    title=f"{artifact.name} Quintile Returns ({split_name})",
-                    save_path=save_path,
-                )
-                click.echo(f"      Saved: {save_path}")
-
-        if tearsheet:
-            click.echo("    Generating tear sheet(s)...")
-            ts = FactorTearSheet()
-            dates = [str(ts_)[:10] for ts_ in split.timestamps]
-            for artifact in factor_artifacts:
-                save_path = _analysis_output_path(
-                    output_dir,
-                    f"tearsheet_factor_{artifact.factor_id}",
-                    split_name,
-                    fmt,
-                )
-                ts.generate(
-                    factor_id=artifact.factor_id,
-                    factor_name=artifact.name,
-                    formula=artifact.formula,
-                    signals=artifact.split_signals[split_name],
-                    returns=split.returns,
-                    dates=dates,
-                    save_path=save_path,
-                )
-                click.echo(f"      Saved: {save_path}")
-
-    click.echo("=" * 60)
-    click.echo("Visualization complete.")
-
-
-# ---------------------------------------------------------------------------
-# export
-# ---------------------------------------------------------------------------
-
-@main.command(name="export")
-@click.argument("library_path", type=click.Path(exists=True))
-@click.option(
-    "--format", "fmt",
-    type=click.Choice(["json", "csv", "formulas"]),
-    default="json",
-    help="Export format.",
-)
-@click.option("--output", "-o", type=click.Path(), default=None, help="Output file path.")
-@click.pass_context
-def export_cmd(ctx: click.Context, library_path: str, fmt: str, output: str | None) -> None:
-    """Export a factor library to various formats."""
-    output_dir = ctx.obj["output_dir"]
-
-    click.echo("=" * 60)
-    click.echo("FactorMiner -- Export")
-    click.echo("=" * 60)
-
-    # Load library
-    library = _load_library_from_path(library_path)
-
-    # Determine output path
-    if output is None:
-        output_dir.mkdir(parents=True, exist_ok=True)
-        if fmt == "formulas":
-            output = str(output_dir / "library_formulas.txt")
-        else:
-            output = str(output_dir / f"library.{fmt}")
-
-    click.echo(f"  Format:  {fmt}")
-    click.echo(f"  Output:  {output}")
-    click.echo("-" * 60)
-
-    try:
-        from factorminer.core.library_io import export_csv, export_formulas, save_library
-
-        if fmt == "json":
-            # save_library expects base path without extension
-            out_path = Path(output)
-            if out_path.suffix == ".json":
-                base = out_path.with_suffix("")
-            else:
-                base = out_path
-            save_library(library, base, save_signals=False)
-            click.echo(f"  Exported {library.size} factors to {base}.json")
-
-        elif fmt == "csv":
-            export_csv(library, output)
-            click.echo(f"  Exported {library.size} factors to {output}")
-
-        elif fmt == "formulas":
-            export_formulas(library, output)
-            click.echo(f"  Exported {library.size} formulas to {output}")
-
-    except Exception as e:
-        click.echo(f"Export error: {e}")
-        logger.exception("Export failed")
-        raise click.Abort()
-
-    click.echo("=" * 60)
-
-
-# ---------------------------------------------------------------------------
-# benchmark
-# ---------------------------------------------------------------------------
-
-@main.group()
-def benchmark() -> None:
-    """Run strict paper/research benchmark workflows."""
-
-
-def _benchmark_common_options(fn):
-    fn = click.option(
-        "--data",
-        "data_path",
-        type=click.Path(exists=True),
-        default=None,
-        help="Path to market data file.",
-    )(fn)
-    fn = click.option(
-        "--mock",
-        is_flag=True,
-        help="Use mock data for benchmark execution.",
-    )(fn)
-    fn = click.option(
-        "--factor-miner-library",
-        type=click.Path(exists=True),
-        default=None,
-        help="Optional saved library for the FactorMiner baseline.",
-    )(fn)
-    fn = click.option(
-        "--factor-miner-no-memory-library",
-        type=click.Path(exists=True),
-        default=None,
-        help="Optional saved library for the FactorMiner No Memory baseline.",
-    )(fn)
-    return click.pass_context(fn)
-
-
-@benchmark.command("table1")
-@click.option("--baseline", "baselines", multiple=True, help="Restrict to one or more baseline ids.")
-@_benchmark_common_options
-def benchmark_table1(
-    ctx: click.Context,
-    data_path: str | None,
-    mock: bool,
-    factor_miner_library: str | None,
-    factor_miner_no_memory_library: str | None,
-    baselines: tuple[str, ...],
-) -> None:
-    """Run the Top-K freeze benchmark across configured universes."""
-    from factorminer.benchmark.runtime import run_table1_benchmark
-
-    cfg = ctx.obj["config"]
-    output_dir = ctx.obj["output_dir"]
-    payload = run_table1_benchmark(
-        cfg,
-        output_dir,
-        data_path=data_path,
-        mock=mock,
-        baseline_names=list(baselines) if baselines else None,
-        factor_miner_library_path=factor_miner_library,
-        factor_miner_no_memory_library_path=factor_miner_no_memory_library,
-    )
-    _print_benchmark_summary("FactorMiner -- Benchmark Table 1", payload)
-
-
-@benchmark.command("ablation-memory")
-@_benchmark_common_options
-def benchmark_ablation_memory(
-    ctx: click.Context,
-    data_path: str | None,
-    mock: bool,
-    factor_miner_library: str | None,
-    factor_miner_no_memory_library: str | None,
-) -> None:
-    """Run the experience-memory ablation benchmark."""
-    from factorminer.benchmark.runtime import run_ablation_memory_benchmark
-
-    cfg = ctx.obj["config"]
-    output_dir = ctx.obj["output_dir"]
-    payload = run_ablation_memory_benchmark(
-        cfg,
-        output_dir,
-        data_path=data_path,
-        mock=mock,
-        factor_miner_library_path=factor_miner_library,
-        factor_miner_no_memory_library_path=factor_miner_no_memory_library,
-    )
-    _print_benchmark_summary("FactorMiner -- Memory Ablation", payload)
-
-
-@benchmark.command("cost-pressure")
-@click.option("--baseline", default="factor_miner", help="Baseline id to evaluate.")
-@_benchmark_common_options
-def benchmark_cost_pressure(
-    ctx: click.Context,
-    data_path: str | None,
-    mock: bool,
-    factor_miner_library: str | None,
-    factor_miner_no_memory_library: str | None,
-    baseline: str,
-) -> None:
-    """Run transaction-cost pressure testing."""
-    from factorminer.benchmark.runtime import run_cost_pressure_benchmark
-
-    cfg = ctx.obj["config"]
-    output_dir = ctx.obj["output_dir"]
-    payload = run_cost_pressure_benchmark(
-        cfg,
-        output_dir,
-        baseline=baseline,
-        data_path=data_path,
-        mock=mock,
-        factor_miner_library_path=factor_miner_library,
-    )
-    _print_benchmark_summary("FactorMiner -- Cost Pressure", payload)
-
-
-@benchmark.command("efficiency")
-@click.pass_context
-def benchmark_efficiency(ctx: click.Context) -> None:
-    """Run operator-level and factor-level efficiency benchmarks."""
-    from factorminer.benchmark.runtime import run_efficiency_benchmark
-
-    cfg = ctx.obj["config"]
-    output_dir = ctx.obj["output_dir"]
-    payload = run_efficiency_benchmark(cfg, output_dir)
-    _print_benchmark_summary("FactorMiner -- Efficiency Benchmark", payload)
-
-
-@benchmark.command("suite")
-@_benchmark_common_options
-def benchmark_suite(
-    ctx: click.Context,
-    data_path: str | None,
-    mock: bool,
-    factor_miner_library: str | None,
-    factor_miner_no_memory_library: str | None,
-) -> None:
-    """Run the full benchmark suite."""
-    from factorminer.benchmark.runtime import run_benchmark_suite
-
-    cfg = ctx.obj["config"]
-    output_dir = ctx.obj["output_dir"]
-    payload = run_benchmark_suite(
-        cfg,
-        output_dir,
-        data_path=data_path,
-        mock=mock,
-        factor_miner_library_path=factor_miner_library,
-        factor_miner_no_memory_library_path=factor_miner_no_memory_library,
-    )
-    _print_benchmark_summary("FactorMiner -- Benchmark Suite", payload)
-
-
-# ---------------------------------------------------------------------------
-# helix
-# ---------------------------------------------------------------------------
-
-@main.command()
-@click.option("--iterations", "-n", type=int, default=None, help="Override max_iterations.")
-@click.option("--batch-size", "-b", type=int, default=None, help="Override batch_size.")
-@click.option("--target", "-t", type=int, default=None, help="Override target_library_size.")
-@click.option("--resume", type=click.Path(exists=True), default=None, help="Resume from a saved library.")
-@click.option("--causal/--no-causal", default=None, help="Enable/disable causal validation.")
-@click.option("--regime/--no-regime", default=None, help="Enable/disable regime-conditional evaluation.")
-@click.option("--debate/--no-debate", default=None, help="Enable/disable multi-specialist debate generation.")
-@click.option("--canonicalize/--no-canonicalize", default=None, help="Enable/disable SymPy canonicalization.")
-@click.option("--mock", is_flag=True, help="Use mock data and mock LLM provider (for testing).")
-@click.option("--data", "data_path", type=click.Path(exists=True), default=None, help="Path to market data file.")
-@click.pass_context
-def helix(
-    ctx: click.Context,
-    iterations: int | None,
-    batch_size: int | None,
-    target: int | None,
-    resume: str | None,
-    causal: bool | None,
-    regime: bool | None,
-    debate: bool | None,
-    canonicalize: bool | None,
-    mock: bool,
-    data_path: str | None,
-) -> None:
-    """Run the enhanced Helix Loop with Phase 2 features."""
-    cfg = ctx.obj["config"]
-
-    if iterations is not None:
-        cfg.mining.max_iterations = iterations
-    if batch_size is not None:
-        cfg.mining.batch_size = batch_size
-    if target is not None:
-        cfg.mining.target_library_size = target
-
-    if causal is not None:
-        cfg.phase2.causal.enabled = causal
-    if regime is not None:
-        cfg.phase2.regime.enabled = regime
-    if debate is not None:
-        cfg.phase2.debate.enabled = debate
-    if canonicalize is not None:
-        if canonicalize:
-            cfg.phase2.helix.enabled = True
-        cfg.phase2.helix.enable_canonicalization = canonicalize
-
-    try:
-        cfg.validate()
-    except ValueError as e:
-        click.echo(f"Configuration error: {e}")
-        raise click.Abort()
-
-    output_dir = ctx.obj["output_dir"]
-    enabled_features = _active_phase2_features(cfg)
-
-    click.echo("HelixFactor Phase 2 mining engine.")
-    click.echo(f"  Target: {cfg.mining.target_library_size} | "
-               f"Batch: {cfg.mining.batch_size} | "
-               f"Max iterations: {cfg.mining.max_iterations}")
-    click.echo(f"  Output directory: {output_dir}")
-
-    if enabled_features:
-        click.echo(f"  Active Phase 2 features: {', '.join(enabled_features)}")
-    else:
-        click.echo("  No Phase 2 features enabled. Configure phase2.* in your config to enable features.")
-
-    if resume:
-        click.echo(f"  Resuming from: {resume}")
-
-    try:
-        dataset = _load_runtime_dataset_for_analysis(cfg, data_path, mock)
-    except Exception as e:
-        click.echo(f"Error loading data: {e}")
-        raise click.Abort()
-
-    click.echo("  Preparing data tensors...")
-    data_tensor = dataset.data_tensor
-    returns = dataset.returns
-    llm_provider = _create_llm_provider(cfg, mock)
-
-    library = None
-    if resume:
-        library = _load_library_from_path(resume)
-
-    mining_config = _build_core_mining_config(cfg, output_dir, mock=mock)
-    _attach_runtime_targets(mining_config, dataset)
-    phase2_configs = _build_phase2_runtime_configs(cfg)
-    volume = _extract_capacity_volume(data_tensor) if cfg.phase2.capacity.enabled else None
-
-    from factorminer.core.helix_loop import HelixLoop
-
-    click.echo("-" * 60)
-    click.echo("Starting Helix Loop...")
-
-    def _progress_callback(iteration: int, stats: dict) -> None:
-        message = (
-            f"  Iteration {iteration:3d}: "
-            f"Library={stats.get('library_size', 0)}, "
-            f"Admitted={stats.get('admitted', 0)}, "
-            f"Yield={stats.get('yield_rate', 0) * 100:.1f}%"
-        )
-        canon_removed = stats.get("canonical_duplicates_removed", 0)
-        phase2_rejections = stats.get("phase2_rejections", 0)
-        if canon_removed:
-            message += f", CanonDupes={canon_removed}"
-        if phase2_rejections:
-            message += f", Phase2Reject={phase2_rejections}"
-        click.echo(message)
-
-    try:
-        loop = HelixLoop(
-            config=mining_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=llm_provider,
-            library=library,
-            debate_config=phase2_configs["debate_config"],
-            enable_knowledge_graph=(
-                cfg.phase2.helix.enabled and cfg.phase2.helix.enable_knowledge_graph
-            ),
-            enable_embeddings=(
-                cfg.phase2.helix.enabled and cfg.phase2.helix.enable_embeddings
-            ),
-            enable_auto_inventor=cfg.phase2.auto_inventor.enabled,
-            auto_invention_interval=cfg.phase2.auto_inventor.invention_interval,
-            canonicalize=(
-                cfg.phase2.helix.enabled and cfg.phase2.helix.enable_canonicalization
-            ),
-            forgetting_lambda=cfg.phase2.helix.forgetting_lambda,
-            causal_config=phase2_configs["causal_config"],
-            regime_config=phase2_configs["regime_config"],
-            capacity_config=phase2_configs["capacity_config"],
-            significance_config=phase2_configs["significance_config"],
-            volume=volume,
-        )
-        result_library = loop.run(callback=_progress_callback)
-    except KeyboardInterrupt:
-        click.echo("\nHelix mining interrupted by user.")
-        return
-    except Exception as e:
-        click.echo(f"Helix mining error: {e}")
-        logger.exception("Helix loop failed")
-        raise click.Abort()
-
-    lib_path = _save_result_library(result_library, output_dir)
-
-    click.echo("=" * 60)
-    click.echo(f"Helix mining complete! Library size: {result_library.size}")
-    click.echo(f"Library saved to: {lib_path}")
-    click.echo("=" * 60)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/factorminer/factorminer/configs/__init__.py b/src/factorminer/factorminer/configs/__init__.py
deleted file mode 100644
index 5d1a84e..0000000
--- a/src/factorminer/factorminer/configs/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-"""Configuration defaults and schemas for FactorMiner."""
-
-from pathlib import Path
-from typing import Any, Dict
-
-import yaml
-
-CONFIGS_DIR = Path(__file__).parent
-DEFAULT_CONFIG_PATH = CONFIGS_DIR / "default.yaml"
-
-
-def load_default_yaml() -> Dict[str, Any]:
-    """Load the default YAML configuration shipped with the package.
-
-    Returns
-    -------
-    dict
-        Parsed YAML contents as a nested dictionary.  Returns an empty
-        dict if the default file is missing or empty.
-    """
-    if not DEFAULT_CONFIG_PATH.exists():
-        return {}
-    with open(DEFAULT_CONFIG_PATH) as f:
-        data = yaml.safe_load(f)
-    return data if isinstance(data, dict) else {}
diff --git a/src/factorminer/factorminer/configs/benchmark_full.yaml b/src/factorminer/factorminer/configs/benchmark_full.yaml
deleted file mode 100644
index d5be420..0000000
--- a/src/factorminer/factorminer/configs/benchmark_full.yaml
+++ /dev/null
@@ -1,45 +0,0 @@
-output_dir: "./output/benchmark_full"
-
-evaluation:
-  backend: "numpy"
-  signal_failure_policy: "reject"
-
-benchmark:
-  mode: "paper"
-  baselines:
-    - "alpha101_classic"
-    - "alpha101_adapted"
-    - "random_exploration"
-    - "gplearn"
-    - "alphaforge_style"
-    - "alphaagent_style"
-    - "factor_miner"
-    - "factor_miner_no_memory"
-
-data:
-  targets:
-    - name: "paper"
-      entry_delay_bars: 1
-      holding_bars: 1
-      price_pair: "open_to_close"
-      return_transform: "simple"
-  default_target: "paper"
-
-research:
-  enabled: false
-
-phase2:
-  causal:
-    enabled: false
-  regime:
-    enabled: false
-  capacity:
-    enabled: false
-  significance:
-    enabled: false
-  debate:
-    enabled: false
-  auto_inventor:
-    enabled: false
-  helix:
-    enabled: false
diff --git a/src/factorminer/factorminer/configs/default.yaml b/src/factorminer/factorminer/configs/default.yaml
deleted file mode 100644
index 7e92c85..0000000
--- a/src/factorminer/factorminer/configs/default.yaml
+++ /dev/null
@@ -1,307 +0,0 @@
-# =============================================================================
-# FactorMiner Default Configuration
-# =============================================================================
-# This file provides sensible defaults for the FactorMiner mining pipeline.
-# Override any setting by providing a user config file via --config flag,
-# or by passing CLI options directly.
-#
-# Resolution order: defaults -> user config -> CLI overrides
-# =============================================================================
-
-# ---------------------------------------------------------------------------
-# Data paths
-# ---------------------------------------------------------------------------
-# Path to market data CSV/Parquet/HDF5 file. Set to null to use mock data.
-data_path: null
-
-# Directory for all output artifacts (libraries, plots, logs).
-output_dir: "./output"
-
-# ---------------------------------------------------------------------------
-# Mining parameters (Ralph Loop)
-# ---------------------------------------------------------------------------
-mining:
-  # Target number of factors in the final library (paper: 110)
-  target_library_size: 110
-
-  # Number of candidate factors to generate per iteration (paper: 40)
-  batch_size: 40
-
-  # Maximum number of Ralph Loop iterations before stopping
-  max_iterations: 200
-
-  # Minimum absolute IC for a factor to pass Stage 1 screening (paper: 0.04)
-  ic_threshold: 0.04
-
-  # Minimum ICIR for factor quality filtering
-  icir_threshold: 0.5
-
-  # Maximum pairwise |rho| allowed between library factors (paper: 0.5)
-  correlation_threshold: 0.5
-
-  # Minimum IC required for a candidate to replace an existing factor
-  replacement_ic_min: 0.10
-
-  # Replacement IC ratio: candidate IC must be >= ratio * existing IC
-  replacement_ic_ratio: 1.3
-
-# ---------------------------------------------------------------------------
-# Evaluation backend
-# ---------------------------------------------------------------------------
-evaluation:
-  # Number of parallel workers for factor evaluation
-  num_workers: 40
-
-  # Number of assets used in fast IC screening (Stage 1)
-  fast_screen_assets: 100
-
-  # GPU device identifier (used when backend is "gpu")
-  gpu_device: "cuda:0"
-
-  # Computation backend: "gpu" (fastest), "numpy" (CPU), or "c" (C extension)
-  backend: "gpu"
-
-  # How to handle factor expression failures during evaluation/mining:
-  # "reject" = fail the factor/command, "synthetic" = deterministic fallback,
-  # "raise" = propagate the raw exception.
-  signal_failure_policy: "reject"
-
-# ---------------------------------------------------------------------------
-# Data loading
-# ---------------------------------------------------------------------------
-data:
-  # Market type: "a_shares", "crypto", etc.
-  market: "a_shares"
-
-  # Asset universe filter: "CSI500", "CSI1000", "HS300", "Binance"
-  universe: "CSI500"
-
-  # Bar frequency: "10min", "30min", "1h", "1d"
-  frequency: "10min"
-
-  # Feature columns available for factor construction
-  features:
-    - "$open"
-    - "$high"
-    - "$low"
-    - "$close"
-    - "$volume"
-    - "$amt"
-    - "$vwap"
-    - "$returns"
-
-  # Training period [start, end] in ISO format
-  train_period: ["2024-01-01", "2024-12-31"]
-
-  # Out-of-sample test period [start, end] in ISO format
-  test_period: ["2025-01-01", "2025-12-31"]
-
-  # Named target definitions. The default target drives benchmark-facing metrics;
-  # research mode can score all configured targets jointly.
-  targets:
-    - name: "paper"
-      entry_delay_bars: 1
-      holding_bars: 1
-      price_pair: "open_to_close"
-      return_transform: "simple"
-
-  # Target used for paper-style scalar evaluation surfaces.
-  default_target: "paper"
-
-# ---------------------------------------------------------------------------
-# LLM provider settings
-# ---------------------------------------------------------------------------
-llm:
-  # Provider name: "openai", "anthropic", "google", or "mock"
-  provider: "google"
-
-  # Model identifier (provider-specific)
-  # OpenAI:    "gpt-4o", "gpt-4", etc.
-  # Anthropic: "claude-sonnet-4-6", "claude-opus-4-6", etc.
-  # Google:    "gemini-2.0-flash", etc.
-  model: "gemini-2.0-flash"
-
-  # API key for the chosen provider.
-  # Can also be set via environment variables:
-  #   OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY
-  api_key: null
-
-  # Sampling temperature for factor generation (higher = more creative)
-  temperature: 0.8
-
-  # Maximum tokens in LLM response
-  max_tokens: 4096
-
-  # Number of candidate factors requested per LLM call
-  batch_candidates: 40
-
-# ---------------------------------------------------------------------------
-# Experience memory system
-# ---------------------------------------------------------------------------
-memory:
-  # Maximum number of success patterns to retain in memory
-  max_success_patterns: 50
-
-  # Maximum number of failure patterns to retain in memory
-  max_failure_patterns: 100
-
-  # Maximum number of distilled insights in memory
-  max_insights: 30
-
-  # How often (in iterations) to consolidate memory
-  consolidation_interval: 10
-
-# ---------------------------------------------------------------------------
-# Benchmarking and reporting
-# ---------------------------------------------------------------------------
-benchmark:
-  # Execution lane: "paper" for strict reproduction, "research" for Helix extensions
-  mode: "paper"
-
-  # Global RNG seed for deterministic benchmark selection and synthesis
-  seed: 42
-
-  # Number of frozen factors selected on the freeze universe/train split
-  freeze_top_k: 40
-
-  # Universe used for Top-K freeze selection
-  freeze_universe: "CSI500"
-
-  # Universes included in benchmark reporting
-  report_universes:
-    - "CSI500"
-    - "CSI1000"
-    - "HS300"
-    - "Binance"
-
-  # Baselines enabled in the benchmark suite
-  baselines:
-    - "alpha101_classic"
-    - "alpha101_adapted"
-    - "random_exploration"
-    - "gplearn"
-    - "alphaforge_style"
-    - "alphaagent_style"
-    - "factor_miner"
-    - "factor_miner_no_memory"
-
-  # Transaction-cost pressure settings in basis points
-  cost_bps: [1.0, 4.0, 7.0, 10.0, 11.0]
-
-  # Reference panel shape [periods, assets] used for efficiency benchmarks
-  efficiency_panel_shape: [12610, 500]
-
-# ---------------------------------------------------------------------------
-# Research-first multi-horizon scoring
-# ---------------------------------------------------------------------------
-research:
-  enabled: false
-
-  # single_horizon | weighted_multi_horizon | pareto_multi_horizon | net_ir
-  primary_objective: "weighted_multi_horizon"
-
-  # weighted | pareto
-  target_aggregation: "weighted"
-
-  # Optional explicit weights per target name. Empty => infer from data.targets order.
-  horizon_weights: {}
-
-  uncertainty:
-    bootstrap_samples: 200
-    block_size: 20
-    shrinkage_strength: 1.0
-    lcb_zscore: 1.0
-    fdr_alpha: 0.05
-
-  admission:
-    use_residual_ic: true
-    use_effective_rank_gain: true
-    turnover_penalty: 0.05
-    redundancy_penalty: 0.20
-    min_score: 0.04
-    min_lcb: 0.0
-    min_span_gain: 0.05
-    min_effective_rank_gain: 0.0
-
-  selection:
-    models: ["ridge", "elastic_net", "lasso", "xgboost"]
-    rolling_train_window: 80
-    rolling_test_window: 20
-    rolling_step: 20
-
-  regimes:
-    enabled: false
-    definition: "return_volatility_liquidity"
-
-  execution:
-    cost_model: "linear_bps"
-    cost_bps: 4.0
-
-# ---------------------------------------------------------------------------
-# Phase 2 advanced features (disabled by default)
-# ---------------------------------------------------------------------------
-phase2:
-  # Causal validation: Granger causality + intervention tests
-  causal:
-    enabled: false
-    granger_max_lag: 5
-    granger_significance: 0.05
-    n_interventions: 3
-    intervention_magnitude: 2.0
-    intervention_ic_threshold: 0.5
-    robustness_threshold: 0.4
-    granger_weight: 0.4
-    intervention_weight: 0.6
-
-  # Regime-conditional evaluation (bull/bear/high-vol/low-vol)
-  regime:
-    enabled: false
-    lookback_window: 60
-    bull_return_threshold: 0.0
-    bear_return_threshold: 0.0
-    volatility_percentile: 0.7
-    min_regime_ic: 0.03
-    min_regimes_passing: 2
-
-  # Strategy capacity estimation
-  capacity:
-    enabled: false
-    base_capital_usd: 100000000.0
-    ic_degradation_limit: 0.20
-    net_icir_threshold: 0.3
-    sigma_annual: 0.25
-
-  # Statistical significance testing (bootstrap + FDR)
-  significance:
-    enabled: false
-    bootstrap_n_samples: 1000
-    bootstrap_block_size: 20
-    fdr_level: 0.05
-    deflated_sharpe_enabled: true
-    min_deflated_sharpe: 0.0
-
-  # Multi-specialist debate-based generation
-  debate:
-    enabled: false
-    num_specialists: 3
-    candidates_per_specialist: 15
-    enable_critic: true
-    top_k_after_critic: 40
-    critic_temperature: 0.3
-
-  # Automatic operator invention
-  auto_inventor:
-    enabled: false
-    invention_interval: 10
-    max_proposals_per_round: 5
-    min_ic_contribution: 0.03
-    store_dir: "./output/custom_operators"
-
-  # Helix knowledge and memory system
-  helix:
-    enabled: false
-    enable_knowledge_graph: false
-    enable_embeddings: false
-    enable_canonicalization: true
-    forgetting_lambda: 0.95
-    forgetting_demotion_threshold: 20
diff --git a/src/factorminer/factorminer/configs/demo_local.yaml b/src/factorminer/factorminer/configs/demo_local.yaml
deleted file mode 100644
index e54bce7..0000000
--- a/src/factorminer/factorminer/configs/demo_local.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-output_dir: "./output/demo_local"
-
-evaluation:
-  backend: "numpy"
-  signal_failure_policy: "synthetic"
-  num_workers: 4
-  fast_screen_assets: 20
-
-mining:
-  batch_size: 8
-  max_iterations: 4
-  target_library_size: 16
-
-benchmark:
-  mode: "research"
-  freeze_top_k: 10
-  report_universes:
-    - "CSI500"
-
-data:
-  targets:
-    - name: "h1_open_to_close"
-      entry_delay_bars: 1
-      holding_bars: 1
-      price_pair: "open_to_close"
-      return_transform: "simple"
-    - name: "h3_open_to_close"
-      entry_delay_bars: 1
-      holding_bars: 3
-      price_pair: "open_to_close"
-      return_transform: "simple"
-  default_target: "h1_open_to_close"
-
-research:
-  enabled: true
-  horizon_weights:
-    h1_open_to_close: 0.7
-    h3_open_to_close: 0.3
-  uncertainty:
-    bootstrap_samples: 50
-    block_size: 8
-  selection:
-    rolling_train_window: 40
-    rolling_test_window: 10
-    rolling_step: 10
-  regimes:
-    enabled: true
-
-phase2:
-  debate:
-    enabled: true
-    num_specialists: 2
-    candidates_per_specialist: 4
-    top_k_after_critic: 8
-  helix:
-    enabled: true
-    enable_canonicalization: true
diff --git a/src/factorminer/factorminer/configs/helix_research.yaml b/src/factorminer/factorminer/configs/helix_research.yaml
deleted file mode 100644
index 3c32f71..0000000
--- a/src/factorminer/factorminer/configs/helix_research.yaml
+++ /dev/null
@@ -1,82 +0,0 @@
-output_dir: "./output/helix_research"
-
-benchmark:
-  mode: "research"
-
-data:
-  targets:
-    - name: "h1_open_to_close"
-      entry_delay_bars: 1
-      holding_bars: 1
-      price_pair: "open_to_close"
-      return_transform: "simple"
-    - name: "h3_open_to_close"
-      entry_delay_bars: 1
-      holding_bars: 3
-      price_pair: "open_to_close"
-      return_transform: "simple"
-    - name: "h6_open_to_close"
-      entry_delay_bars: 1
-      holding_bars: 6
-      price_pair: "open_to_close"
-      return_transform: "simple"
-    - name: "h1_close_to_close"
-      entry_delay_bars: 0
-      holding_bars: 1
-      price_pair: "close_to_close"
-      return_transform: "simple"
-    - name: "h5_close_to_close"
-      entry_delay_bars: 0
-      holding_bars: 5
-      price_pair: "close_to_close"
-      return_transform: "simple"
-  default_target: "h1_open_to_close"
-
-research:
-  enabled: true
-  primary_objective: "weighted_multi_horizon"
-  target_aggregation: "weighted"
-  horizon_weights:
-    h1_open_to_close: 0.35
-    h3_open_to_close: 0.25
-    h6_open_to_close: 0.15
-    h1_close_to_close: 0.15
-    h5_close_to_close: 0.10
-  uncertainty:
-    bootstrap_samples: 200
-    block_size: 20
-    shrinkage_strength: 1.0
-    lcb_zscore: 1.0
-  admission:
-    use_residual_ic: true
-    use_effective_rank_gain: true
-    turnover_penalty: 0.05
-    redundancy_penalty: 0.20
-    min_score: 0.04
-    min_lcb: 0.0
-    min_span_gain: 0.05
-  selection:
-    models: ["ridge", "elastic_net", "lasso", "xgboost"]
-  regimes:
-    enabled: true
-    definition: "return_volatility_liquidity"
-  execution:
-    cost_model: "linear_bps"
-    cost_bps: 4.0
-
-phase2:
-  causal:
-    enabled: true
-  regime:
-    enabled: true
-  capacity:
-    enabled: true
-  significance:
-    enabled: true
-  debate:
-    enabled: true
-  helix:
-    enabled: true
-    enable_knowledge_graph: true
-    enable_embeddings: true
-    enable_canonicalization: true
diff --git a/src/factorminer/factorminer/configs/paper_repro.yaml b/src/factorminer/factorminer/configs/paper_repro.yaml
deleted file mode 100644
index a872c00..0000000
--- a/src/factorminer/factorminer/configs/paper_repro.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-output_dir: "./output/paper_repro"
-
-evaluation:
-  backend: "numpy"
-  signal_failure_policy: "reject"
-
-benchmark:
-  mode: "paper"
-
-data:
-  targets:
-    - name: "paper"
-      entry_delay_bars: 1
-      holding_bars: 1
-      price_pair: "open_to_close"
-      return_transform: "simple"
-  default_target: "paper"
-
-research:
-  enabled: false
-
-phase2:
-  causal:
-    enabled: false
-  regime:
-    enabled: false
-  capacity:
-    enabled: false
-  significance:
-    enabled: false
-  debate:
-    enabled: false
-  auto_inventor:
-    enabled: false
-  helix:
-    enabled: false
diff --git a/src/factorminer/factorminer/core/__init__.py b/src/factorminer/factorminer/core/__init__.py
deleted file mode 100644
index d02b8d2..0000000
--- a/src/factorminer/factorminer/core/__init__.py
+++ /dev/null
@@ -1,67 +0,0 @@
-"""FactorMiner core: expression trees, types, factor DSL parser, and Ralph Loop."""
-
-from src.factorminer.factorminer.core.expression_tree import (
-    ConstantNode,
-    ExpressionTree,
-    LeafNode,
-    Node,
-    OperatorNode,
-)
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.core.library_io import (
-    export_csv,
-    export_formulas,
-    import_from_paper,
-    load_library,
-    save_library,
-)
-from src.factorminer.factorminer.core.parser import parse, try_parse
-from src.factorminer.factorminer.core.ralph_loop import RalphLoop
-from src.factorminer.factorminer.core.helix_loop import HelixLoop
-from src.factorminer.factorminer.core.session import MiningSession
-from src.factorminer.factorminer.core.config import MiningConfig as CoreMiningConfig
-from src.factorminer.factorminer.core.types import (
-    FEATURES,
-    FEATURE_SET,
-    OPERATOR_REGISTRY,
-    OperatorSpec,
-    OperatorType,
-    SignatureType,
-    get_operator,
-)
-from src.factorminer.factorminer.core.canonicalizer import FormulaCanonicalizer
-
-__all__ = [
-    # Expression tree
-    "Node",
-    "LeafNode",
-    "ConstantNode",
-    "OperatorNode",
-    "ExpressionTree",
-    # Factor library
-    "Factor",
-    "FactorLibrary",
-    "save_library",
-    "load_library",
-    "export_csv",
-    "export_formulas",
-    "import_from_paper",
-    # Parser
-    "parse",
-    "try_parse",
-    # Loops
-    "RalphLoop",
-    "HelixLoop",
-    "MiningSession",
-    "CoreMiningConfig",
-    # Types
-    "OperatorSpec",
-    "OperatorType",
-    "SignatureType",
-    "FEATURES",
-    "FEATURE_SET",
-    "OPERATOR_REGISTRY",
-    "get_operator",
-    # Canonicalizer
-    "FormulaCanonicalizer",
-]
diff --git a/src/factorminer/factorminer/core/canonicalizer.py b/src/factorminer/factorminer/core/canonicalizer.py
deleted file mode 100644
index 0ce3889..0000000
--- a/src/factorminer/factorminer/core/canonicalizer.py
+++ /dev/null
@@ -1,206 +0,0 @@
-"""SymPy-based formula canonicalization for duplicate detection.
-
-Converts ``ExpressionTree`` objects into canonical SymPy expressions so that
-algebraically equivalent formulas (e.g. ``Add($close, $open)`` vs
-``Add($open, $close)``, or ``Neg(Neg($close))`` vs ``$close``) produce
-identical hashes.
-
-**Design principle**: Arithmetic operators map to native SymPy math so that
-standard simplifications (commutativity, double-negation, x/x = 1, etc.) are
-applied automatically.  Non-algebraic operators (rolling windows,
-cross-sectional transforms, conditionals) are represented as opaque
-``sympy.Function`` symbols so their structure is preserved without false
-simplification.
-"""
-
-from __future__ import annotations
-
-import hashlib
-from typing import Dict, List, Optional
-
-import sympy
-from sympy import Abs, Float, Function, Symbol, log, sqrt
-
-from src.factorminer.factorminer.core.expression_tree import (
-    ConstantNode,
-    ExpressionTree,
-    LeafNode,
-    Node,
-    OperatorNode,
-)
-
-# Arithmetic operator names that map directly to SymPy math.
-_ALGEBRAIC_OPS = frozenset({
-    "Add", "Sub", "Mul", "Div", "Neg", "Abs",
-    "Square", "Sqrt", "Log", "Pow", "SignedPower",
-})
-
-
-class FormulaCanonicalizer:
-    """Canonicalize expression trees via SymPy simplification.
-
-    Maintains an internal cache so that repeated calls for the same formula
-    string are fast.
-
-    Examples
-    --------
-    >>> from factorminer.core.parser import parse
-    >>> canon = FormulaCanonicalizer()
-    >>> canon.is_duplicate(parse("Add($close, $open)"), parse("Add($open, $close)"))
-    True
-    >>> canon.is_duplicate(parse("Neg(Neg($close))"), parse("$close"))
-    True
-    """
-
-    def __init__(self) -> None:
-        self._cache: Dict[str, str] = {}  # formula string -> canonical MD5 hash
-
-    # ------------------------------------------------------------------
-    # Public API
-    # ------------------------------------------------------------------
-
-    def canonicalize(self, tree: ExpressionTree) -> str:
-        """Return an MD5 hash of the canonical (simplified) form of *tree*.
-
-        Parameters
-        ----------
-        tree : ExpressionTree
-            The expression tree to canonicalize.
-
-        Returns
-        -------
-        str
-            Hex-encoded MD5 digest of the canonical string representation.
-        """
-        key = tree.to_string()
-        if key in self._cache:
-            return self._cache[key]
-
-        sympy_expr = self._tree_to_sympy(tree.root)
-        simplified = sympy.simplify(sympy_expr)
-        canonical_str = str(simplified)
-        digest = hashlib.md5(canonical_str.encode("utf-8")).hexdigest()
-        self._cache[key] = digest
-        return digest
-
-    def is_duplicate(self, tree_a: ExpressionTree, tree_b: ExpressionTree) -> bool:
-        """Return ``True`` if *tree_a* and *tree_b* are algebraically equivalent.
-
-        Parameters
-        ----------
-        tree_a, tree_b : ExpressionTree
-            Two expression trees to compare.
-
-        Returns
-        -------
-        bool
-        """
-        return self.canonicalize(tree_a) == self.canonicalize(tree_b)
-
-    def get_canonical_form(self, tree: ExpressionTree) -> str:
-        """Return the simplified string representation (not hashed).
-
-        Useful for debugging and display.
-
-        Parameters
-        ----------
-        tree : ExpressionTree
-
-        Returns
-        -------
-        str
-            Human-readable simplified expression.
-        """
-        sympy_expr = self._tree_to_sympy(tree.root)
-        simplified = sympy.simplify(sympy_expr)
-        return str(simplified)
-
-    def clear_cache(self) -> None:
-        """Discard all cached canonical hashes."""
-        self._cache.clear()
-
-    # ------------------------------------------------------------------
-    # Tree -> SymPy conversion
-    # ------------------------------------------------------------------
-
-    def _tree_to_sympy(self, node: Node) -> sympy.Expr:
-        """Recursively convert an expression-tree node to a SymPy expression.
-
-        Parameters
-        ----------
-        node : Node
-            Any node in the expression tree hierarchy.
-
-        Returns
-        -------
-        sympy.Expr
-        """
-        if isinstance(node, LeafNode):
-            return Symbol(node.feature_name)
-
-        if isinstance(node, ConstantNode):
-            return Float(node.value)
-
-        if isinstance(node, OperatorNode):
-            children_sympy = [self._tree_to_sympy(c) for c in node.children]
-            return self._map_operator(
-                node.operator.name, children_sympy, node.params
-            )
-
-        raise TypeError(f"Unexpected node type: {type(node).__name__}")
-
-    def _map_operator(
-        self,
-        name: str,
-        children: List[sympy.Expr],
-        params: Dict[str, float],
-    ) -> sympy.Expr:
-        """Dispatch an operator to its SymPy equivalent.
-
-        Arithmetic operators are mapped to native SymPy math so the
-        simplifier can reason about them.  All other operators become opaque
-        ``sympy.Function`` applications that preserve structure.
-
-        Parameters
-        ----------
-        name : str
-            Operator name from the registry (e.g. ``"Add"``, ``"CsRank"``).
-        children : list[sympy.Expr]
-            Already-converted child expressions.
-        params : dict[str, float]
-            Extra numeric parameters (e.g. ``{"window": 10}``).
-
-        Returns
-        -------
-        sympy.Expr
-        """
-        # --- Arithmetic: map to SymPy math ------------------------------------
-        if name == "Add":
-            return children[0] + children[1]
-        if name == "Sub":
-            return children[0] - children[1]
-        if name == "Mul":
-            return children[0] * children[1]
-        if name == "Div":
-            return children[0] / children[1]
-        if name == "Neg":
-            return -children[0]
-        if name == "Abs":
-            return Abs(children[0])
-        if name == "Square":
-            return children[0] ** 2
-        if name == "Sqrt":
-            return sqrt(Abs(children[0]))
-        if name == "Log":
-            return log(1 + Abs(children[0]))
-        if name in ("Pow", "SignedPower"):
-            return children[0] ** children[1]
-
-        # --- Non-algebraic: wrap as opaque Function ---------------------------
-        func = Function(name)
-        # Build argument list: children first, then params as Float values
-        args: List[sympy.Expr] = list(children)
-        # Append params in a deterministic order (sorted by param name).
-        for pname in sorted(params):
-            args.append(Float(params[pname]))
-        return func(*args)
diff --git a/src/factorminer/factorminer/core/config.py b/src/factorminer/factorminer/core/config.py
deleted file mode 100644
index bd2537a..0000000
--- a/src/factorminer/factorminer/core/config.py
+++ /dev/null
@@ -1,61 +0,0 @@
-"""Mining-specific configuration for the Ralph Loop.
-
-Provides a flat configuration dataclass specifically for the mining loop,
-separate from the hierarchical Config system in utils/config.py.  This
-allows the RalphLoop to accept a simple, focused parameter object while
-the full Config handles loading, validation, and serialization.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-
-@dataclass
-class MiningConfig:
-    """Flat configuration controlling the Ralph Loop mining process.
-
-    This is a convenience alias that mirrors the mining-relevant fields
-    from the hierarchical Config.  The RalphLoop can accept either this
-    or the full ``utils.config.MiningConfig``.
-    """
-
-    target_library_size: int = 110
-    batch_size: int = 40
-    max_iterations: int = 200
-    ic_threshold: float = 0.04
-    icir_threshold: float = 0.5
-    correlation_threshold: float = 0.5
-    replacement_ic_min: float = 0.10
-    replacement_ic_ratio: float = 1.3
-    fast_screen_assets: int = 100
-    num_workers: int = 40
-    output_dir: str = "./output"
-    gpu_device: str = "cuda:0"
-    backend: str = "numpy"
-    signal_failure_policy: str = "reject"
-
-    def validate(self) -> None:
-        """Basic sanity checks on parameter values."""
-        if self.target_library_size < 1:
-            raise ValueError("target_library_size must be >= 1")
-        if self.batch_size < 1:
-            raise ValueError("batch_size must be >= 1")
-        if self.max_iterations < 1:
-            raise ValueError("max_iterations must be >= 1")
-        if not (0.0 < self.ic_threshold < 1.0):
-            raise ValueError("ic_threshold must be in (0, 1)")
-        if not (0.0 < self.correlation_threshold <= 1.0):
-            raise ValueError("correlation_threshold must be in (0, 1]")
-        if self.replacement_ic_min <= self.ic_threshold:
-            raise ValueError("replacement_ic_min must be > ic_threshold")
-        if self.replacement_ic_ratio < 1.0:
-            raise ValueError("replacement_ic_ratio must be >= 1.0")
-        if self.backend not in ("gpu", "numpy", "c"):
-            raise ValueError(
-                f"backend must be one of: gpu, numpy, c (got '{self.backend}')"
-            )
-        if self.signal_failure_policy not in ("reject", "synthetic", "raise"):
-            raise ValueError(
-                "signal_failure_policy must be one of: reject, synthetic, raise"
-            )
diff --git a/src/factorminer/factorminer/core/expression_tree.py b/src/factorminer/factorminer/core/expression_tree.py
deleted file mode 100644
index e8a5da3..0000000
--- a/src/factorminer/factorminer/core/expression_tree.py
+++ /dev/null
@@ -1,736 +0,0 @@
-"""Expression tree data structure for alpha-factor formulas.
-
-An expression tree is a DAG of ``Node`` objects whose leaves are raw
-market-data features (``LeafNode``) or numeric constants (``ConstantNode``)
-and whose internal nodes are operator applications (``OperatorNode``).
-"""
-
-from __future__ import annotations
-
-import copy
-import math
-from abc import ABC, abstractmethod
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
-
-import numpy as np
-
-from src.factorminer.factorminer.core.types import (
-    FEATURE_SET,
-    OperatorSpec,
-    OperatorType,
-    SignatureType,
-)
-
-# Epsilon for safe division / log
-_EPS = 1e-10
-
-
-# ---------------------------------------------------------------------------
-# Node hierarchy
-# ---------------------------------------------------------------------------
-
-class Node(ABC):
-    """Abstract base for every node in an expression tree."""
-
-    @abstractmethod
-    def evaluate(self, data: Dict[str, np.ndarray]) -> np.ndarray:
-        """Compute the node's value given market data.
-
-        Parameters
-        ----------
-        data : dict[str, np.ndarray]
-            Maps feature names (e.g. ``"$close"``) to arrays of shape
-            ``(M, T)`` where *M* is the number of stocks and *T* is the
-            number of time steps.
-
-        Returns
-        -------
-        np.ndarray
-            Result array, typically shape ``(M, T)``.
-        """
-
-    @abstractmethod
-    def to_string(self) -> str:
-        """Serialize the subtree rooted at this node to a DSL formula."""
-
-    @abstractmethod
-    def depth(self) -> int:
-        """Return the depth of the subtree (leaf = 1)."""
-
-    @abstractmethod
-    def size(self) -> int:
-        """Return the number of nodes in the subtree."""
-
-    @abstractmethod
-    def clone(self) -> "Node":
-        """Return a deep copy of the subtree."""
-
-    def __repr__(self) -> str:  # pragma: no cover
-        return self.to_string()
-
-    # Iteration helpers -----------------------------------------------------
-
-    def iter_nodes(self) -> Iterator["Node"]:
-        """Yield every node in the subtree (pre-order)."""
-        yield self
-        if isinstance(self, OperatorNode):
-            for child in self.children:
-                yield from child.iter_nodes()
-
-    def leaf_features(self) -> List[str]:
-        """Return sorted unique feature names referenced by this subtree."""
-        feats = {n.feature_name for n in self.iter_nodes() if isinstance(n, LeafNode)}
-        return sorted(feats)
-
-
-class LeafNode(Node):
-    """References a raw market-data column (e.g. ``$close``)."""
-
-    __slots__ = ("feature_name",)
-
-    def __init__(self, feature_name: str) -> None:
-        if feature_name not in FEATURE_SET:
-            raise ValueError(
-                f"Unknown feature '{feature_name}'. "
-                f"Expected one of {sorted(FEATURE_SET)}."
-            )
-        self.feature_name = feature_name
-
-    def evaluate(self, data: Dict[str, np.ndarray]) -> np.ndarray:
-        if self.feature_name not in data:
-            raise KeyError(
-                f"Feature '{self.feature_name}' not found in data. "
-                f"Available: {sorted(data.keys())}"
-            )
-        return data[self.feature_name].astype(np.float64, copy=False)
-
-    def to_string(self) -> str:
-        return self.feature_name
-
-    def depth(self) -> int:
-        return 1
-
-    def size(self) -> int:
-        return 1
-
-    def clone(self) -> "LeafNode":
-        return LeafNode(self.feature_name)
-
-
-class ConstantNode(Node):
-    """A numeric literal embedded in the expression."""
-
-    __slots__ = ("value",)
-
-    def __init__(self, value: float) -> None:
-        self.value = float(value)
-
-    def evaluate(self, data: Dict[str, np.ndarray]) -> np.ndarray:
-        # Infer shape from any entry in data so the constant broadcasts.
-        for arr in data.values():
-            return np.full_like(arr, self.value, dtype=np.float64)
-        raise ValueError("Cannot evaluate ConstantNode with empty data dict.")
-
-    def to_string(self) -> str:
-        # Produce a clean numeric literal.
-        if self.value == int(self.value) and abs(self.value) < 1e12:
-            return str(int(self.value))
-        return f"{self.value:g}"
-
-    def depth(self) -> int:
-        return 1
-
-    def size(self) -> int:
-        return 1
-
-    def clone(self) -> "ConstantNode":
-        return ConstantNode(self.value)
-
-
-class OperatorNode(Node):
-    """An internal node that applies an operator to child sub-trees.
-
-    Parameters
-    ----------
-    operator : OperatorSpec
-        The operator to apply.
-    children : list[Node]
-        Child expression nodes.  Length must equal ``operator.arity``.
-    params : dict[str, float]
-        Extra numeric parameters (e.g. ``{"window": 10}``).
-    """
-
-    __slots__ = ("operator", "children", "params")
-
-    def __init__(
-        self,
-        operator: OperatorSpec,
-        children: List[Node],
-        params: Optional[Dict[str, float]] = None,
-    ) -> None:
-        self.operator = operator
-        self.children = list(children)
-        self.params = dict(params) if params else {}
-        # Merge defaults for any missing parameter.
-        for pname, pdefault in operator.param_defaults.items():
-            if pname not in self.params:
-                self.params[pname] = pdefault
-
-    # ---- serialization ----------------------------------------------------
-
-    def to_string(self) -> str:
-        parts = [child.to_string() for child in self.children]
-        # Append explicit numeric parameters (window etc.)
-        for pname in self.operator.param_names:
-            if pname in self.params:
-                v = self.params[pname]
-                if v == int(v) and abs(v) < 1e12:
-                    parts.append(str(int(v)))
-                else:
-                    parts.append(f"{v:g}")
-        return f"{self.operator.name}({', '.join(parts)})"
-
-    # ---- structural queries -----------------------------------------------
-
-    def depth(self) -> int:
-        if not self.children:
-            return 1
-        return 1 + max(c.depth() for c in self.children)
-
-    def size(self) -> int:
-        return 1 + sum(c.size() for c in self.children)
-
-    def clone(self) -> "OperatorNode":
-        return OperatorNode(
-            operator=self.operator,
-            children=[c.clone() for c in self.children],
-            params=dict(self.params),
-        )
-
-    # ---- evaluation -------------------------------------------------------
-
-    def evaluate(self, data: Dict[str, np.ndarray]) -> np.ndarray:
-        child_vals = [c.evaluate(data) for c in self.children]
-        return _dispatch_operator(self.operator, child_vals, self.params)
-
-
-# ---------------------------------------------------------------------------
-# Operator dispatch  (pure-numpy implementations)
-# ---------------------------------------------------------------------------
-
-def _safe_div(a: np.ndarray, b: np.ndarray) -> np.ndarray:
-    """Division that returns 0 where the denominator is near zero."""
-    out = np.where(np.abs(b) > _EPS, a / np.where(np.abs(b) > _EPS, b, 1.0), 0.0)
-    return out
-
-
-def _safe_log(x: np.ndarray) -> np.ndarray:
-    return np.sign(x) * np.log1p(np.abs(x))
-
-
-def _safe_sqrt(x: np.ndarray) -> np.ndarray:
-    return np.sign(x) * np.sqrt(np.abs(x))
-
-
-def _rolling_apply(
-    x: np.ndarray,
-    window: int,
-    func,
-    *,
-    binary_y: Optional[np.ndarray] = None,
-) -> np.ndarray:
-    """Apply *func* over a rolling window along the last axis (T).
-
-    Parameters
-    ----------
-    x : np.ndarray, shape (M, T)
-    window : int
-    func : callable  (slice_x, [slice_y]) -> scalar or 1-d
-    binary_y : optional second array for bivariate rolling ops
-
-    Returns
-    -------
-    np.ndarray, shape (M, T)   – leading positions filled with NaN.
-    """
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(window - 1, T):
-        sx = x[:, t - window + 1 : t + 1]
-        if binary_y is not None:
-            sy = binary_y[:, t - window + 1 : t + 1]
-            out[:, t] = func(sx, sy)
-        else:
-            out[:, t] = func(sx)
-    return out
-
-
-def _ts_mean(sx: np.ndarray) -> np.ndarray:
-    return np.nanmean(sx, axis=1)
-
-
-def _ts_std(sx: np.ndarray) -> np.ndarray:
-    return np.nanstd(sx, axis=1, ddof=1)
-
-
-def _ts_var(sx: np.ndarray) -> np.ndarray:
-    return np.nanvar(sx, axis=1, ddof=1)
-
-
-def _ts_sum(sx: np.ndarray) -> np.ndarray:
-    return np.nansum(sx, axis=1)
-
-
-def _ts_prod(sx: np.ndarray) -> np.ndarray:
-    return np.nanprod(sx, axis=1)
-
-
-def _ts_max(sx: np.ndarray) -> np.ndarray:
-    return np.nanmax(sx, axis=1)
-
-
-def _ts_min(sx: np.ndarray) -> np.ndarray:
-    return np.nanmin(sx, axis=1)
-
-
-def _ts_argmax(sx: np.ndarray) -> np.ndarray:
-    return np.nanargmax(sx, axis=1).astype(np.float64)
-
-
-def _ts_argmin(sx: np.ndarray) -> np.ndarray:
-    return np.nanargmin(sx, axis=1).astype(np.float64)
-
-
-def _ts_median(sx: np.ndarray) -> np.ndarray:
-    return np.nanmedian(sx, axis=1)
-
-
-def _ts_skew(sx: np.ndarray) -> np.ndarray:
-    m = np.nanmean(sx, axis=1, keepdims=True)
-    s = np.nanstd(sx, axis=1, keepdims=True, ddof=1)
-    s = np.where(s > _EPS, s, 1.0)
-    n = sx.shape[1]
-    sk = np.nanmean(((sx - m) / s) ** 3, axis=1) * n**2 / max((n - 1) * (n - 2), 1)
-    return sk
-
-
-def _ts_kurt(sx: np.ndarray) -> np.ndarray:
-    m = np.nanmean(sx, axis=1, keepdims=True)
-    s = np.nanstd(sx, axis=1, keepdims=True, ddof=1)
-    s = np.where(s > _EPS, s, 1.0)
-    return np.nanmean(((sx - m) / s) ** 4, axis=1) - 3.0
-
-
-def _ts_rank(sx: np.ndarray) -> np.ndarray:
-    """Percentile rank of the latest value within the window."""
-    latest = sx[:, -1]
-    rank = np.sum(sx <= latest[:, None], axis=1).astype(np.float64)
-    return rank / sx.shape[1]
-
-
-def _ts_corr(sx: np.ndarray, sy: np.ndarray) -> np.ndarray:
-    mx = np.nanmean(sx, axis=1, keepdims=True)
-    my = np.nanmean(sy, axis=1, keepdims=True)
-    dx, dy = sx - mx, sy - my
-    cov = np.nanmean(dx * dy, axis=1)
-    sx_std = np.nanstd(sx, axis=1, ddof=1)
-    sy_std = np.nanstd(sy, axis=1, ddof=1)
-    denom = sx_std * sy_std
-    return np.where(denom > _EPS, cov / denom, 0.0)
-
-
-def _ts_cov(sx: np.ndarray, sy: np.ndarray) -> np.ndarray:
-    mx = np.nanmean(sx, axis=1, keepdims=True)
-    my = np.nanmean(sy, axis=1, keepdims=True)
-    return np.nanmean((sx - mx) * (sy - my), axis=1)
-
-
-def _ts_beta(sx: np.ndarray, sy: np.ndarray) -> np.ndarray:
-    """Rolling OLS slope of x on y."""
-    my = np.nanmean(sy, axis=1, keepdims=True)
-    mx = np.nanmean(sx, axis=1, keepdims=True)
-    dy = sy - my
-    var_y = np.nansum(dy ** 2, axis=1)
-    cov_xy = np.nansum((sx - mx) * dy, axis=1)
-    return np.where(var_y > _EPS, cov_xy / var_y, 0.0)
-
-
-def _ts_resid(sx: np.ndarray, sy: np.ndarray) -> np.ndarray:
-    beta = _ts_beta(sx, sy)
-    my = np.nanmean(sy, axis=1, keepdims=True)
-    mx = np.nanmean(sx, axis=1, keepdims=True)
-    predicted = mx.squeeze(1) + beta * (sy[:, -1] - my.squeeze(1))
-    return sx[:, -1] - predicted
-
-
-def _ema(x: np.ndarray, window: int) -> np.ndarray:
-    """Exponential moving average along the last axis."""
-    alpha = 2.0 / (window + 1)
-    M, T = x.shape
-    out = np.empty_like(x, dtype=np.float64)
-    out[:, 0] = x[:, 0]
-    for t in range(1, T):
-        out[:, t] = alpha * x[:, t] + (1 - alpha) * out[:, t - 1]
-    return out
-
-
-def _wma(x: np.ndarray, window: int) -> np.ndarray:
-    """Linearly-weighted moving average."""
-    weights = np.arange(1, window + 1, dtype=np.float64)
-    weights /= weights.sum()
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(window - 1, T):
-        out[:, t] = (x[:, t - window + 1 : t + 1] * weights[None, :]).sum(axis=1)
-    return out
-
-
-def _decay(x: np.ndarray, window: int) -> np.ndarray:
-    """Exponentially decaying sum."""
-    alpha = 2.0 / (window + 1)
-    weights = np.array([alpha * (1 - alpha) ** i for i in range(window)][::-1])
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(window - 1, T):
-        out[:, t] = (x[:, t - window + 1 : t + 1] * weights[None, :]).sum(axis=1)
-    return out
-
-
-def _cs_rank(x: np.ndarray) -> np.ndarray:
-    """Cross-sectional percentile rank at each time step."""
-    M, T = x.shape
-    out = np.empty_like(x, dtype=np.float64)
-    for t in range(T):
-        col = x[:, t]
-        valid = ~np.isnan(col)
-        ranked = np.empty(M, dtype=np.float64)
-        ranked[:] = np.nan
-        if valid.any():
-            order = col[valid].argsort().argsort().astype(np.float64)
-            ranked[valid] = (order + 1) / valid.sum()
-        out[:, t] = ranked
-    return out
-
-
-def _cs_zscore(x: np.ndarray) -> np.ndarray:
-    M, T = x.shape
-    out = np.empty_like(x, dtype=np.float64)
-    for t in range(T):
-        col = x[:, t]
-        m = np.nanmean(col)
-        s = np.nanstd(col, ddof=1)
-        out[:, t] = (col - m) / max(s, _EPS)
-    return out
-
-
-def _cs_demean(x: np.ndarray) -> np.ndarray:
-    m = np.nanmean(x, axis=0, keepdims=True)
-    return x - m
-
-
-def _cs_scale(x: np.ndarray) -> np.ndarray:
-    s = np.nansum(np.abs(x), axis=0, keepdims=True)
-    s = np.where(s > _EPS, s, 1.0)
-    return x / s
-
-
-def _ts_linreg_slope(x: np.ndarray, window: int) -> np.ndarray:
-    t_vals = np.arange(window, dtype=np.float64)
-    t_mean = t_vals.mean()
-    t_var = np.sum((t_vals - t_mean) ** 2)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(window - 1, T):
-        sx = x[:, t - window + 1 : t + 1]
-        x_mean = np.nanmean(sx, axis=1, keepdims=True)
-        cov = np.nansum((sx - x_mean) * (t_vals[None, :] - t_mean), axis=1)
-        out[:, t] = cov / max(t_var, _EPS)
-    return out
-
-
-def _ts_linreg_intercept(x: np.ndarray, window: int) -> np.ndarray:
-    t_vals = np.arange(window, dtype=np.float64)
-    t_mean = t_vals.mean()
-    slope = _ts_linreg_slope(x, window)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(window - 1, T):
-        sx = x[:, t - window + 1 : t + 1]
-        x_mean = np.nanmean(sx, axis=1)
-        out[:, t] = x_mean - slope[:, t] * t_mean
-    return out
-
-
-def _ts_linreg_fitted(x: np.ndarray, window: int) -> np.ndarray:
-    slope = _ts_linreg_slope(x, window)
-    intercept = _ts_linreg_intercept(x, window)
-    t_last = float(window - 1)
-    return intercept + slope * t_last
-
-
-def _ts_linreg_resid(x: np.ndarray, window: int) -> np.ndarray:
-    fitted = _ts_linreg_fitted(x, window)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(window - 1, T):
-        out[:, t] = x[:, t] - fitted[:, t]
-    return out
-
-
-# Main dispatch table -------------------------------------------------------
-
-def _dispatch_operator(
-    spec: OperatorSpec,
-    children: List[np.ndarray],
-    params: Dict[str, float],
-) -> np.ndarray:
-    """Execute an operator on evaluated children, return result array."""
-    name = spec.name
-    w = int(params.get("window", 0))
-
-    # -- Arithmetic ---------------------------------------------------------
-    if name == "Add":
-        return children[0] + children[1]
-    if name == "Sub":
-        return children[0] - children[1]
-    if name == "Mul":
-        return children[0] * children[1]
-    if name == "Div":
-        return _safe_div(children[0], children[1])
-    if name == "Neg":
-        return -children[0]
-    if name == "Abs":
-        return np.abs(children[0])
-    if name == "Sign":
-        return np.sign(children[0])
-    if name == "Log":
-        return _safe_log(children[0])
-    if name == "Sqrt":
-        return _safe_sqrt(children[0])
-    if name == "Square":
-        return children[0] ** 2
-    if name == "Pow":
-        base, exp = children
-        return np.sign(base) * np.abs(base) ** exp
-    if name == "Max":
-        return np.maximum(children[0], children[1])
-    if name == "Min":
-        return np.minimum(children[0], children[1])
-    if name == "Clip":
-        lo = params.get("lower", -3.0)
-        hi = params.get("upper", 3.0)
-        return np.clip(children[0], lo, hi)
-    if name == "Inv":
-        return _safe_div(np.ones_like(children[0]), children[0])
-
-    # -- Statistical (rolling) ----------------------------------------------
-    if name == "Mean":
-        return _rolling_apply(children[0], w, _ts_mean)
-    if name == "Std":
-        return _rolling_apply(children[0], w, _ts_std)
-    if name == "Var":
-        return _rolling_apply(children[0], w, _ts_var)
-    if name == "Skew":
-        return _rolling_apply(children[0], w, _ts_skew)
-    if name == "Kurt":
-        return _rolling_apply(children[0], w, _ts_kurt)
-    if name == "Median":
-        return _rolling_apply(children[0], w, _ts_median)
-    if name == "Sum":
-        return _rolling_apply(children[0], w, _ts_sum)
-    if name == "Prod":
-        return _rolling_apply(children[0], w, _ts_prod)
-    if name == "TsMax":
-        return _rolling_apply(children[0], w, _ts_max)
-    if name == "TsMin":
-        return _rolling_apply(children[0], w, _ts_min)
-    if name == "TsArgMax":
-        return _rolling_apply(children[0], w, _ts_argmax)
-    if name == "TsArgMin":
-        return _rolling_apply(children[0], w, _ts_argmin)
-    if name == "TsRank":
-        return _rolling_apply(children[0], w, _ts_rank)
-    if name == "Quantile":
-        q = params.get("q", 0.5)
-        return _rolling_apply(
-            children[0], w, lambda sx: np.nanquantile(sx, q, axis=1)
-        )
-    if name == "CountNaN":
-        return _rolling_apply(
-            children[0], w, lambda sx: np.sum(np.isnan(sx), axis=1).astype(np.float64)
-        )
-    if name == "CountNotNaN":
-        return _rolling_apply(
-            children[0], w, lambda sx: np.sum(~np.isnan(sx), axis=1).astype(np.float64)
-        )
-
-    # -- Time-series --------------------------------------------------------
-    if name == "Delta":
-        M, T = children[0].shape
-        out = np.full_like(children[0], np.nan, dtype=np.float64)
-        if w < T:
-            out[:, w:] = children[0][:, w:] - children[0][:, :-w]
-        return out
-    if name == "Delay":
-        M, T = children[0].shape
-        out = np.full_like(children[0], np.nan, dtype=np.float64)
-        if w < T:
-            out[:, w:] = children[0][:, :-w]
-        return out
-    if name == "Return":
-        M, T = children[0].shape
-        out = np.full_like(children[0], np.nan, dtype=np.float64)
-        if w < T:
-            prev = children[0][:, :-w]
-            out[:, w:] = _safe_div(children[0][:, w:] - prev, prev)
-        return out
-    if name == "LogReturn":
-        M, T = children[0].shape
-        out = np.full_like(children[0], np.nan, dtype=np.float64)
-        if w < T:
-            ratio = _safe_div(children[0][:, w:], np.where(np.abs(children[0][:, :-w]) > _EPS, children[0][:, :-w], 1.0))
-            out[:, w:] = np.log(np.abs(ratio) + _EPS)
-        return out
-    if name == "Corr":
-        return _rolling_apply(children[0], w, _ts_corr, binary_y=children[1])
-    if name == "Cov":
-        return _rolling_apply(children[0], w, _ts_cov, binary_y=children[1])
-    if name == "Beta":
-        return _rolling_apply(children[0], w, _ts_beta, binary_y=children[1])
-    if name == "Resid":
-        return _rolling_apply(children[0], w, _ts_resid, binary_y=children[1])
-    if name == "WMA":
-        return _wma(children[0], w)
-    if name == "Decay":
-        return _decay(children[0], w)
-    if name == "CumSum":
-        return np.nancumsum(children[0], axis=1)
-    if name == "CumProd":
-        return np.nancumprod(children[0], axis=1)
-    if name == "CumMax":
-        return np.maximum.accumulate(np.nan_to_num(children[0], nan=-np.inf), axis=1)
-    if name == "CumMin":
-        return np.minimum.accumulate(np.nan_to_num(children[0], nan=np.inf), axis=1)
-
-    # -- Smoothing ----------------------------------------------------------
-    if name == "EMA":
-        return _ema(children[0], w)
-    if name == "DEMA":
-        e1 = _ema(children[0], w)
-        e2 = _ema(e1, w)
-        return 2 * e1 - e2
-    if name == "SMA":
-        return _rolling_apply(children[0], w, _ts_mean)
-    if name == "KAMA":
-        return _ema(children[0], w)  # simplified
-    if name == "HMA":
-        half_w = max(w // 2, 1)
-        sqrt_w = max(int(math.sqrt(w)), 1)
-        wma_half = _wma(children[0], half_w)
-        wma_full = _wma(children[0], w)
-        # Fill leading NaN from the shorter window with the longer
-        diff = 2 * np.nan_to_num(wma_half) - np.nan_to_num(wma_full)
-        return _wma(diff, sqrt_w)
-
-    # -- Cross-sectional ----------------------------------------------------
-    if name == "CsRank":
-        return _cs_rank(children[0])
-    if name == "CsZScore":
-        return _cs_zscore(children[0])
-    if name == "CsDemean":
-        return _cs_demean(children[0])
-    if name == "CsScale":
-        return _cs_scale(children[0])
-    if name == "CsNeutralize":
-        return _cs_demean(children[0])  # simplified: industry-neutralize ≈ demean
-    if name == "CsQuantile":
-        n_bins = int(params.get("n_bins", 5))
-        ranked = _cs_rank(children[0])
-        return np.ceil(ranked * n_bins).clip(1, n_bins)
-
-    # -- Regression ---------------------------------------------------------
-    if name == "TsLinReg":
-        return _ts_linreg_fitted(children[0], w)
-    if name == "TsLinRegSlope":
-        return _ts_linreg_slope(children[0], w)
-    if name == "TsLinRegIntercept":
-        return _ts_linreg_intercept(children[0], w)
-    if name == "TsLinRegResid":
-        return _ts_linreg_resid(children[0], w)
-
-    # -- Logical / conditional ----------------------------------------------
-    if name == "IfElse":
-        cond, x_true, x_false = children
-        return np.where(cond > 0, x_true, x_false)
-    if name == "Greater":
-        return (children[0] > children[1]).astype(np.float64)
-    if name == "Less":
-        return (children[0] < children[1]).astype(np.float64)
-    if name == "Equal":
-        return (np.abs(children[0] - children[1]) < _EPS).astype(np.float64)
-    if name == "And":
-        return ((children[0] > 0) & (children[1] > 0)).astype(np.float64)
-    if name == "Or":
-        return ((children[0] > 0) | (children[1] > 0)).astype(np.float64)
-    if name == "Not":
-        return (children[0] <= 0).astype(np.float64)
-
-    raise NotImplementedError(f"Operator '{name}' has no evaluation implementation.")
-
-
-# ---------------------------------------------------------------------------
-# Expression tree wrapper
-# ---------------------------------------------------------------------------
-
-class ExpressionTree:
-    """Wrapper around a root ``Node`` providing a convenient API.
-
-    Parameters
-    ----------
-    root : Node
-        The root node of the tree.
-    """
-
-    __slots__ = ("root",)
-
-    def __init__(self, root: Node) -> None:
-        self.root = root
-
-    def to_string(self) -> str:
-        """Serialize the full tree to a DSL formula string."""
-        return self.root.to_string()
-
-    def depth(self) -> int:
-        """Return the depth of the tree."""
-        return self.root.depth()
-
-    def size(self) -> int:
-        """Return the total number of nodes."""
-        return self.root.size()
-
-    def evaluate(self, data: Dict[str, np.ndarray]) -> np.ndarray:
-        """Execute the formula on market data.
-
-        Parameters
-        ----------
-        data : dict[str, np.ndarray]
-            Maps feature names to arrays of shape ``(M, T)``.
-
-        Returns
-        -------
-        np.ndarray of shape ``(M, T)``
-        """
-        return self.root.evaluate(data)
-
-    def clone(self) -> "ExpressionTree":
-        """Return a deep copy of the tree."""
-        return ExpressionTree(self.root.clone())
-
-    def leaf_features(self) -> List[str]:
-        """Return sorted unique feature names referenced by this tree."""
-        return self.root.leaf_features()
-
-    def __repr__(self) -> str:
-        return f"ExpressionTree({self.to_string()})"
-
-    def __str__(self) -> str:
-        return self.to_string()
diff --git a/src/factorminer/factorminer/core/factor_library.py b/src/factorminer/factorminer/core/factor_library.py
deleted file mode 100644
index 635b22b..0000000
--- a/src/factorminer/factorminer/core/factor_library.py
+++ /dev/null
@@ -1,602 +0,0 @@
-"""Factor Library: maintains the growing collection of admitted alpha factors.
-
-Implements the admission rules from the paper (Eq. 10, 11):
-- Admission: IC(alpha) >= tau_IC AND max_{g in L} |rho(alpha, g)| < theta
-- Replacement: IC(alpha) >= 0.10 AND IC(alpha) >= 1.3 * IC(g) AND only 1 correlated factor
-
-The library tracks pairwise Spearman correlations and supports incremental
-updates as new factors are added or replaced.
-"""
-
-from __future__ import annotations
-
-import logging
-from collections import defaultdict
-from dataclasses import dataclass, field
-from datetime import datetime
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-from scipy.stats import spearmanr
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class Factor:
-    """A single admitted alpha factor."""
-
-    id: int
-    name: str
-    formula: str
-    category: str  # e.g., "VWAP", "Regime-switching", "Momentum", etc.
-    ic_mean: float  # Mean IC across evaluation period
-    icir: float  # IC Information Ratio
-    ic_win_rate: float  # Fraction of periods with positive IC
-    max_correlation: float  # Max |rho| with any other library factor at admission
-    batch_number: int  # Which mining batch admitted this factor
-    admission_date: str = ""
-    signals: Optional[np.ndarray] = field(default=None, repr=False)  # (M, T)
-    research_metrics: dict = field(default_factory=dict)
-    provenance: dict = field(default_factory=dict)
-
-    def __post_init__(self) -> None:
-        if not self.admission_date:
-            self.admission_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    def to_dict(self) -> dict:
-        """Serialize to a JSON-compatible dictionary (excludes signals)."""
-        return {
-            "id": self.id,
-            "name": self.name,
-            "formula": self.formula,
-            "category": self.category,
-            "ic_mean": self.ic_mean,
-            "icir": self.icir,
-            "ic_win_rate": self.ic_win_rate,
-            "max_correlation": self.max_correlation,
-            "batch_number": self.batch_number,
-            "admission_date": self.admission_date,
-            "research_metrics": self.research_metrics,
-            "provenance": self.provenance,
-        }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "Factor":
-        """Reconstruct a Factor from a dictionary."""
-        return cls(
-            id=d["id"],
-            name=d["name"],
-            formula=d["formula"],
-            category=d["category"],
-            ic_mean=d["ic_mean"],
-            icir=d["icir"],
-            ic_win_rate=d["ic_win_rate"],
-            max_correlation=d["max_correlation"],
-            batch_number=d["batch_number"],
-            admission_date=d.get("admission_date", ""),
-            research_metrics=d.get("research_metrics", {}),
-            provenance=d.get("provenance", {}),
-        )
-
-
-class FactorLibrary:
-    """The factor library L that maintains admitted alpha factors.
-
-    Parameters
-    ----------
-    correlation_threshold : float
-        Maximum allowed |rho| for admission (theta). Default 0.5 for A-shares.
-    ic_threshold : float
-        Minimum IC for admission (tau_IC). Default 0.04.
-    """
-
-    def __init__(
-        self,
-        correlation_threshold: float = 0.5,
-        ic_threshold: float = 0.04,
-    ) -> None:
-        self.factors: Dict[int, Factor] = {}
-        self.correlation_matrix: Optional[np.ndarray] = None  # Pairwise |rho|
-        self._next_id: int = 1
-        self.correlation_threshold = correlation_threshold
-        self.ic_threshold = ic_threshold
-        # Maps factor_id -> index in the correlation matrix
-        self._id_to_index: Dict[int, int] = {}
-
-    # ------------------------------------------------------------------
-    # Correlation computation
-    # ------------------------------------------------------------------
-
-    def compute_correlation(
-        self, signals_a: np.ndarray, signals_b: np.ndarray
-    ) -> float:
-        """Compute time-average cross-sectional Spearman correlation rho(alpha, beta).
-
-        rho(alpha, beta) = (1/|T|) * sum_t Corr_rank(s_t^(alpha), s_t^(beta))
-
-        Parameters
-        ----------
-        signals_a, signals_b : np.ndarray, shape (M, T)
-            Cross-sectional signal matrices.
-
-        Returns
-        -------
-        float
-            Mean absolute Spearman rank correlation across time steps.
-        """
-        if signals_a.shape != signals_b.shape:
-            raise ValueError(
-                f"Signal shapes must match: {signals_a.shape} vs {signals_b.shape}"
-            )
-        M, T = signals_a.shape
-        correlations = np.empty(T, dtype=np.float64)
-
-        for t in range(T):
-            col_a = signals_a[:, t]
-            col_b = signals_b[:, t]
-            # Mask NaNs from both columns
-            valid = ~(np.isnan(col_a) | np.isnan(col_b))
-            n_valid = valid.sum()
-            if n_valid < 3:
-                correlations[t] = np.nan
-                continue
-            rho, _ = spearmanr(col_a[valid], col_b[valid])
-            correlations[t] = rho
-
-        return float(np.nanmean(np.abs(correlations)))
-
-    def _compute_correlation_vectorized(
-        self, signals_a: np.ndarray, signals_b: np.ndarray
-    ) -> float:
-        """Faster vectorized Spearman correlation using rankdata.
-
-        For large M and T this avoids per-timestep Python loops by ranking
-        each column and computing Pearson on the ranks.
-        """
-        from scipy.stats import rankdata
-
-        M, T = signals_a.shape
-        # Mask invalid entries
-        mask = ~(np.isnan(signals_a) | np.isnan(signals_b))
-
-        corr_sum = 0.0
-        n_valid_t = 0
-        for t in range(T):
-            valid = mask[:, t]
-            n = valid.sum()
-            if n < 3:
-                continue
-            ra = rankdata(signals_a[valid, t])
-            rb = rankdata(signals_b[valid, t])
-            # Pearson on ranks == Spearman
-            ra_c = ra - ra.mean()
-            rb_c = rb - rb.mean()
-            denom = np.sqrt((ra_c ** 2).sum() * (rb_c ** 2).sum())
-            if denom < 1e-12:
-                continue
-            corr_sum += abs((ra_c * rb_c).sum() / denom)
-            n_valid_t += 1
-
-        if n_valid_t == 0:
-            return 0.0
-        return corr_sum / n_valid_t
-
-    # ------------------------------------------------------------------
-    # Admission and replacement
-    # ------------------------------------------------------------------
-
-    def check_admission(
-        self, candidate_ic: float, candidate_signals: np.ndarray
-    ) -> Tuple[bool, str]:
-        """Check if candidate passes admission criteria (Eq. 10).
-
-        Admission rule:
-            IC(alpha) >= tau_IC  AND  max_{g in L} |rho(alpha, g)| < theta
-
-        Parameters
-        ----------
-        candidate_ic : float
-            The candidate factor's mean IC.
-        candidate_signals : np.ndarray, shape (M, T)
-            The candidate's realized signals.
-
-        Returns
-        -------
-        (admitted, reason) : Tuple[bool, str]
-        """
-        if candidate_ic < self.ic_threshold:
-            return False, (
-                f"IC {candidate_ic:.4f} below threshold {self.ic_threshold}"
-            )
-
-        if self.size == 0:
-            return True, "First factor in library"
-
-        max_corr = self._max_correlation_with_library(candidate_signals)
-
-        if max_corr >= self.correlation_threshold:
-            return False, (
-                f"Max correlation {max_corr:.4f} >= threshold "
-                f"{self.correlation_threshold} with existing library factor"
-            )
-
-        return True, (
-            f"Admitted: IC={candidate_ic:.4f}, max_corr={max_corr:.4f}"
-        )
-
-    def check_replacement(
-        self,
-        candidate_ic: float,
-        candidate_signals: np.ndarray,
-        ic_min: float = 0.10,
-        ic_ratio: float = 1.3,
-    ) -> Tuple[bool, Optional[int], str]:
-        """Check replacement mechanism (Eq. 11).
-
-        Replacement rule:
-            IC(alpha) >= 0.10
-            AND IC(alpha) >= 1.3 * IC(g)
-            AND |{g in L : |rho(alpha, g)| > theta}| = 1
-
-        If exactly one library factor g is correlated above theta AND the
-        candidate's IC dominates g's IC by the required ratio, replace g.
-
-        Parameters
-        ----------
-        candidate_ic : float
-            The candidate's mean IC.
-        candidate_signals : np.ndarray, shape (M, T)
-            The candidate's realized signals.
-        ic_min : float
-            Absolute IC floor for replacement (default 0.10).
-        ic_ratio : float
-            Required IC ratio over the correlated factor (default 1.3).
-
-        Returns
-        -------
-        (should_replace, factor_to_replace_id, reason) : Tuple[bool, Optional[int], str]
-        """
-        if candidate_ic < ic_min:
-            return False, None, (
-                f"IC {candidate_ic:.4f} below replacement floor {ic_min}"
-            )
-
-        if self.size == 0:
-            return False, None, "Library is empty, use admission instead"
-
-        # Find all library factors correlated above theta
-        correlated_factors = []
-        for fid, factor in self.factors.items():
-            if factor.signals is None:
-                continue
-            corr = self._compute_correlation_vectorized(
-                candidate_signals, factor.signals
-            )
-            if corr >= self.correlation_threshold:
-                correlated_factors.append((fid, corr, factor.ic_mean))
-
-        if len(correlated_factors) != 1:
-            return False, None, (
-                f"Found {len(correlated_factors)} correlated factors "
-                f"(need exactly 1 for replacement)"
-            )
-
-        fid, corr, existing_ic = correlated_factors[0]
-        if candidate_ic < ic_ratio * existing_ic:
-            return False, None, (
-                f"IC {candidate_ic:.4f} < {ic_ratio} * {existing_ic:.4f} = "
-                f"{ic_ratio * existing_ic:.4f}"
-            )
-
-        return True, fid, (
-            f"Replace factor {fid}: candidate IC {candidate_ic:.4f} > "
-            f"{ic_ratio} * {existing_ic:.4f}, corr={corr:.4f}"
-        )
-
-    # ------------------------------------------------------------------
-    # Library mutations
-    # ------------------------------------------------------------------
-
-    def admit_factor(self, factor: Factor) -> int:
-        """Add a factor to the library and update the correlation matrix.
-
-        Parameters
-        ----------
-        factor : Factor
-            The factor to add. Its ``id`` field is overwritten with the
-            next available ID.
-
-        Returns
-        -------
-        int
-            The assigned factor ID.
-        """
-        factor.id = self._next_id
-        self._next_id += 1
-        self.factors[factor.id] = factor
-
-        # Update correlation matrix incrementally
-        self._extend_correlation_matrix(factor)
-
-        logger.info(
-            "Admitted factor %d '%s' (IC=%.4f, max_corr=%.4f, category=%s)",
-            factor.id, factor.name, factor.ic_mean,
-            factor.max_correlation, factor.category,
-        )
-        return factor.id
-
-    def replace_factor(self, old_id: int, new_factor: Factor) -> None:
-        """Replace an existing factor with a better one.
-
-        The new factor takes the old factor's position in the correlation
-        matrix and receives a fresh ID.
-
-        Parameters
-        ----------
-        old_id : int
-            ID of the factor being replaced.
-        new_factor : Factor
-            The replacement factor.
-        """
-        if old_id not in self.factors:
-            raise KeyError(f"Factor {old_id} not in library")
-
-        old_factor = self.factors[old_id]
-        new_factor.id = self._next_id
-        self._next_id += 1
-
-        # Remove old factor and reuse its matrix slot
-        old_index = self._id_to_index.pop(old_id)
-        del self.factors[old_id]
-
-        # Insert new factor at the same index
-        self.factors[new_factor.id] = new_factor
-        self._id_to_index[new_factor.id] = old_index
-
-        # Recompute the row/column for this index
-        if self.correlation_matrix is not None and new_factor.signals is not None:
-            self._recompute_matrix_slot(old_index, new_factor)
-
-        logger.info(
-            "Replaced factor %d with %d '%s' (IC=%.4f)",
-            old_id, new_factor.id, new_factor.name, new_factor.ic_mean,
-        )
-
-    def remove_factor(self, factor_id: int) -> None:
-        """Remove a factor from the library and rebuild correlation state."""
-        if factor_id not in self.factors:
-            raise KeyError(f"Factor {factor_id} not in library")
-
-        removed = self.factors.pop(factor_id)
-        self.update_correlation_matrix()
-
-        logger.info(
-            "Removed factor %d '%s' from library",
-            factor_id,
-            removed.name,
-        )
-
-    # ------------------------------------------------------------------
-    # Correlation matrix management
-    # ------------------------------------------------------------------
-
-    def _max_correlation_with_library(
-        self, candidate_signals: np.ndarray
-    ) -> float:
-        """Compute max |rho| between candidate and all library factors."""
-        max_corr = 0.0
-        for factor in self.factors.values():
-            if factor.signals is None:
-                continue
-            corr = self._compute_correlation_vectorized(
-                candidate_signals, factor.signals
-            )
-            max_corr = max(max_corr, corr)
-        return max_corr
-
-    def _extend_correlation_matrix(self, new_factor: Factor) -> None:
-        """Extend the correlation matrix by one row/column for the new factor."""
-        n = len(self._id_to_index)
-        new_index = n
-        self._id_to_index[new_factor.id] = new_index
-
-        if new_factor.signals is None:
-            # No signals to correlate; expand with zeros
-            if self.correlation_matrix is None:
-                self.correlation_matrix = np.zeros((1, 1), dtype=np.float64)
-            else:
-                new_size = new_index + 1
-                new_mat = np.zeros((new_size, new_size), dtype=np.float64)
-                new_mat[:new_index, :new_index] = self.correlation_matrix
-                self.correlation_matrix = new_mat
-            return
-
-        # Build a new (n+1) x (n+1) matrix
-        new_size = new_index + 1
-        new_mat = np.zeros((new_size, new_size), dtype=np.float64)
-
-        if self.correlation_matrix is not None and self.correlation_matrix.size > 0:
-            new_mat[:new_index, :new_index] = self.correlation_matrix
-
-        # Compute correlations with all existing factors
-        index_to_id = {idx: fid for fid, idx in self._id_to_index.items()}
-        for idx in range(new_index):
-            fid = index_to_id.get(idx)
-            if fid is None:
-                continue
-            other = self.factors.get(fid)
-            if other is None or other.signals is None:
-                continue
-            corr = self._compute_correlation_vectorized(
-                new_factor.signals, other.signals
-            )
-            new_mat[new_index, idx] = corr
-            new_mat[idx, new_index] = corr
-
-        self.correlation_matrix = new_mat
-
-    def _recompute_matrix_slot(self, idx: int, factor: Factor) -> None:
-        """Recompute one row/column of the correlation matrix after replacement."""
-        n = self.correlation_matrix.shape[0]
-        index_to_id = {i: fid for fid, i in self._id_to_index.items()}
-
-        for other_idx in range(n):
-            if other_idx == idx:
-                self.correlation_matrix[idx, idx] = 0.0
-                continue
-            other_fid = index_to_id.get(other_idx)
-            if other_fid is None:
-                continue
-            other = self.factors.get(other_fid)
-            if other is None or other.signals is None:
-                self.correlation_matrix[idx, other_idx] = 0.0
-                self.correlation_matrix[other_idx, idx] = 0.0
-                continue
-            corr = self._compute_correlation_vectorized(
-                factor.signals, other.signals
-            )
-            self.correlation_matrix[idx, other_idx] = corr
-            self.correlation_matrix[other_idx, idx] = corr
-
-    def update_correlation_matrix(self) -> None:
-        """Recompute the full pairwise correlation matrix from scratch.
-
-        This is O(n^2) in the number of library factors and should only be
-        called when the incremental updates may have drifted or after bulk
-        operations.
-        """
-        ids = sorted(self.factors.keys())
-        n = len(ids)
-        if n == 0:
-            self.correlation_matrix = None
-            self._id_to_index.clear()
-            return
-
-        self._id_to_index = {fid: i for i, fid in enumerate(ids)}
-        mat = np.zeros((n, n), dtype=np.float64)
-
-        factors_list = [self.factors[fid] for fid in ids]
-        for i in range(n):
-            for j in range(i + 1, n):
-                fi, fj = factors_list[i], factors_list[j]
-                if fi.signals is None or fj.signals is None:
-                    continue
-                corr = self._compute_correlation_vectorized(fi.signals, fj.signals)
-                mat[i, j] = corr
-                mat[j, i] = corr
-
-        self.correlation_matrix = mat
-
-    # ------------------------------------------------------------------
-    # Queries and diagnostics
-    # ------------------------------------------------------------------
-
-    @property
-    def size(self) -> int:
-        """Number of factors currently in the library."""
-        return len(self.factors)
-
-    def get_factor(self, factor_id: int) -> Factor:
-        """Retrieve a factor by ID."""
-        if factor_id not in self.factors:
-            raise KeyError(f"Factor {factor_id} not in library")
-        return self.factors[factor_id]
-
-    def list_factors(self) -> List[Factor]:
-        """Return all factors sorted by ID."""
-        return [self.factors[k] for k in sorted(self.factors)]
-
-    def get_factors_by_category(self, category: str) -> List[Factor]:
-        """Return all factors matching a given category."""
-        return [
-            f for f in self.factors.values()
-            if f.category == category
-        ]
-
-    def get_diagnostics(self) -> dict:
-        """Library diagnostics: avg |rho|, max tail correlations, per-category counts, saturation.
-
-        Returns
-        -------
-        dict with keys:
-            - size: int
-            - avg_correlation: float (average off-diagonal |rho|)
-            - max_correlation: float (maximum off-diagonal |rho|)
-            - p95_correlation: float (95th percentile off-diagonal |rho|)
-            - category_counts: dict[str, int]
-            - category_avg_ic: dict[str, float]
-            - saturation: float (fraction of max correlation slots above 0.3)
-        """
-        diag: dict = {"size": self.size}
-
-        # Category breakdown
-        cat_counts: Dict[str, int] = defaultdict(int)
-        cat_ic_sums: Dict[str, float] = defaultdict(float)
-        for f in self.factors.values():
-            cat_counts[f.category] += 1
-            cat_ic_sums[f.category] += f.ic_mean
-
-        diag["category_counts"] = dict(cat_counts)
-        diag["category_avg_ic"] = {
-            cat: cat_ic_sums[cat] / cat_counts[cat]
-            for cat in cat_counts
-        }
-
-        # Correlation statistics
-        if self.correlation_matrix is not None and self.size > 1:
-            n = self.correlation_matrix.shape[0]
-            # Extract upper triangle (off-diagonal)
-            triu_idx = np.triu_indices(n, k=1)
-            off_diag = self.correlation_matrix[triu_idx]
-            valid = off_diag[~np.isnan(off_diag)]
-
-            if len(valid) > 0:
-                diag["avg_correlation"] = float(np.mean(valid))
-                diag["max_correlation"] = float(np.max(valid))
-                diag["p95_correlation"] = float(np.percentile(valid, 95))
-                diag["saturation"] = float(np.mean(valid > 0.3))
-            else:
-                diag["avg_correlation"] = 0.0
-                diag["max_correlation"] = 0.0
-                diag["p95_correlation"] = 0.0
-                diag["saturation"] = 0.0
-        else:
-            diag["avg_correlation"] = 0.0
-            diag["max_correlation"] = 0.0
-            diag["p95_correlation"] = 0.0
-            diag["saturation"] = 0.0
-
-        return diag
-
-    def get_state_summary(self) -> dict:
-        """Summary for memory retrieval: size, categories, recent admissions.
-
-        Returns a lightweight dictionary suitable for inclusion in LLM prompts
-        or memory store entries.
-        """
-        factors_sorted = sorted(
-            self.factors.values(), key=lambda f: f.id, reverse=True
-        )
-        recent = factors_sorted[:5]  # Last 5 admissions
-
-        categories = defaultdict(int)
-        for f in self.factors.values():
-            categories[f.category] += 1
-
-        return {
-            "library_size": self.size,
-            "categories": dict(categories),
-            "recent_admissions": [
-                {
-                    "id": f.id,
-                    "name": f.name,
-                    "category": f.category,
-                    "ic_mean": f.ic_mean,
-                    "batch": f.batch_number,
-                }
-                for f in recent
-            ],
-            "correlation_threshold": self.correlation_threshold,
-            "ic_threshold": self.ic_threshold,
-        }
diff --git a/src/factorminer/factorminer/core/helix_loop.py b/src/factorminer/factorminer/core/helix_loop.py
deleted file mode 100644
index 7671661..0000000
--- a/src/factorminer/factorminer/core/helix_loop.py
+++ /dev/null
@@ -1,1576 +0,0 @@
-"""The Helix Loop: 5-stage self-evolving factor discovery with Phase 2 extensions.
-
-Extends the base Ralph Loop with:
-  1. RETRIEVE  -- KG + embeddings + flat memory hybrid retrieval
-  2. PROPOSE   -- Multi-agent debate (specialists + critic) or standard generation
-  3. SYNTHESIZE -- SymPy canonicalization to eliminate mathematical duplicates
-  4. VALIDATE  -- Standard pipeline + causal + regime + capacity + significance
-  5. DISTILL   -- Standard memory evolution + KG update + online forgetting
-
-All Phase 2 components are optional: when none are enabled the Helix Loop
-behaves identically to the Ralph Loop and is a full drop-in replacement.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import re
-import time
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-import numpy as np
-
-from src.factorminer.factorminer.core.ralph_loop import (
-    BudgetTracker,
-    EvaluationResult,
-    FactorGenerator,
-    RalphLoop,
-    ValidationPipeline,
-)
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.core.parser import try_parse
-from src.factorminer.factorminer.evaluation.metrics import compute_ic
-from src.factorminer.factorminer.memory.memory_store import ExperienceMemory
-from src.factorminer.factorminer.memory.retrieval import retrieve_memory
-from src.factorminer.factorminer.memory.formation import form_memory
-from src.factorminer.factorminer.memory.evolution import evolve_memory
-from src.factorminer.factorminer.agent.llm_interface import LLMProvider
-from src.factorminer.factorminer.utils.logging import IterationRecord, FactorRecord
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Optional imports -- resolved at call time with graceful fallback
-# ---------------------------------------------------------------------------
-
-def _try_import_debate():
-    try:
-        from factorminer.agent.debate import DebateGenerator, DebateConfig
-        return DebateGenerator, DebateConfig
-    except ImportError:
-        return None, None
-
-
-def _try_import_canonicalizer():
-    try:
-        from factorminer.core.canonicalizer import FormulaCanonicalizer
-        return FormulaCanonicalizer
-    except ImportError:
-        return None
-
-
-def _try_import_causal():
-    try:
-        from factorminer.evaluation.causal import CausalValidator, CausalConfig
-        return CausalValidator, CausalConfig
-    except ImportError:
-        return None, None
-
-
-def _try_import_regime():
-    try:
-        from factorminer.evaluation.regime import (
-            RegimeDetector,
-            RegimeAwareEvaluator,
-            RegimeConfig,
-        )
-        return RegimeDetector, RegimeAwareEvaluator, RegimeConfig
-    except ImportError:
-        return None, None, None
-
-
-def _try_import_capacity():
-    try:
-        from factorminer.evaluation.capacity import CapacityEstimator, CapacityConfig
-        return CapacityEstimator, CapacityConfig
-    except ImportError:
-        return None, None
-
-
-def _try_import_significance():
-    try:
-        from factorminer.evaluation.significance import (
-            BootstrapICTester,
-            FDRController,
-            DeflatedSharpeCalculator,
-            SignificanceConfig,
-        )
-        return BootstrapICTester, FDRController, DeflatedSharpeCalculator, SignificanceConfig
-    except ImportError:
-        return None, None, None, None
-
-
-def _try_import_kg():
-    try:
-        from factorminer.memory.knowledge_graph import FactorKnowledgeGraph, FactorNode
-        return FactorKnowledgeGraph, FactorNode
-    except ImportError:
-        return None, None
-
-
-def _try_import_kg_retrieval():
-    try:
-        from factorminer.memory.kg_retrieval import retrieve_memory_enhanced
-        return retrieve_memory_enhanced
-    except ImportError:
-        return None
-
-
-def _try_import_embedder():
-    try:
-        from factorminer.memory.embeddings import FormulaEmbedder
-        return FormulaEmbedder
-    except ImportError:
-        return None
-
-
-def _try_import_auto_inventor():
-    try:
-        from factorminer.operators.auto_inventor import OperatorInventor
-        return OperatorInventor
-    except ImportError:
-        return None
-
-
-def _try_import_custom_store():
-    try:
-        from factorminer.operators.custom import CustomOperatorStore
-        return CustomOperatorStore
-    except ImportError:
-        return None
-
-
-# ---------------------------------------------------------------------------
-# HelixLoop
-# ---------------------------------------------------------------------------
-
-class HelixLoop(RalphLoop):
-    """Enhanced 5-stage Helix Loop for self-evolving factor discovery.
-
-    Extends the Ralph Loop with:
-    1. RETRIEVE: KG + embeddings + flat memory hybrid retrieval
-    2. PROPOSE: Multi-agent debate (specialists + critic) or standard generation
-    3. SYNTHESIZE: SymPy canonicalization to eliminate mathematical duplicates
-    4. VALIDATE: Standard pipeline + causal + regime + capacity + significance
-    5. DISTILL: Standard memory evolution + KG update + online forgetting
-
-    All Phase 2 components are optional and default to off. When none are
-    enabled, the Helix Loop behaves identically to the Ralph Loop.
-
-    Parameters
-    ----------
-    config : Any
-        Mining configuration object.
-    data_tensor : np.ndarray
-        Market data tensor D in R^(M x T x F).
-    returns : np.ndarray
-        Forward returns array R in R^(M x T).
-    llm_provider : LLMProvider, optional
-        LLM provider for factor generation.
-    memory : ExperienceMemory, optional
-        Pre-populated experience memory.
-    library : FactorLibrary, optional
-        Pre-populated factor library.
-    debate_config : DebateConfig, optional
-        Configuration for multi-agent debate generation.
-        When provided, replaces standard FactorGenerator.
-    enable_knowledge_graph : bool
-        Enable factor knowledge graph for lineage and structural analysis.
-    enable_embeddings : bool
-        Enable semantic formula embeddings for similarity search.
-    enable_auto_inventor : bool
-        Enable periodic auto-invention of new operators.
-    auto_invention_interval : int
-        Run auto-invention every N iterations.
-    canonicalize : bool
-        Enable SymPy-based formula canonicalization for deduplication.
-    forgetting_lambda : float
-        Exponential decay factor for online forgetting (0-1).
-    causal_config : CausalConfig, optional
-        Configuration for causal validation (Granger + intervention).
-    regime_config : RegimeConfig, optional
-        Configuration for regime-aware IC evaluation.
-    capacity_config : CapacityConfig, optional
-        Configuration for capacity-aware cost evaluation.
-    significance_config : SignificanceConfig, optional
-        Configuration for bootstrap CI + FDR + deflated Sharpe.
-    volume : np.ndarray, optional
-        Dollar volume array (M, T) required for capacity estimation.
-    """
-
-    def __init__(
-        self,
-        config: Any,
-        data_tensor: np.ndarray,
-        returns: np.ndarray,
-        llm_provider: Optional[LLMProvider] = None,
-        memory: Optional[ExperienceMemory] = None,
-        library: Optional[FactorLibrary] = None,
-        # Phase 2 extensions
-        debate_config: Optional[Any] = None,
-        enable_knowledge_graph: bool = False,
-        enable_embeddings: bool = False,
-        enable_auto_inventor: bool = False,
-        auto_invention_interval: int = 10,
-        canonicalize: bool = True,
-        forgetting_lambda: float = 0.95,
-        causal_config: Optional[Any] = None,
-        regime_config: Optional[Any] = None,
-        capacity_config: Optional[Any] = None,
-        significance_config: Optional[Any] = None,
-        volume: Optional[np.ndarray] = None,
-    ) -> None:
-        # Initialize base RalphLoop
-        super().__init__(
-            config=config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=llm_provider,
-            memory=memory,
-            library=library,
-        )
-
-        # Store Phase 2 configuration
-        self._debate_config = debate_config
-        self._enable_kg = enable_knowledge_graph
-        self._enable_embeddings = enable_embeddings
-        self._enable_auto_inventor = enable_auto_inventor
-        self._auto_invention_interval = auto_invention_interval
-        self._canonicalize = canonicalize
-        self._forgetting_lambda = forgetting_lambda
-        self._causal_config = causal_config
-        self._regime_config = regime_config
-        self._capacity_config = capacity_config
-        self._significance_config = significance_config
-        self._volume = volume
-
-        # Track iterations without admissions for forgetting
-        self._no_admission_streak: int = 0
-
-        # Initialize Phase 2 components
-        self._debate_generator: Optional[Any] = None
-        self._canonicalizer: Optional[Any] = None
-        self._causal_validator: Optional[Any] = None
-        self._regime_detector: Optional[Any] = None
-        self._regime_evaluator: Optional[Any] = None
-        self._regime_classification: Optional[Any] = None
-        self._capacity_estimator: Optional[Any] = None
-        self._bootstrap_tester: Optional[Any] = None
-        self._fdr_controller: Optional[Any] = None
-        self._kg: Optional[Any] = None
-        self._embedder: Optional[Any] = None
-        self._auto_inventor: Optional[Any] = None
-        self._custom_op_store: Optional[Any] = None
-
-        self._init_phase2_components(llm_provider)
-
-    # ------------------------------------------------------------------
-    # Phase 2 component initialization
-    # ------------------------------------------------------------------
-
-    def _init_phase2_components(self, llm_provider: Optional[LLMProvider]) -> None:
-        """Initialize all Phase 2 components based on configuration."""
-
-        # -- Debate generator --
-        if self._debate_config is not None:
-            DebateGeneratorCls, _ = _try_import_debate()
-            if DebateGeneratorCls is not None:
-                try:
-                    self._debate_generator = DebateGeneratorCls(
-                        llm_provider=llm_provider or self.generator.llm,
-                        debate_config=self._debate_config,
-                    )
-                    logger.info("Helix: multi-agent debate generator enabled")
-                except Exception as exc:
-                    logger.warning("Helix: failed to init debate generator: %s", exc)
-            else:
-                logger.warning(
-                    "Helix: debate_config provided but debate module unavailable"
-                )
-
-        # -- Canonicalizer --
-        if self._canonicalize:
-            FormulaCanonCls = _try_import_canonicalizer()
-            if FormulaCanonCls is not None:
-                try:
-                    self._canonicalizer = FormulaCanonCls()
-                    logger.info("Helix: SymPy canonicalization enabled")
-                except Exception as exc:
-                    logger.warning("Helix: failed to init canonicalizer: %s", exc)
-            else:
-                logger.warning(
-                    "Helix: canonicalize=True but sympy/canonicalizer unavailable"
-                )
-
-        # -- Causal validator --
-        if self._causal_config is not None:
-            CausalValidatorCls, _ = _try_import_causal()
-            if CausalValidatorCls is not None:
-                logger.info("Helix: causal validation enabled")
-            else:
-                logger.warning(
-                    "Helix: causal_config provided but causal module unavailable"
-                )
-
-        # -- Regime evaluator --
-        if self._regime_config is not None:
-            RegimeDetectorCls, RegimeEvalCls, _ = _try_import_regime()
-            if RegimeDetectorCls is not None and RegimeEvalCls is not None:
-                try:
-                    self._regime_detector = RegimeDetectorCls(self._regime_config)
-                    self._regime_classification = self._regime_detector.classify(
-                        self.returns
-                    )
-                    self._regime_evaluator = RegimeEvalCls(
-                        returns=self.returns,
-                        regime=self._regime_classification,
-                        config=self._regime_config,
-                    )
-                    logger.info("Helix: regime-aware evaluation enabled")
-                except Exception as exc:
-                    logger.warning("Helix: failed to init regime evaluator: %s", exc)
-            else:
-                logger.warning(
-                    "Helix: regime_config provided but regime module unavailable"
-                )
-
-        # -- Capacity estimator --
-        if self._capacity_config is not None:
-            CapacityEstCls, _ = _try_import_capacity()
-            if CapacityEstCls is not None:
-                if self._volume is not None:
-                    try:
-                        self._capacity_estimator = CapacityEstCls(
-                            returns=self.returns,
-                            volume=self._volume,
-                            config=self._capacity_config,
-                        )
-                        logger.info("Helix: capacity-aware evaluation enabled")
-                    except Exception as exc:
-                        logger.warning(
-                            "Helix: failed to init capacity estimator: %s", exc
-                        )
-                else:
-                    logger.warning(
-                        "Helix: capacity_config provided but no volume data supplied"
-                    )
-            else:
-                logger.warning(
-                    "Helix: capacity_config provided but capacity module unavailable"
-                )
-
-        # -- Significance testing --
-        if self._significance_config is not None:
-            BootstrapCls, FDRCls, _, _ = _try_import_significance()
-            if BootstrapCls is not None and FDRCls is not None:
-                try:
-                    self._bootstrap_tester = BootstrapCls(self._significance_config)
-                    self._fdr_controller = FDRCls(self._significance_config)
-                    logger.info("Helix: significance testing enabled")
-                except Exception as exc:
-                    logger.warning(
-                        "Helix: failed to init significance testing: %s", exc
-                    )
-            else:
-                logger.warning(
-                    "Helix: significance_config provided but significance module unavailable"
-                )
-
-        # -- Knowledge graph --
-        if self._enable_kg:
-            KGCls, _ = _try_import_kg()
-            if KGCls is not None:
-                try:
-                    self._kg = KGCls()
-                    logger.info("Helix: knowledge graph enabled")
-                except Exception as exc:
-                    logger.warning("Helix: failed to init knowledge graph: %s", exc)
-            else:
-                logger.warning(
-                    "Helix: enable_knowledge_graph=True but knowledge_graph module unavailable"
-                )
-
-        # -- Embedder --
-        if self._enable_embeddings:
-            EmbedderCls = _try_import_embedder()
-            if EmbedderCls is not None:
-                try:
-                    self._embedder = EmbedderCls()
-                    self._prime_embedder_from_library()
-                    logger.info("Helix: formula embeddings enabled")
-                except Exception as exc:
-                    logger.warning("Helix: failed to init embedder: %s", exc)
-            else:
-                logger.warning(
-                    "Helix: enable_embeddings=True but embeddings module unavailable"
-                )
-
-        # -- Auto inventor --
-        if self._enable_auto_inventor:
-            InventorCls = _try_import_auto_inventor()
-            CustomStoreCls = _try_import_custom_store()
-            if InventorCls is not None:
-                try:
-                    self._auto_inventor = InventorCls(
-                        llm_provider=llm_provider or self.generator.llm,
-                        data_tensor=self.data_tensor,
-                        returns=self.returns,
-                    )
-                    logger.info("Helix: auto operator invention enabled")
-                except Exception as exc:
-                    logger.warning("Helix: failed to init auto inventor: %s", exc)
-
-            if CustomStoreCls is not None:
-                output_dir = getattr(self.config, "output_dir", "./output")
-                try:
-                    self._custom_op_store = CustomStoreCls(
-                        store_dir=str(Path(output_dir) / "custom_operators")
-                    )
-                    logger.info("Helix: custom operator store enabled")
-                except Exception as exc:
-                    logger.warning(
-                        "Helix: failed to init custom operator store: %s", exc
-                    )
-            else:
-                logger.warning(
-                    "Helix: enable_auto_inventor=True but custom operator store unavailable"
-                )
-
-    # ------------------------------------------------------------------
-    # Override: _run_iteration with 5-stage Helix flow
-    # ------------------------------------------------------------------
-
-    def _run_iteration(self, batch_size: int) -> Dict[str, Any]:
-        """Execute one iteration of the 5-stage Helix Loop.
-
-        Stages:
-          1. RETRIEVE  -- enhanced memory retrieval (KG + embeddings + flat)
-          2. PROPOSE   -- debate or standard factor generation
-          3. SYNTHESIZE -- canonicalize and deduplicate candidates
-          4. VALIDATE  -- standard pipeline + causal + regime + capacity + significance
-          5. DISTILL   -- memory evolution + KG update + forgetting
-
-        Returns
-        -------
-        dict
-            Iteration statistics.
-        """
-        t0 = time.time()
-        helix_stats: Dict[str, Any] = {}
-
-        # ==================================================================
-        # Stage 1: RETRIEVE
-        # ==================================================================
-        library_state = self.library.get_state_summary()
-        memory_signal = self._helix_retrieve(library_state)
-
-        # ==================================================================
-        # Stage 2: PROPOSE
-        # ==================================================================
-        t_gen = time.time()
-        candidates = self._helix_propose(memory_signal, library_state, batch_size)
-        self.budget.record_llm_call()
-
-        if not candidates:
-            logger.warning(
-                "Helix iteration %d: generator produced 0 candidates",
-                self.iteration,
-            )
-            return self._empty_stats()
-
-        helix_stats["candidates_before_canon"] = len(candidates)
-
-        # ==================================================================
-        # Stage 3: SYNTHESIZE (canonicalize + dedup)
-        # ==================================================================
-        candidates, n_canon_dupes, n_semantic_dupes = self._canonicalize_and_dedup(candidates)
-        helix_stats["canonical_duplicates_removed"] = n_canon_dupes
-        helix_stats["semantic_duplicates_removed"] = n_semantic_dupes
-
-        if not candidates:
-            logger.warning(
-                "Helix iteration %d: all candidates removed by canonicalization",
-                self.iteration,
-            )
-            return self._empty_stats()
-
-        # ==================================================================
-        # Stage 4: VALIDATE
-        # ==================================================================
-        results = self.pipeline.evaluate_batch(candidates)
-        admitted_results = self._update_library(results)
-
-        # Phase 2 extended validation on admitted candidates
-        rejected_by_phase2 = self._helix_validate(results, admitted_results)
-        helix_stats["phase2_rejections"] = rejected_by_phase2
-        surviving_admissions = [r for r in admitted_results if r.admitted]
-
-        provenance_library_state = {
-            **library_state,
-            "diagnostics": self.library.get_diagnostics(),
-        }
-
-        self._attach_factor_provenance(
-            surviving_admissions,
-            library_state=provenance_library_state,
-            memory_signal=memory_signal,
-            phase2_summary={
-                "enabled_features": self._phase2_features(),
-                "phase2_rejections": rejected_by_phase2,
-            },
-            generator_family=self._generator_family(),
-        )
-
-        # ==================================================================
-        # Stage 5: DISTILL
-        # ==================================================================
-        trajectory = self._build_trajectory(results)
-        formed = form_memory(self.memory, trajectory, self.iteration)
-        self.memory = evolve_memory(self.memory, formed)
-
-        # KG + embeddings + forgetting
-        self._helix_distill(results, admitted_results)
-
-        # Auto-invention check
-        if (
-            self._auto_inventor is not None
-            and self.iteration % self._auto_invention_interval == 0
-        ):
-            self._run_auto_invention()
-
-        # Build stats
-        elapsed = time.time() - t0
-        self.budget.record_compute(elapsed)
-        stats = self._compute_stats(results, surviving_admissions, elapsed)
-        stats.update(helix_stats)
-        stats["iteration"] = self.iteration
-
-        # Log to reporter and session logger
-        self.reporter.log_batch(**stats)
-        if self._session_logger:
-            ic_values = [r.ic_mean for r in results if r.parse_ok]
-            record = IterationRecord(
-                iteration=self.iteration,
-                candidates_generated=len(candidates) + n_canon_dupes + n_semantic_dupes,
-                ic_passed=stats["ic_passed"],
-                correlation_passed=stats["corr_passed"],
-                admitted=stats["admitted"],
-                rejected=len(candidates) + n_canon_dupes + n_semantic_dupes - stats["admitted"],
-                replaced=stats["replaced"],
-                library_size=self.library.size,
-                best_ic=max(ic_values) if ic_values else 0.0,
-                mean_ic=float(np.mean(ic_values)) if ic_values else 0.0,
-                elapsed_seconds=elapsed,
-            )
-            self._session_logger.log_iteration(record)
-
-            for r in results:
-                factor_rec = FactorRecord(
-                    expression=r.formula,
-                    ic=r.ic_mean if r.parse_ok else None,
-                    icir=r.icir if r.parse_ok else None,
-                    max_correlation=r.max_correlation if r.parse_ok else None,
-                    admitted=r.admitted,
-                    rejection_reason=r.rejection_reason or None,
-                    replaced_factor=str(r.replaced) if r.replaced else None,
-                )
-                self._session_logger.log_factor(factor_rec)
-
-        return stats
-
-    # ------------------------------------------------------------------
-    # Stage 1: Enhanced retrieval
-    # ------------------------------------------------------------------
-
-    def _helix_retrieve(
-        self, library_state: Dict[str, Any]
-    ) -> Dict[str, Any]:
-        """Stage 1 RETRIEVE: KG + embeddings + flat memory hybrid retrieval.
-
-        Falls back to standard retrieve_memory if no KG/embedder is available.
-        """
-        retrieve_enhanced_fn = _try_import_kg_retrieval()
-
-        if retrieve_enhanced_fn is not None and (
-            self._kg is not None or self._embedder is not None
-        ):
-            try:
-                return retrieve_enhanced_fn(
-                    memory=self.memory,
-                    library_state=library_state,
-                    kg=self._kg,
-                    embedder=self._embedder,
-                )
-            except Exception as exc:
-                logger.warning(
-                    "Helix: enhanced retrieval failed, falling back: %s", exc
-                )
-
-        return retrieve_memory(self.memory, library_state=library_state)
-
-    # ------------------------------------------------------------------
-    # Stage 2: Debate or standard proposal
-    # ------------------------------------------------------------------
-
-    def _helix_propose(
-        self,
-        memory_signal: Dict[str, Any],
-        library_state: Dict[str, Any],
-        batch_size: int,
-    ) -> List[Tuple[str, str]]:
-        """Stage 2 PROPOSE: Use debate generator or standard generator.
-
-        Returns list of (name, formula) tuples compatible with the
-        validation pipeline.
-        """
-        if self._debate_generator is not None:
-            try:
-                debate_candidates = self._debate_generator.generate_batch(
-                    memory_signal=memory_signal,
-                    library_state=library_state,
-                    batch_size=batch_size,
-                )
-                # Convert CandidateFactor objects to (name, formula) tuples
-                tuples: List[Tuple[str, str]] = []
-                for c in debate_candidates:
-                    tuples.append((c.name, c.formula))
-                if tuples:
-                    logger.info(
-                        "Helix: debate generator produced %d candidates",
-                        len(tuples),
-                    )
-                    return tuples
-                logger.warning(
-                    "Helix: debate generator returned 0 candidates, "
-                    "falling back to standard generator"
-                )
-            except Exception as exc:
-                logger.warning(
-                    "Helix: debate generation failed, falling back: %s", exc
-                )
-
-        # Standard generation
-        return self.generator.generate_batch(
-            memory_signal=memory_signal,
-            library_state=library_state,
-            batch_size=batch_size,
-        )
-
-    # ------------------------------------------------------------------
-    # Stage 3: Canonicalization + deduplication
-    # ------------------------------------------------------------------
-
-    def _canonicalize_and_dedup(
-        self, candidates: List[Tuple[str, str]]
-    ) -> Tuple[List[Tuple[str, str]], int, int]:
-        """Stage 3 SYNTHESIZE: Remove mathematically equivalent candidates.
-
-        Uses SymPy-based canonicalization to detect algebraic duplicates
-        before evaluation, saving compute.
-
-        Returns
-        -------
-        tuple of (deduplicated_candidates, n_canonical_duplicates_removed,
-        n_semantic_duplicates_removed)
-        """
-        if self._canonicalizer is None and self._embedder is None:
-            return candidates, 0, 0
-
-        seen_hashes: Dict[str, str] = {}  # hash -> first factor name
-        unique: List[Tuple[str, str]] = []
-        n_canon_dupes = 0
-        n_semantic_dupes = 0
-
-        for name, formula in candidates:
-            tree = try_parse(formula)
-            if tree is not None and self._canonicalizer is not None:
-                try:
-                    canon_hash = self._canonicalizer.canonicalize(tree)
-                except Exception as exc:
-                    logger.debug(
-                        "Helix: canonicalization failed for '%s': %s", name, exc
-                    )
-                else:
-                    if canon_hash in seen_hashes:
-                        n_canon_dupes += 1
-                        logger.debug(
-                            "Helix: canonical duplicate '%s' matches '%s'",
-                            name,
-                            seen_hashes[canon_hash],
-                        )
-                        continue
-                    seen_hashes[canon_hash] = name
-
-            semantic_match = self._semantic_duplicate_target(formula)
-            if semantic_match is not None:
-                n_semantic_dupes += 1
-                logger.debug(
-                    "Helix: semantic duplicate '%s' matches library factor '%s'",
-                    name,
-                    semantic_match,
-                )
-                continue
-
-            unique.append((name, formula))
-
-        if n_canon_dupes > 0:
-            logger.info(
-                "Helix: canonicalization removed %d/%d duplicate candidates",
-                n_canon_dupes,
-                len(candidates),
-            )
-
-        if n_semantic_dupes > 0:
-            logger.info(
-                "Helix: embedding screen removed %d/%d library-adjacent candidates",
-                n_semantic_dupes,
-                len(candidates),
-            )
-
-        return unique, n_canon_dupes, n_semantic_dupes
-
-    # ------------------------------------------------------------------
-    # Stage 4: Extended validation
-    # ------------------------------------------------------------------
-
-    def _helix_validate(
-        self,
-        results: List[EvaluationResult],
-        admitted_results: List[EvaluationResult],
-    ) -> int:
-        """Stage 4 extended VALIDATE: causal + regime + capacity + significance.
-
-        Runs Phase 2 validation on admitted candidates and revokes admission
-        for those that fail. Returns the number of Phase 2 rejections.
-        """
-        if not admitted_results:
-            self._no_admission_streak += 1
-            return 0
-
-        rejected = 0
-
-        # Collect admitted results that still have signals for extended checks
-        to_check = [r for r in admitted_results if r.signals is not None]
-        if not to_check:
-            self._no_admission_streak = 0 if any(r.admitted for r in admitted_results) else self._no_admission_streak + 1
-            return 0
-
-        # -- Causal validation --
-        if self._causal_config is not None:
-            rejected += self._validate_causal(to_check, results)
-
-        # -- Regime validation --
-        if self._regime_evaluator is not None:
-            rejected += self._validate_regime(to_check, results)
-
-        # -- Capacity validation --
-        if self._capacity_estimator is not None:
-            rejected += self._validate_capacity(to_check, results)
-
-        # -- Significance testing (batch-level FDR) --
-        if self._bootstrap_tester is not None and self._fdr_controller is not None:
-            rejected += self._validate_significance(to_check, results)
-
-        if rejected > 0:
-            logger.info(
-                "Helix: Phase 2 validation rejected %d/%d admitted candidates",
-                rejected,
-                len(admitted_results),
-            )
-
-        if any(r.admitted for r in admitted_results):
-            self._no_admission_streak = 0
-        else:
-            self._no_admission_streak += 1
-
-        return rejected
-
-    def _validate_causal(
-        self,
-        to_check: List[EvaluationResult],
-        all_results: List[EvaluationResult],
-    ) -> int:
-        """Run causal validation (Granger + intervention) on admitted candidates."""
-        CausalValidatorCls, _ = _try_import_causal()
-        if CausalValidatorCls is None:
-            return 0
-
-        # Collect library signals for controls
-        library_signals: Dict[str, np.ndarray] = {}
-        for f in self.library.list_factors():
-            if f.signals is not None:
-                library_signals[f.name] = f.signals
-
-        try:
-            validator = CausalValidatorCls(
-                returns=self.returns,
-                data_tensor=self.data_tensor,
-                library_signals=library_signals,
-                config=self._causal_config,
-            )
-        except Exception as exc:
-            logger.warning("Helix: causal validator creation failed: %s", exc)
-            return 0
-
-        rejected = 0
-        threshold = getattr(
-            self._causal_config, "robustness_threshold", 0.4
-        )
-
-        for r in to_check:
-            if not r.admitted or r.signals is None:
-                continue
-            try:
-                result = validator.validate(r.factor_name, r.signals)
-                if not result.passes:
-                    self._revoke_admission(r, all_results,
-                        f"Causal: robustness_score={result.robustness_score:.3f} < {threshold}"
-                    )
-                    rejected += 1
-                    logger.debug(
-                        "Helix: causal rejection for '%s' (score=%.3f)",
-                        r.factor_name,
-                        result.robustness_score,
-                    )
-            except Exception as exc:
-                logger.warning(
-                    "Helix: causal validation error for '%s': %s",
-                    r.factor_name,
-                    exc,
-                )
-
-        return rejected
-
-    def _validate_regime(
-        self,
-        to_check: List[EvaluationResult],
-        all_results: List[EvaluationResult],
-    ) -> int:
-        """Run regime-aware IC evaluation on admitted candidates."""
-        if self._regime_evaluator is None:
-            return 0
-
-        rejected = 0
-        for r in to_check:
-            if not r.admitted or r.signals is None:
-                continue
-            try:
-                result = self._regime_evaluator.evaluate(r.factor_name, r.signals)
-                if not result.passes:
-                    self._revoke_admission(r, all_results,
-                        f"Regime: only {result.n_regimes_passing} regimes passing "
-                        f"(need {getattr(self._regime_config, 'min_regimes_passing', 2)})"
-                    )
-                    rejected += 1
-                    logger.debug(
-                        "Helix: regime rejection for '%s' (%d regimes passing)",
-                        r.factor_name,
-                        result.n_regimes_passing,
-                    )
-            except Exception as exc:
-                logger.warning(
-                    "Helix: regime validation error for '%s': %s",
-                    r.factor_name,
-                    exc,
-                )
-
-        return rejected
-
-    def _validate_capacity(
-        self,
-        to_check: List[EvaluationResult],
-        all_results: List[EvaluationResult],
-    ) -> int:
-        """Run capacity-aware cost evaluation on admitted candidates."""
-        if self._capacity_estimator is None:
-            return 0
-
-        rejected = 0
-        net_icir_threshold = getattr(
-            self._capacity_config, "net_icir_threshold", 0.3
-        )
-
-        for r in to_check:
-            if not r.admitted or r.signals is None:
-                continue
-            try:
-                result = self._capacity_estimator.net_cost_evaluation(
-                    factor_name=r.factor_name,
-                    signals=r.signals,
-                )
-                if not result.passes_net_threshold:
-                    self._revoke_admission(r, all_results,
-                        f"Capacity: net_icir={result.net_icir:.3f} < {net_icir_threshold}"
-                    )
-                    rejected += 1
-                    logger.debug(
-                        "Helix: capacity rejection for '%s' (net_icir=%.3f)",
-                        r.factor_name,
-                        result.net_icir,
-                    )
-            except Exception as exc:
-                logger.warning(
-                    "Helix: capacity validation error for '%s': %s",
-                    r.factor_name,
-                    exc,
-                )
-
-        return rejected
-
-    def _validate_significance(
-        self,
-        to_check: List[EvaluationResult],
-        all_results: List[EvaluationResult],
-    ) -> int:
-        """Run bootstrap CI + batch-level FDR correction on admitted candidates."""
-        if self._bootstrap_tester is None or self._fdr_controller is None:
-            return 0
-
-        # Compute IC series for each admitted candidate and gather p-values
-        ic_series_map: Dict[str, np.ndarray] = {}
-        result_map: Dict[str, EvaluationResult] = {}
-
-        for r in to_check:
-            if not r.admitted or r.signals is None:
-                continue
-            try:
-                ic_series = compute_ic(r.signals, self.returns)
-                ic_series_map[r.factor_name] = ic_series
-                result_map[r.factor_name] = r
-            except Exception as exc:
-                logger.warning(
-                    "Helix: IC computation error for '%s': %s",
-                    r.factor_name,
-                    exc,
-                )
-
-        if not ic_series_map:
-            return 0
-
-        try:
-            fdr_result = self._fdr_controller.batch_evaluate(
-                ic_series_map, self._bootstrap_tester
-            )
-        except Exception as exc:
-            logger.warning("Helix: FDR batch evaluation failed: %s", exc)
-            return 0
-
-        rejected = 0
-        for name, is_sig in fdr_result.significant.items():
-            if not is_sig:
-                r = result_map.get(name)
-                if r is not None and r.admitted:
-                    adj_p = fdr_result.adjusted_p_values.get(name, 1.0)
-                    self._revoke_admission(r, all_results,
-                        f"Significance: FDR-adjusted p={adj_p:.4f} > "
-                        f"{getattr(self._significance_config, 'fdr_level', 0.05)}"
-                    )
-                    rejected += 1
-                    logger.debug(
-                        "Helix: significance rejection for '%s' (adj_p=%.4f)",
-                        name,
-                        adj_p,
-                    )
-
-        return rejected
-
-    def _revoke_admission(
-        self,
-        result: EvaluationResult,
-        all_results: List[EvaluationResult],
-        reason: str,
-    ) -> None:
-        """Revoke a previously admitted candidate from the library.
-
-        Updates the EvaluationResult and removes the factor from the library.
-        """
-        result.admitted = False
-        result.rejection_reason = reason
-
-        # Find and remove from library by name+formula match
-        try:
-            for factor in self.library.list_factors():
-                if (
-                    factor.name == result.factor_name
-                    and factor.formula == result.formula
-                ):
-                    self.library.remove_factor(factor.id)
-                    self._remove_semantic_artifacts(result.factor_name)
-                    logger.debug(
-                        "Helix: revoked factor '%s' (id=%d): %s",
-                        result.factor_name,
-                        factor.id,
-                        reason,
-                    )
-                    return
-        except Exception as exc:
-            logger.warning(
-                "Helix: failed to revoke factor '%s': %s",
-                result.factor_name,
-                exc,
-            )
-
-        self._remove_semantic_artifacts(result.factor_name)
-
-    # ------------------------------------------------------------------
-    # Stage 5: Enhanced distillation
-    # ------------------------------------------------------------------
-
-    def _helix_distill(
-        self,
-        results: List[EvaluationResult],
-        admitted_results: List[EvaluationResult],
-    ) -> None:
-        """Stage 5 DISTILL: KG update + embeddings + online forgetting."""
-
-        # -- Knowledge graph updates --
-        if self._kg is not None:
-            self._update_knowledge_graph(results, admitted_results)
-
-        # -- Embed newly admitted factors --
-        if self._embedder is not None:
-            for r in admitted_results:
-                if r.admitted:
-                    try:
-                        self._embedder.embed(r.factor_name, r.formula)
-                    except Exception as exc:
-                        logger.debug(
-                            "Helix: embedding failed for '%s': %s",
-                            r.factor_name,
-                            exc,
-                        )
-
-        # -- Online forgetting --
-        self._apply_forgetting()
-
-    def _update_knowledge_graph(
-        self,
-        results: List[EvaluationResult],
-        admitted_results: List[EvaluationResult],
-    ) -> None:
-        """Update the knowledge graph with new factor nodes and edges."""
-        _, FactorNodeCls = _try_import_kg()
-        if FactorNodeCls is None or self._kg is None:
-            return
-
-        for r in admitted_results:
-            if not r.admitted:
-                continue
-
-            # Extract operators and features from formula
-            operators = self._extract_operators(r.formula)
-            features = self._extract_features(r.formula)
-
-            node = FactorNodeCls(
-                factor_id=r.factor_name,
-                formula=r.formula,
-                ic_mean=r.ic_mean,
-                category=self._infer_category(r.formula),
-                operators=operators,
-                features=features,
-                batch_number=self.iteration,
-                admitted=True,
-            )
-            if self._embedder is not None:
-                try:
-                    node.embedding = self._embedder.embed(r.factor_name, r.formula)
-                except Exception as exc:
-                    logger.debug(
-                        "Helix: failed to attach embedding for '%s': %s",
-                        r.factor_name,
-                        exc,
-                    )
-
-            try:
-                self._kg.add_factor(node)
-            except Exception as exc:
-                logger.debug(
-                    "Helix: failed to add factor to KG: %s", exc
-                )
-                continue
-
-            # Add correlation edges with existing library factors
-            if r.signals is not None:
-                for factor in self.library.list_factors():
-                    if factor.name == r.factor_name:
-                        continue
-                    if factor.signals is not None:
-                        try:
-                            corr = self.library._compute_correlation_vectorized(
-                                r.signals, factor.signals
-                            )
-                            self._kg.add_correlation_edge(
-                                r.factor_name,
-                                factor.name,
-                                rho=corr,
-                                threshold=0.4,
-                            )
-                        except Exception:
-                            pass
-
-            # Detect derivation (mutation) relationships
-            self._detect_derivation(r, operators)
-
-    def _remove_semantic_artifacts(self, factor_id: str) -> None:
-        """Remove a factor from derived semantic stores if present."""
-        if self._kg is not None:
-            try:
-                self._kg.remove_factor(factor_id)
-            except Exception as exc:
-                logger.debug(
-                    "Helix: failed to remove factor '%s' from KG: %s",
-                    factor_id,
-                    exc,
-                )
-
-        if self._embedder is not None:
-            try:
-                self._embedder.remove(factor_id)
-            except Exception as exc:
-                logger.debug(
-                    "Helix: failed to remove factor '%s' from embedder: %s",
-                    factor_id,
-                    exc,
-                )
-
-    def _detect_derivation(
-        self,
-        result: EvaluationResult,
-        new_operators: List[str],
-    ) -> None:
-        """Detect if a new factor is a mutation of an existing one.
-
-        Compares operator sets: if the new factor shares >50% of operators
-        with an existing factor but has at least one different operator,
-        it is considered a derivation (mutation).
-        """
-        if self._kg is None:
-            return
-
-        new_ops = set(new_operators)
-        if not new_ops:
-            return
-
-        for factor in self.library.list_factors():
-            if factor.name == result.factor_name:
-                continue
-
-            existing_ops = set(self._extract_operators(factor.formula))
-            if not existing_ops:
-                continue
-
-            shared = new_ops & existing_ops
-            if not shared:
-                continue
-
-            # More than 50% shared but not identical
-            overlap = len(shared) / max(len(new_ops), len(existing_ops))
-            if 0.5 <= overlap < 1.0:
-                diff_ops = (new_ops - existing_ops) | (existing_ops - new_ops)
-                mutation_type = f"operator_change:{','.join(sorted(diff_ops))}"
-                try:
-                    self._kg.add_derivation_edge(
-                        child=result.factor_name,
-                        parent=factor.name,
-                        mutation_type=mutation_type,
-                    )
-                except Exception:
-                    pass
-
-    def _apply_forgetting(self) -> None:
-        """Apply online forgetting: exponential decay on memory patterns.
-
-        - Decay occurrence_count of all success patterns by forgetting_lambda.
-        - If no admissions for 20+ consecutive iterations, demote success_rate.
-        """
-        lam = self._forgetting_lambda
-
-        for pattern in self.memory.success_patterns:
-            # Decay occurrence count
-            if hasattr(pattern, "occurrence_count"):
-                pattern.occurrence_count = int(
-                    pattern.occurrence_count * lam
-                )
-
-        # Demote success_rate after prolonged drought
-        if self._no_admission_streak >= 20:
-            for pattern in self.memory.success_patterns:
-                if hasattr(pattern, "success_rate"):
-                    current = pattern.success_rate
-                    if current == "High":
-                        pattern.success_rate = "Medium"
-                    elif current == "Medium":
-                        pattern.success_rate = "Low"
-            logger.info(
-                "Helix: demoted success rates after %d iterations without admissions",
-                self._no_admission_streak,
-            )
-
-    # ------------------------------------------------------------------
-    # Auto-invention
-    # ------------------------------------------------------------------
-
-    def _run_auto_invention(self) -> None:
-        """Periodically propose, validate, and register new operators.
-
-        Uses the OperatorInventor to generate novel operators from
-        successful pattern context, then validates and registers them
-        via CustomOperatorStore.
-        """
-        if self._auto_inventor is None:
-            return
-
-        logger.info("Helix: running auto-invention at iteration %d", self.iteration)
-
-        # Gather existing operators
-        try:
-            from factorminer.core.types import OPERATOR_REGISTRY as SPEC_REG
-            existing_ops = dict(SPEC_REG)
-        except ImportError:
-            existing_ops = {}
-
-        # Gather successful pattern descriptions
-        patterns = []
-        for pat in self.memory.success_patterns[:10]:
-            patterns.append(f"{pat.name}: {pat.description}")
-
-        try:
-            proposals = self._auto_inventor.propose_operators(
-                existing_operators=existing_ops,
-                successful_patterns=patterns,
-            )
-        except Exception as exc:
-            logger.warning("Helix: auto-invention proposal failed: %s", exc)
-            return
-
-        self.budget.record_llm_call()
-
-        validated = 0
-        for proposal in proposals:
-            try:
-                val_result = self._auto_inventor.validate_operator(proposal)
-                if val_result.valid:
-                    self._register_invented_operator(proposal, val_result)
-                    validated += 1
-                else:
-                    logger.debug(
-                        "Helix: operator '%s' failed validation: %s",
-                        proposal.name,
-                        val_result.error,
-                    )
-            except Exception as exc:
-                logger.warning(
-                    "Helix: operator validation error for '%s': %s",
-                    proposal.name,
-                    exc,
-                )
-
-        logger.info(
-            "Helix: auto-invention: %d/%d proposals validated and registered",
-            validated,
-            len(proposals),
-        )
-
-    def _register_invented_operator(
-        self,
-        proposal: Any,
-        val_result: Any,
-    ) -> None:
-        """Register a validated auto-invented operator."""
-        if self._custom_op_store is None:
-            logger.warning(
-                "Helix: no custom operator store; cannot register '%s'",
-                proposal.name,
-            )
-            return
-
-        try:
-            from factorminer.operators.custom import CustomOperator
-            from factorminer.core.types import OperatorSpec, OperatorType, SignatureType
-
-            spec = OperatorSpec(
-                name=proposal.name,
-                arity=proposal.arity,
-                category=OperatorType.AUTO_INVENTED,
-                signature=SignatureType.TIME_SERIES_TO_TIME_SERIES,
-                param_names=proposal.param_names,
-                param_defaults=proposal.param_defaults,
-                param_ranges={
-                    k: tuple(v) for k, v in proposal.param_ranges.items()
-                },
-                description=proposal.description,
-            )
-
-            # Compile the function
-            from factorminer.operators.custom import _compile_operator_code
-            fn = _compile_operator_code(proposal.numpy_code)
-            if fn is None:
-                logger.warning(
-                    "Helix: failed to compile invented operator '%s'",
-                    proposal.name,
-                )
-                return
-
-            custom_op = CustomOperator(
-                name=proposal.name,
-                spec=spec,
-                numpy_code=proposal.numpy_code,
-                numpy_fn=fn,
-                validation_ic=val_result.ic_contribution,
-                invention_iteration=self.iteration,
-                rationale=proposal.rationale,
-            )
-
-            self._custom_op_store.register(custom_op)
-            logger.info(
-                "Helix: registered auto-invented operator '%s' (IC=%.4f)",
-                proposal.name,
-                val_result.ic_contribution,
-            )
-        except Exception as exc:
-            logger.warning(
-                "Helix: failed to register operator '%s': %s",
-                proposal.name,
-                exc,
-            )
-
-    # ------------------------------------------------------------------
-    # Enhanced checkpointing
-    # ------------------------------------------------------------------
-
-    def _checkpoint(self) -> None:
-        """Save a periodic checkpoint including Phase 2 state."""
-        try:
-            self.save_session()
-        except Exception as exc:
-            logger.warning("Helix: checkpoint failed: %s", exc)
-
-    def save_session(self, path: Optional[str] = None) -> str:
-        """Save the full mining session state including Phase 2 components.
-
-        Extends the base RalphLoop save with:
-        - Knowledge graph serialization
-        - Custom operator store persistence
-
-        Parameters
-        ----------
-        path : str, optional
-            Directory for the checkpoint.
-
-        Returns
-        -------
-        str
-            Path to the saved session directory.
-        """
-        # Base save
-        checkpoint_path = super().save_session(path)
-        checkpoint_dir = Path(checkpoint_path)
-
-        # Save knowledge graph
-        if self._kg is not None:
-            try:
-                kg_path = checkpoint_dir / "knowledge_graph.json"
-                self._kg.save(kg_path)
-                logger.debug("Helix: saved knowledge graph to %s", kg_path)
-            except Exception as exc:
-                logger.warning("Helix: failed to save knowledge graph: %s", exc)
-
-        # Save custom operators
-        if self._custom_op_store is not None:
-            try:
-                self._custom_op_store.save()
-                logger.debug("Helix: saved custom operators")
-            except Exception as exc:
-                logger.warning("Helix: failed to save custom operators: %s", exc)
-
-        # Save helix-specific state
-        helix_state = {
-            "no_admission_streak": self._no_admission_streak,
-            "forgetting_lambda": self._forgetting_lambda,
-            "canonicalize": self._canonicalize,
-            "enable_knowledge_graph": self._enable_kg,
-            "enable_embeddings": self._enable_embeddings,
-            "enable_auto_inventor": self._enable_auto_inventor,
-        }
-        try:
-            with open(checkpoint_dir / "helix_state.json", "w") as f:
-                json.dump(helix_state, f, indent=2)
-        except Exception as exc:
-            logger.warning("Helix: failed to save helix state: %s", exc)
-
-        if self._session is not None:
-            self._refresh_run_manifest(
-                output_dir=str(checkpoint_dir.parent),
-                artifact_paths={
-                    "library": str(checkpoint_dir / "library.json"),
-                    "memory": str(checkpoint_dir / "memory.json"),
-                    "session": str(checkpoint_dir / "session.json"),
-                    "run_manifest": str(checkpoint_dir / "run_manifest.json"),
-                    "loop_state": str(checkpoint_dir / "loop_state.json"),
-                    "helix_state": str(checkpoint_dir / "helix_state.json"),
-                    "knowledge_graph": str(checkpoint_dir / "knowledge_graph.json"),
-                },
-            )
-            self._persist_run_manifest(checkpoint_dir / "run_manifest.json")
-            try:
-                self._session.save(checkpoint_dir / "session.json")
-            except Exception as exc:
-                logger.warning("Helix: failed to save session metadata: %s", exc)
-
-        return checkpoint_path
-
-    def load_session(self, path: str) -> None:
-        """Resume a mining session from a saved checkpoint.
-
-        Extends the base RalphLoop load with Phase 2 state restoration.
-
-        Parameters
-        ----------
-        path : str
-            Path to the checkpoint directory.
-        """
-        super().load_session(path)
-        checkpoint_dir = Path(path)
-
-        # Load knowledge graph
-        if self._kg is not None:
-            kg_path = checkpoint_dir / "knowledge_graph.json"
-            if kg_path.exists():
-                KGCls, _ = _try_import_kg()
-                if KGCls is not None:
-                    try:
-                        self._kg = KGCls.load(kg_path)
-                        logger.info(
-                            "Helix: loaded knowledge graph (%d factors, %d edges)",
-                            self._kg.get_factor_count(),
-                            self._kg.get_edge_count(),
-                        )
-                    except Exception as exc:
-                        logger.warning(
-                            "Helix: failed to load knowledge graph: %s", exc
-                        )
-
-        # Load custom operators
-        if self._custom_op_store is not None:
-            try:
-                self._custom_op_store.load()
-            except Exception as exc:
-                logger.warning(
-                    "Helix: failed to load custom operators: %s", exc
-                )
-
-        # Load helix-specific state
-        helix_state_path = checkpoint_dir / "helix_state.json"
-        if helix_state_path.exists():
-            try:
-                with open(helix_state_path) as f:
-                    helix_state = json.load(f)
-                self._no_admission_streak = helix_state.get(
-                    "no_admission_streak", 0
-                )
-                logger.info(
-                    "Helix: restored helix state (streak=%d)",
-                    self._no_admission_streak,
-                )
-            except Exception as exc:
-                logger.warning(
-                    "Helix: failed to load helix state: %s", exc
-                )
-
-        self._prime_embedder_from_library()
-        if self._session is not None and self._session.run_manifest:
-            self._run_manifest = dict(self._session.run_manifest)
-        else:
-            run_manifest_path = checkpoint_dir / "run_manifest.json"
-            if run_manifest_path.exists():
-                try:
-                    with open(run_manifest_path) as f:
-                        self._run_manifest = json.load(f)
-                except Exception as exc:
-                    logger.warning(
-                        "Helix: failed to load run manifest: %s", exc
-                    )
-
-    def _loop_type(self) -> str:
-        """Label the loop for provenance and manifests."""
-        return "helix"
-
-    def _phase2_features(self) -> List[str]:
-        """List the enabled Helix Phase 2 features."""
-        features: List[str] = []
-        if self._debate_generator is not None:
-            features.append("debate")
-        if self._canonicalizer is not None:
-            features.append("canonicalization")
-        if self._kg is not None:
-            features.append("knowledge_graph")
-        if self._embedder is not None:
-            features.append("embeddings")
-        if self._causal_validator is not None:
-            features.append("causal")
-        if self._regime_evaluator is not None:
-            features.append("regime")
-        if self._capacity_estimator is not None:
-            features.append("capacity")
-        if self._bootstrap_tester is not None and self._fdr_controller is not None:
-            features.append("significance")
-        if self._auto_inventor is not None:
-            features.append("auto_inventor")
-        return features
-
-    def _generator_family(self) -> str:
-        """Return the active Helix generator label for provenance."""
-        if self._debate_generator is not None:
-            return self._debate_generator.__class__.__name__
-        return super()._generator_family()
-
-    # ------------------------------------------------------------------
-    # Utility helpers
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _extract_operators(formula: str) -> List[str]:
-        """Extract operator names from a DSL formula string."""
-        return re.findall(r"([A-Z][a-zA-Z]+)\(", formula)
-
-    @staticmethod
-    def _extract_features(formula: str) -> List[str]:
-        """Extract feature names (e.g. $close, $volume) from a formula."""
-        return re.findall(r"\$[a-zA-Z_]+", formula)
-
-    def _prime_embedder_from_library(self) -> None:
-        """Seed the embedder cache from the currently admitted library."""
-        if self._embedder is None:
-            return
-
-        try:
-            self._embedder.clear()
-        except Exception as exc:
-            logger.debug("Helix: failed to clear embedder before priming: %s", exc)
-            return
-
-        for factor in self.library.list_factors():
-            if not factor.formula:
-                continue
-            try:
-                self._embedder.embed(factor.name, factor.formula)
-            except Exception as exc:
-                logger.debug(
-                    "Helix: failed to prime embedding for '%s': %s",
-                    factor.name,
-                    exc,
-                )
-
-    def _semantic_duplicate_target(self, formula: str) -> Optional[str]:
-        """Return the matched library factor if embeddings flag a near-duplicate."""
-        if self._embedder is None or self.library.size == 0:
-            return None
-
-        try:
-            return self._embedder.is_semantic_duplicate(formula)
-        except Exception as exc:
-            logger.debug("Helix: semantic duplicate check failed: %s", exc)
-            return None
diff --git a/src/factorminer/factorminer/core/library_io.py b/src/factorminer/factorminer/core/library_io.py
deleted file mode 100644
index 4353c90..0000000
--- a/src/factorminer/factorminer/core/library_io.py
+++ /dev/null
@@ -1,921 +0,0 @@
-"""Serialization and I/O for the FactorLibrary.
-
-Provides save/load to JSON + optional binary signal cache (.npz),
-CSV export, formula export, and import of the 110 factors from the paper.
-"""
-
-from __future__ import annotations
-
-import csv
-import json
-import logging
-from pathlib import Path
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-
-logger = logging.getLogger(__name__)
-
-
-# ======================================================================
-# Save / Load
-# ======================================================================
-
-def save_library(
-    library: FactorLibrary,
-    path: Union[str, Path],
-    save_signals: bool = True,
-) -> None:
-    """Save a FactorLibrary to disk.
-
-    Creates two files:
-    - ``<path>.json`` -- factor metadata and library configuration
-    - ``<path>_signals.npz`` -- binary signal cache (if save_signals=True
-      and any factors have signals)
-
-    Parameters
-    ----------
-    library : FactorLibrary
-    path : str or Path
-        Base path (without extension). E.g. ``"output/my_library"`` produces
-        ``output/my_library.json`` and ``output/my_library_signals.npz``.
-    save_signals : bool
-        Whether to write the binary signal cache.
-    """
-    path = Path(path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-
-    # -- Metadata JSON --
-    meta = {
-        "correlation_threshold": library.correlation_threshold,
-        "ic_threshold": library.ic_threshold,
-        "next_id": library._next_id,
-        "factors": [f.to_dict() for f in library.list_factors()],
-    }
-    if library.correlation_matrix is not None:
-        meta["correlation_matrix"] = library.correlation_matrix.tolist()
-    meta["id_to_index"] = {str(k): v for k, v in library._id_to_index.items()}
-
-    json_path = path.with_suffix(".json")
-    with open(json_path, "w") as fp:
-        json.dump(meta, fp, indent=2)
-    logger.info("Saved library metadata to %s (%d factors)", json_path, library.size)
-
-    # -- Binary signal cache --
-    if save_signals:
-        signal_arrays: Dict[str, np.ndarray] = {}
-        for f in library.list_factors():
-            if f.signals is not None:
-                signal_arrays[f"factor_{f.id}"] = f.signals
-
-        if signal_arrays:
-            npz_path = Path(str(path) + "_signals.npz")
-            np.savez_compressed(npz_path, **signal_arrays)
-            logger.info(
-                "Saved signal cache to %s (%d arrays)",
-                npz_path, len(signal_arrays),
-            )
-
-
-def load_library(path: Union[str, Path]) -> FactorLibrary:
-    """Load a FactorLibrary from disk.
-
-    Parameters
-    ----------
-    path : str or Path
-        Base path (without extension). Will look for ``<path>.json`` and
-        optionally ``<path>_signals.npz``.
-
-    Returns
-    -------
-    FactorLibrary
-    """
-    path = Path(path)
-    json_path = path.with_suffix(".json")
-
-    with open(json_path, "r") as fp:
-        meta = json.load(fp)
-
-    library = FactorLibrary(
-        correlation_threshold=meta.get("correlation_threshold", 0.5),
-        ic_threshold=meta.get("ic_threshold", 0.04),
-    )
-    library._next_id = meta.get("next_id", 1)
-
-    # Restore factors
-    for fd in meta.get("factors", []):
-        factor = Factor.from_dict(fd)
-        library.factors[factor.id] = factor
-
-    # Restore correlation matrix
-    if "correlation_matrix" in meta and meta["correlation_matrix"] is not None:
-        library.correlation_matrix = np.array(
-            meta["correlation_matrix"], dtype=np.float64
-        )
-
-    # Restore id-to-index mapping
-    if "id_to_index" in meta:
-        library._id_to_index = {
-            int(k): v for k, v in meta["id_to_index"].items()
-        }
-
-    # Load signal cache if present
-    npz_path = Path(str(path) + "_signals.npz")
-    if npz_path.exists():
-        data = np.load(npz_path)
-        for f in library.factors.values():
-            key = f"factor_{f.id}"
-            if key in data:
-                f.signals = data[key]
-        data.close()
-        logger.info("Loaded signal cache from %s", npz_path)
-
-    logger.info(
-        "Loaded library from %s (%d factors)", json_path, library.size
-    )
-    return library
-
-
-# ======================================================================
-# Export utilities
-# ======================================================================
-
-def export_csv(library: FactorLibrary, path: Union[str, Path]) -> None:
-    """Export the factor table to CSV.
-
-    Columns: ID, Name, Formula, Category, IC_Mean, ICIR, IC_Win_Rate,
-    Max_Correlation, Batch, Admission_Date
-    """
-    path = Path(path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-
-    fieldnames = [
-        "ID", "Name", "Formula", "Category", "IC_Mean", "ICIR",
-        "IC_Win_Rate", "Max_Correlation", "Batch", "Admission_Date",
-    ]
-
-    with open(path, "w", newline="") as fp:
-        writer = csv.DictWriter(fp, fieldnames=fieldnames)
-        writer.writeheader()
-        for f in library.list_factors():
-            writer.writerow({
-                "ID": f.id,
-                "Name": f.name,
-                "Formula": f.formula,
-                "Category": f.category,
-                "IC_Mean": f"{f.ic_mean:.6f}",
-                "ICIR": f"{f.icir:.6f}",
-                "IC_Win_Rate": f"{f.ic_win_rate:.4f}",
-                "Max_Correlation": f"{f.max_correlation:.4f}",
-                "Batch": f.batch_number,
-                "Admission_Date": f.admission_date,
-            })
-
-    logger.info("Exported %d factors to %s", library.size, path)
-
-
-def export_formulas(library: FactorLibrary, path: Union[str, Path]) -> None:
-    """Export just the formulas for reproduction.
-
-    One formula per line, prefixed with the factor ID and name.
-    Format: ``ID | Name | Formula``
-    """
-    path = Path(path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-
-    with open(path, "w") as fp:
-        fp.write("# FactorMiner Library Formulas\n")
-        fp.write("# ID | Name | Formula\n")
-        fp.write(f"# Total: {library.size} factors\n")
-        fp.write("#" + "-" * 78 + "\n")
-        for f in library.list_factors():
-            fp.write(f"{f.id:04d} | {f.name} | {f.formula}\n")
-
-    logger.info("Exported %d formulas to %s", library.size, path)
-
-
-# ======================================================================
-# Paper factor catalog (110 factors from Appendix P)
-# ======================================================================
-
-# Representative subset of the 110 factors discovered by FactorMiner.
-# Each entry: (name, formula, category)
-PAPER_FACTORS: List[Dict[str, str]] = [
-    # Factor 001
-    {
-        "name": "Intraday Range Position",
-        "formula": "Neg(CsRank(Div(Sub($close, TsMin($close, 48)), Add(Sub(TsMax($close, 48), TsMin($close, 48)), 1e-8))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 002
-    {
-        "name": "Volume-Weighted Momentum",
-        "formula": "Neg(CsRank(Mul(Return($close, 5), Div($volume, Mean($volume, 20)))))",
-        "category": "Momentum",
-    },
-    # Factor 003
-    {
-        "name": "Residual Volatility",
-        "formula": "Neg(CsRank(Std(Sub($close, EMA($close, 10)), 20)))",
-        "category": "Volatility",
-    },
-    # Factor 004
-    {
-        "name": "Intraday Amplitude Ratio",
-        "formula": "Neg(CsRank(Div(Sub($high, $low), Add($close, 1e-8))))",
-        "category": "Volatility",
-    },
-    # Factor 005
-    {
-        "name": "Volume Surprise",
-        "formula": "Neg(CsRank(Div(Sub($volume, Mean($volume, 20)), Add(Std($volume, 20), 1e-8))))",
-        "category": "Volume",
-    },
-    # Factor 006
-    {
-        "name": "VWAP Deviation",
-        "formula": "Neg(Div(Sub($close, $vwap), $vwap))",
-        "category": "VWAP",
-    },
-    # Factor 007
-    {
-        "name": "Short-term Reversal",
-        "formula": "Neg(CsRank(Return($close, 3)))",
-        "category": "Mean-reversion",
-    },
-    # Factor 008
-    {
-        "name": "Turnover Momentum",
-        "formula": "Neg(CsRank(Delta(Div($amt, Add($volume, 1e-8)), 5)))",
-        "category": "Turnover",
-    },
-    # Factor 009
-    {
-        "name": "High-Low Midpoint Reversion",
-        "formula": "Neg(CsRank(Sub($close, Div(Add($high, $low), 2))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 010
-    {
-        "name": "Rolling Beta Residual",
-        "formula": "Neg(CsRank(Resid($returns, Mean($returns, 20), 20)))",
-        "category": "Risk",
-    },
-    # Factor 011
-    {
-        "name": "VWAP Slope",
-        "formula": "Neg(CsRank(TsLinRegSlope(Div(Sub($close, $vwap), $vwap), 10)))",
-        "category": "VWAP",
-    },
-    # Factor 012
-    {
-        "name": "Accumulation-Distribution",
-        "formula": "Neg(CsRank(Sum(Mul(Div(Sub(Mul(2, $close), Add($high, $low)), Add(Sub($high, $low), 1e-8)), $volume), 10)))",
-        "category": "Volume",
-    },
-    # Factor 013
-    {
-        "name": "Relative Strength Index Deviation",
-        "formula": "Neg(CsRank(Sub(Mean(Max(Delta($close, 1), 0), 14), Mean(Abs(Min(Delta($close, 1), 0)), 14))))",
-        "category": "Momentum",
-    },
-    # Factor 014
-    {
-        "name": "Price-Volume Correlation",
-        "formula": "Neg(Corr($close, $volume, 10))",
-        "category": "Volume",
-    },
-    # Factor 015
-    {
-        "name": "Skewness of Returns",
-        "formula": "Neg(CsRank(Skew($returns, 20)))",
-        "category": "Higher-moment",
-    },
-    # Factor 016
-    {
-        "name": "Kurtosis of Returns",
-        "formula": "Neg(CsRank(Kurt($returns, 20)))",
-        "category": "Higher-moment",
-    },
-    # Factor 017
-    {
-        "name": "Volume-Weighted Return",
-        "formula": "Neg(CsRank(Div(Sum(Mul($returns, $volume), 10), Add(Sum($volume, 10), 1e-8))))",
-        "category": "Volume",
-    },
-    # Factor 018
-    {
-        "name": "Close-to-High Ratio",
-        "formula": "Neg(CsRank(Div(Sub($high, $close), Add($high, 1e-8))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 019
-    {
-        "name": "Delayed Correlation Shift",
-        "formula": "Neg(CsRank(Sub(Corr($close, $volume, 10), Corr(Delay($close, 5), $volume, 10))))",
-        "category": "Volume",
-    },
-    # Factor 020
-    {
-        "name": "Exponential Momentum",
-        "formula": "Neg(CsRank(Sub($close, EMA($close, 20))))",
-        "category": "Momentum",
-    },
-    # Factor 021
-    {
-        "name": "Range-Adjusted Volume",
-        "formula": "Neg(CsRank(Div($volume, Add(Sub($high, $low), 1e-8))))",
-        "category": "Volume",
-    },
-    # Factor 022
-    {
-        "name": "Cumulative Return Rank",
-        "formula": "Neg(CsRank(Sum($returns, 10)))",
-        "category": "Momentum",
-    },
-    # Factor 023
-    {
-        "name": "VWAP Momentum",
-        "formula": "Neg(CsRank(Return($vwap, 5)))",
-        "category": "VWAP",
-    },
-    # Factor 024
-    {
-        "name": "Bollinger Band Position",
-        "formula": "Neg(CsRank(Div(Sub($close, Mean($close, 20)), Add(Std($close, 20), 1e-8))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 025
-    {
-        "name": "Volume Decay Weighted",
-        "formula": "Neg(CsRank(Decay($volume, 10)))",
-        "category": "Volume",
-    },
-    # Factor 026
-    {
-        "name": "Overnight Return",
-        "formula": "Neg(CsRank(Div(Sub($open, Delay($close, 1)), Add(Delay($close, 1), 1e-8))))",
-        "category": "Overnight",
-    },
-    # Factor 027
-    {
-        "name": "Intraday Return",
-        "formula": "Neg(CsRank(Div(Sub($close, $open), Add($open, 1e-8))))",
-        "category": "Intraday",
-    },
-    # Factor 028
-    {
-        "name": "Max Drawdown",
-        "formula": "Neg(CsRank(Div(Sub($close, TsMax($close, 20)), Add(TsMax($close, 20), 1e-8))))",
-        "category": "Risk",
-    },
-    # Factor 029
-    {
-        "name": "Hurst Exponent Proxy",
-        "formula": "Neg(CsRank(Div(Std($returns, 20), Add(Std($returns, 5), 1e-8))))",
-        "category": "Volatility",
-    },
-    # Factor 030
-    {
-        "name": "Volume Imbalance",
-        "formula": "Neg(CsRank(Sub(Mean($volume, 5), Mean($volume, 20))))",
-        "category": "Volume",
-    },
-    # Factor 031
-    {
-        "name": "Weighted Close Position",
-        "formula": "Neg(CsRank(Div(Sub(Mul(2, $close), Add($high, $low)), Add(Sub($high, $low), 1e-8))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 032
-    {
-        "name": "Trend Intensity",
-        "formula": "Neg(CsRank(Div(Abs(Delta($close, 10)), Add(Sum(Abs(Delta($close, 1)), 10), 1e-8))))",
-        "category": "Trend",
-    },
-    # Factor 033
-    {
-        "name": "Return Dispersion",
-        "formula": "Neg(CsRank(Std($returns, 5)))",
-        "category": "Volatility",
-    },
-    # Factor 034
-    {
-        "name": "VWAP Relative Strength",
-        "formula": "Neg(CsRank(Div(Sub(Mean($close, 5), $vwap), Add($vwap, 1e-8))))",
-        "category": "VWAP",
-    },
-    # Factor 035
-    {
-        "name": "Rank Reversal",
-        "formula": "Neg(CsRank(Sub(TsRank($close, 10), TsRank($close, 30))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 036
-    {
-        "name": "Money Flow Index",
-        "formula": "Neg(CsRank(Div(Sum(Mul(Max(Delta($close, 1), 0), $volume), 14), Add(Sum(Mul(Abs(Delta($close, 1)), $volume), 14), 1e-8))))",
-        "category": "Volume",
-    },
-    # Factor 037
-    {
-        "name": "Adaptive Momentum",
-        "formula": "Neg(CsRank(Mul(Return($close, 10), Div(Std($returns, 5), Add(Std($returns, 20), 1e-8)))))",
-        "category": "Momentum",
-    },
-    # Factor 038
-    {
-        "name": "Volume Trend",
-        "formula": "Neg(CsRank(TsLinRegSlope($volume, 10)))",
-        "category": "Volume",
-    },
-    # Factor 039
-    {
-        "name": "Price Acceleration",
-        "formula": "Neg(CsRank(Sub(Delta($close, 5), Delta(Delay($close, 5), 5))))",
-        "category": "Momentum",
-    },
-    # Factor 040
-    {
-        "name": "Realized Volatility Ratio",
-        "formula": "Neg(CsRank(Div(Std($returns, 10), Add(Std($returns, 30), 1e-8))))",
-        "category": "Volatility",
-    },
-    # Factor 041
-    {
-        "name": "Amount Concentration",
-        "formula": "Neg(CsRank(Div(TsMax($amt, 5), Add(Mean($amt, 20), 1e-8))))",
-        "category": "Turnover",
-    },
-    # Factor 042
-    {
-        "name": "Cross-Sectional Volume Rank",
-        "formula": "Neg(CsRank(Div($volume, Add(Mean($volume, 60), 1e-8))))",
-        "category": "Volume",
-    },
-    # Factor 043
-    {
-        "name": "Gap Momentum",
-        "formula": "Neg(CsRank(Sum(Div(Sub($open, Delay($close, 1)), Add(Delay($close, 1), 1e-8)), 5)))",
-        "category": "Overnight",
-    },
-    # Factor 044
-    {
-        "name": "VWAP Distance Decay",
-        "formula": "Neg(CsRank(Decay(Div(Sub($close, $vwap), Add($vwap, 1e-8)), 10)))",
-        "category": "VWAP",
-    },
-    # Factor 045
-    {
-        "name": "Tail Risk Indicator",
-        "formula": "Neg(CsRank(Div(TsMin($returns, 20), Add(Std($returns, 20), 1e-8))))",
-        "category": "Risk",
-    },
-    # Factor 046
-    {
-        "name": "Volatility-Regime Reversal Divergence",
-        "formula": "IfElse(Greater(Std($returns, 12), Mean(Std($returns, 12), 48)), Neg(CsRank(Delta($close, 3))), Neg(CsRank(Div(Sub($close, $low), Add(Sub($high, $low), 0.0001)))))",
-        "category": "Regime-switching",
-    },
-    # Factor 047
-    {
-        "name": "Regime Volume Signal",
-        "formula": "IfElse(Greater($volume, Mean($volume, 20)), Neg(CsRank($returns)), Neg(CsRank(Return($close, 5))))",
-        "category": "Regime-switching",
-    },
-    # Factor 048
-    {
-        "name": "Liquidity-Adjusted Reversal",
-        "formula": "Neg(CsRank(Mul(Return($close, 3), Div($volume, Add(Mean($volume, 20), 1e-8)))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 049
-    {
-        "name": "Cross-Sectional Volatility Rank",
-        "formula": "Neg(CsRank(CsRank(Std($returns, 10))))",
-        "category": "Volatility",
-    },
-    # Factor 050
-    {
-        "name": "VWAP Bollinger",
-        "formula": "Neg(CsRank(Div(Sub($vwap, Mean($vwap, 20)), Add(Std($vwap, 20), 1e-8))))",
-        "category": "VWAP",
-    },
-    # Factor 051
-    {
-        "name": "Smoothed Return Reversal",
-        "formula": "Neg(CsRank(EMA($returns, 5)))",
-        "category": "Mean-reversion",
-    },
-    # Factor 052
-    {
-        "name": "Volume-Price Divergence",
-        "formula": "Neg(CsRank(Sub(TsRank($volume, 10), TsRank($close, 10))))",
-        "category": "Volume",
-    },
-    # Factor 053
-    {
-        "name": "Decay Weighted Momentum",
-        "formula": "Neg(CsRank(Decay($returns, 20)))",
-        "category": "Momentum",
-    },
-    # Factor 054
-    {
-        "name": "Range Percentile",
-        "formula": "Neg(CsRank(Div(Sub($close, TsMin($close, 20)), Add(Sub(TsMax($close, 20), TsMin($close, 20)), 1e-8))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 055
-    {
-        "name": "Volume Skewness",
-        "formula": "Neg(CsRank(Skew($volume, 20)))",
-        "category": "Volume",
-    },
-    # Factor 056
-    {
-        "name": "Residual Momentum",
-        "formula": "Neg(CsRank(TsLinRegResid($close, 20)))",
-        "category": "Momentum",
-    },
-    # Factor 057
-    {
-        "name": "VWAP Trend",
-        "formula": "Neg(CsRank(Delta(Div(Sub($close, $vwap), $vwap), 5)))",
-        "category": "VWAP",
-    },
-    # Factor 058
-    {
-        "name": "Return Autocorrelation",
-        "formula": "Neg(CsRank(Corr($returns, Delay($returns, 1), 10)))",
-        "category": "Mean-reversion",
-    },
-    # Factor 059
-    {
-        "name": "Price Efficiency",
-        "formula": "Neg(CsRank(Div(Abs(Sum($returns, 10)), Add(Sum(Abs($returns), 10), 1e-8))))",
-        "category": "Trend",
-    },
-    # Factor 060
-    {
-        "name": "Relative Volume Change",
-        "formula": "Neg(CsRank(Return($volume, 5)))",
-        "category": "Volume",
-    },
-    # Factor 061
-    {
-        "name": "Weighted VWAP Position",
-        "formula": "Neg(CsRank(WMA(Div(Sub($close, $vwap), $vwap), 10)))",
-        "category": "VWAP",
-    },
-    # Factor 062
-    {
-        "name": "Regime Momentum Flip",
-        "formula": "IfElse(Greater(Mean($returns, 5), 0), Neg(CsRank(Return($close, 10))), CsRank(Return($close, 3)))",
-        "category": "Regime-switching",
-    },
-    # Factor 063
-    {
-        "name": "High-Low Volatility",
-        "formula": "Neg(CsRank(Mean(Div(Sub($high, $low), Add($close, 1e-8)), 10)))",
-        "category": "Volatility",
-    },
-    # Factor 064
-    {
-        "name": "Opening Gap Reversal",
-        "formula": "Neg(CsRank(Div(Sub($open, Delay($close, 1)), Add(Std($returns, 10), 1e-8))))",
-        "category": "Overnight",
-    },
-    # Factor 065
-    {
-        "name": "Volume Momentum Spread",
-        "formula": "Neg(CsRank(Sub(Mean($volume, 5), Mean($volume, 40))))",
-        "category": "Volume",
-    },
-    # Factor 066
-    {
-        "name": "Regime Volume Reversal",
-        "formula": "IfElse(Greater(Div($volume, Add(Mean($volume, 20), 1e-8)), 1.5), Neg(CsRank($returns)), Neg(CsRank(Return($close, 10))))",
-        "category": "Regime-switching",
-    },
-    # Factor 067
-    {
-        "name": "Slope Reversal",
-        "formula": "Neg(CsRank(TsLinRegSlope($close, 5)))",
-        "category": "Mean-reversion",
-    },
-    # Factor 068
-    {
-        "name": "VWAP Momentum Decay",
-        "formula": "Neg(CsRank(Decay(Return($vwap, 3), 10)))",
-        "category": "VWAP",
-    },
-    # Factor 069
-    {
-        "name": "Turnover Rate Change",
-        "formula": "Neg(CsRank(Delta(Div($amt, Add($volume, 1e-8)), 10)))",
-        "category": "Turnover",
-    },
-    # Factor 070
-    {
-        "name": "Return Quantile Signal",
-        "formula": "Neg(CsRank(Quantile($returns, 20, 0.75)))",
-        "category": "Higher-moment",
-    },
-    # Factor 071
-    {
-        "name": "Double EMA Crossover",
-        "formula": "Neg(CsRank(Sub(EMA($close, 5), EMA($close, 20))))",
-        "category": "Trend",
-    },
-    # Factor 072
-    {
-        "name": "Conditional Volatility Return",
-        "formula": "Neg(CsRank(Div($returns, Add(Std($returns, 10), 1e-8))))",
-        "category": "Risk",
-    },
-    # Factor 073
-    {
-        "name": "Amplitude Trend",
-        "formula": "Neg(CsRank(TsLinRegSlope(Div(Sub($high, $low), Add($close, 1e-8)), 10)))",
-        "category": "Volatility",
-    },
-    # Factor 074
-    {
-        "name": "Volume-Weighted Range",
-        "formula": "Neg(CsRank(Mean(Mul(Div(Sub($high, $low), Add($close, 1e-8)), $volume), 10)))",
-        "category": "Volume",
-    },
-    # Factor 075
-    {
-        "name": "Intraday Efficiency Ratio",
-        "formula": "Neg(CsRank(Div(Abs(Sub($close, $open)), Add(Sub($high, $low), 1e-8))))",
-        "category": "Intraday",
-    },
-    # Factor 076
-    {
-        "name": "Cumulative Volume Signal",
-        "formula": "Neg(CsRank(Div(Sum(Mul($returns, $volume), 20), Add(Sum($volume, 20), 1e-8))))",
-        "category": "Volume",
-    },
-    # Factor 077
-    {
-        "name": "VWAP Cross-Sectional Momentum",
-        "formula": "Neg(CsRank(CsRank(Return($vwap, 10))))",
-        "category": "VWAP",
-    },
-    # Factor 078
-    {
-        "name": "Mean-Reversion Indicator",
-        "formula": "Neg(CsRank(Div(Sub($close, SMA($close, 10)), Add(SMA($close, 10), 1e-8))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 079
-    {
-        "name": "Volume Regime Indicator",
-        "formula": "Neg(CsRank(Div(Std($volume, 5), Add(Std($volume, 20), 1e-8))))",
-        "category": "Volume",
-    },
-    # Factor 080
-    {
-        "name": "Return Persistence",
-        "formula": "Neg(CsRank(Mul(Sign(Delta($close, 1)), Sign(Delta($close, 5)))))",
-        "category": "Momentum",
-    },
-    # Factor 081
-    {
-        "name": "Regime Trend Strength",
-        "formula": "IfElse(Greater(Abs(TsLinRegSlope($close, 20)), Std($close, 20)), Neg(CsRank(TsLinRegSlope($close, 5))), Neg(CsRank(Return($close, 3))))",
-        "category": "Regime-switching",
-    },
-    # Factor 082
-    {
-        "name": "VWAP Dispersion",
-        "formula": "Neg(CsRank(Std(Div(Sub($close, $vwap), $vwap), 10)))",
-        "category": "VWAP",
-    },
-    # Factor 083
-    {
-        "name": "Smart Money Flow",
-        "formula": "Neg(CsRank(Sum(Mul(IfElse(Greater($close, Delay($close, 1)), $volume, Neg($volume)), Div(Sub($high, $low), Add($close, 1e-8))), 10)))",
-        "category": "Volume",
-    },
-    # Factor 084
-    {
-        "name": "Return Rank Dispersion",
-        "formula": "Neg(CsRank(Sub(TsRank($returns, 5), TsRank($returns, 20))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 085
-    {
-        "name": "Volume Acceleration",
-        "formula": "Neg(CsRank(Sub(Delta($volume, 5), Delta(Delay($volume, 5), 5))))",
-        "category": "Volume",
-    },
-    # Factor 086
-    {
-        "name": "Close-Low Ratio Trend",
-        "formula": "Neg(CsRank(Mean(Div(Sub($close, $low), Add(Sub($high, $low), 1e-8)), 5)))",
-        "category": "Mean-reversion",
-    },
-    # Factor 087
-    {
-        "name": "Hull MA Deviation",
-        "formula": "Neg(CsRank(Div(Sub($close, HMA($close, 10)), Add(Std($close, 10), 1e-8))))",
-        "category": "Trend",
-    },
-    # Factor 088
-    {
-        "name": "DEMA Momentum Signal",
-        "formula": "Neg(CsRank(Sub(DEMA($close, 5), DEMA($close, 20))))",
-        "category": "Momentum",
-    },
-    # Factor 089
-    {
-        "name": "Volume Profile Skew",
-        "formula": "Neg(CsRank(Skew(Div($volume, Add(Mean($volume, 20), 1e-8)), 10)))",
-        "category": "Volume",
-    },
-    # Factor 090
-    {
-        "name": "Conditional VWAP Signal",
-        "formula": "IfElse(Greater($close, $vwap), Neg(CsRank(Div(Sub($close, $vwap), $vwap))), CsRank(Div(Sub($vwap, $close), $vwap)))",
-        "category": "VWAP",
-    },
-    # Factor 091
-    {
-        "name": "Extreme Volume Reversal",
-        "formula": "Neg(CsRank(Mul(IfElse(Greater($volume, Mul(2, Mean($volume, 20))), 1, 0), $returns)))",
-        "category": "Volume",
-    },
-    # Factor 092
-    {
-        "name": "Range Expansion Signal",
-        "formula": "Neg(CsRank(Div(Sub($high, $low), Add(Mean(Sub($high, $low), 20), 1e-8))))",
-        "category": "Volatility",
-    },
-    # Factor 093
-    {
-        "name": "Short-Term IC Momentum",
-        "formula": "Neg(CsRank(Sum(Mul(Sign($returns), Abs($returns)), 5)))",
-        "category": "Momentum",
-    },
-    # Factor 094
-    {
-        "name": "VWAP Curvature",
-        "formula": "Neg(CsRank(Sub(Div(Sub($vwap, Delay($vwap, 5)), Add(Delay($vwap, 5), 1e-8)), Div(Sub(Delay($vwap, 5), Delay($vwap, 10)), Add(Delay($vwap, 10), 1e-8)))))",
-        "category": "VWAP",
-    },
-    # Factor 095
-    {
-        "name": "Relative Strength",
-        "formula": "Neg(CsRank(Div(Return($close, 5), Add(Return($close, 20), 1e-8))))",
-        "category": "Momentum",
-    },
-    # Factor 096
-    {
-        "name": "Volume-Correlated Return",
-        "formula": "Neg(CsRank(Cov($returns, $volume, 10)))",
-        "category": "Volume",
-    },
-    # Factor 097
-    {
-        "name": "Regime Volatility Band",
-        "formula": "IfElse(Greater(Std($returns, 5), Mul(1.5, Std($returns, 20))), Neg(CsRank(Return($close, 1))), Neg(CsRank(Return($close, 10))))",
-        "category": "Regime-switching",
-    },
-    # Factor 098
-    {
-        "name": "Open-Close Spread Momentum",
-        "formula": "Neg(CsRank(Mean(Div(Sub($close, $open), Add($open, 1e-8)), 5)))",
-        "category": "Intraday",
-    },
-    # Factor 099
-    {
-        "name": "Volatility-Scaled Reversal",
-        "formula": "Neg(CsRank(Div(Return($close, 5), Add(Std($returns, 20), 1e-8))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 100
-    {
-        "name": "VWAP Time-Weighted Signal",
-        "formula": "Neg(CsRank(WMA(Div(Sub($close, $vwap), Add($vwap, 1e-8)), 20)))",
-        "category": "VWAP",
-    },
-    # Factor 101
-    {
-        "name": "Covariance Structure Shift",
-        "formula": "Neg(CsRank(Sub(Cov($returns, $volume, 5), Cov($returns, $volume, 20))))",
-        "category": "Volume",
-    },
-    # Factor 102
-    {
-        "name": "Quadratic Regression Residual",
-        "formula": "Neg(CsRank(TsLinRegResid(Square($returns), 20)))",
-        "category": "Higher-moment",
-    },
-    # Factor 103
-    {
-        "name": "VWAP Mean-Reversion Strength",
-        "formula": "Neg(CsRank(Mul(Div(Sub($close, $vwap), $vwap), Div($volume, Add(Mean($volume, 20), 1e-8)))))",
-        "category": "VWAP",
-    },
-    # Factor 104
-    {
-        "name": "Multi-Scale Momentum",
-        "formula": "Neg(CsRank(Add(Return($close, 5), Return($close, 20))))",
-        "category": "Momentum",
-    },
-    # Factor 105
-    {
-        "name": "Relative High Position",
-        "formula": "Neg(CsRank(Div(Sub(TsMax($high, 20), $close), Add(TsMax($high, 20), 1e-8))))",
-        "category": "Mean-reversion",
-    },
-    # Factor 106
-    {
-        "name": "Turnover Volatility",
-        "formula": "Neg(CsRank(Std(Div($amt, Add($volume, 1e-8)), 10)))",
-        "category": "Turnover",
-    },
-    # Factor 107
-    {
-        "name": "Regime Correlation Signal",
-        "formula": "IfElse(Greater(Abs(Corr($close, $volume, 10)), 0.5), Neg(CsRank(Return($close, 3))), Neg(CsRank(Return($close, 10))))",
-        "category": "Regime-switching",
-    },
-    # Factor 108
-    {
-        "name": "Intraday Momentum Reversal",
-        "formula": "Neg(CsRank(Div(Sub($close, $open), Add(Sub($high, $low), 1e-8))))",
-        "category": "Intraday",
-    },
-    # Factor 109
-    {
-        "name": "Volume-Weighted Slope",
-        "formula": "Neg(CsRank(TsLinRegSlope(Mul($returns, $volume), 10)))",
-        "category": "Volume",
-    },
-    # Factor 110
-    {
-        "name": "Adaptive Range Reversal",
-        "formula": "IfElse(Greater(Std($returns, 10), Mean(Std($returns, 10), 40)), Neg(CsRank(Div(Sub($close, TsMin($close, 10)), Add(Sub(TsMax($close, 10), TsMin($close, 10)), 1e-8)))), Neg(CsRank(Return($close, 5))))",
-        "category": "Regime-switching",
-    },
-]
-
-
-def import_from_paper(
-    path: Optional[Union[str, Path]] = None,
-) -> FactorLibrary:
-    """Import the 110 factors from the paper's Appendix P.
-
-    If *path* is given and points to a JSON file with a ``"factors"`` list,
-    those entries are loaded instead of the built-in catalog.  Each entry
-    must have ``"name"``, ``"formula"``, and ``"category"`` keys.
-
-    Parameters
-    ----------
-    path : str or Path, optional
-        Optional JSON file to load factors from.
-
-    Returns
-    -------
-    FactorLibrary
-        A new library pre-populated with the paper's factors. Since no
-        market data is provided, signals are ``None`` and the correlation
-        matrix is not computed.
-    """
-    if path is not None:
-        path = Path(path)
-        with open(path, "r") as fp:
-            raw = json.load(fp)
-        entries = raw if isinstance(raw, list) else raw.get("factors", [])
-    else:
-        entries = PAPER_FACTORS
-
-    library = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-
-    for i, entry in enumerate(entries):
-        factor = Factor(
-            id=0,  # Will be assigned by admit_factor
-            name=entry["name"],
-            formula=entry["formula"],
-            category=entry["category"],
-            ic_mean=entry.get("ic_mean", 0.0),
-            icir=entry.get("icir", 0.0),
-            ic_win_rate=entry.get("ic_win_rate", 0.0),
-            max_correlation=entry.get("max_correlation", 0.0),
-            batch_number=entry.get("batch_number", 0),
-            admission_date=entry.get("admission_date", ""),
-            signals=None,
-        )
-        library.admit_factor(factor)
-
-    logger.info(
-        "Imported %d factors from %s",
-        library.size,
-        path if path else "built-in paper catalog",
-    )
-    return library
diff --git a/src/factorminer/factorminer/core/parser.py b/src/factorminer/factorminer/core/parser.py
deleted file mode 100644
index c71554c..0000000
--- a/src/factorminer/factorminer/core/parser.py
+++ /dev/null
@@ -1,374 +0,0 @@
-"""Recursive-descent parser for the FactorMiner factor DSL.
-
-Converts string formulas such as::
-
-    Neg(CsRank(Div(Sub($close, $vwap), $vwap)))
-
-into ``ExpressionTree`` objects backed by the operator registry defined in
-:mod:`factorminer.core.types`.
-
-Grammar (informal)
-------------------
-
-::
-
-    expression  := function_call | feature_ref | number
-    function_call := IDENTIFIER '(' arg_list ')'
-    arg_list    := expression (',' expression)*
-    feature_ref := '$' IDENTIFIER
-    number      := ['-'] DIGITS ['.' DIGITS] [('e'|'E') ['-'|'+'] DIGITS]
-
-Usage
------
-
->>> from factorminer.core.parser import parse
->>> tree = parse("Neg(Div(Sub($close, $vwap), $vwap))")
->>> tree.to_string()
-'Neg(Div(Sub($close, $vwap), $vwap))'
-"""
-
-from __future__ import annotations
-
-import re
-from dataclasses import dataclass
-from enum import Enum, auto
-from typing import Dict, List, Optional, Tuple
-
-from src.factorminer.factorminer.core.expression_tree import (
-    ConstantNode,
-    ExpressionTree,
-    LeafNode,
-    Node,
-    OperatorNode,
-)
-from src.factorminer.factorminer.core.types import FEATURE_SET, OPERATOR_REGISTRY, OperatorSpec
-
-
-# ---------------------------------------------------------------------------
-# Tokenizer
-# ---------------------------------------------------------------------------
-
-class TokenType(Enum):
-    IDENT = auto()      # operator / function name
-    FEATURE = auto()    # $close, $volume, ...
-    NUMBER = auto()     # 0.0001, -3, 1e-6, ...
-    LPAREN = auto()     # (
-    RPAREN = auto()     # )
-    COMMA = auto()      # ,
-    EOF = auto()
-
-
-@dataclass
-class Token:
-    type: TokenType
-    value: str
-    pos: int  # character position in the source string
-
-    def __repr__(self) -> str:
-        return f"Token({self.type.name}, {self.value!r}, pos={self.pos})"
-
-
-# Regex fragments
-_NUMBER_RE = re.compile(
-    r"""
-    -?                      # optional leading minus
-    (?:\d+\.?\d*|\.\d+)     # integer or decimal
-    (?:[eE][+-]?\d+)?       # optional exponent
-    """,
-    re.VERBOSE,
-)
-
-_IDENT_RE = re.compile(r"[A-Za-z_]\w*")
-_FEATURE_RE = re.compile(r"\$[A-Za-z_]\w*")
-_WS_RE = re.compile(r"\s+")
-
-
-def tokenize(source: str) -> List[Token]:
-    """Convert a formula string into a list of ``Token`` objects.
-
-    Raises
-    ------
-    SyntaxError
-        If the string contains characters that cannot be tokenized.
-    """
-    tokens: List[Token] = []
-    pos = 0
-    length = len(source)
-
-    while pos < length:
-        # Skip whitespace
-        m = _WS_RE.match(source, pos)
-        if m:
-            pos = m.end()
-            continue
-
-        ch = source[pos]
-
-        if ch == "(":
-            tokens.append(Token(TokenType.LPAREN, "(", pos))
-            pos += 1
-        elif ch == ")":
-            tokens.append(Token(TokenType.RPAREN, ")", pos))
-            pos += 1
-        elif ch == ",":
-            tokens.append(Token(TokenType.COMMA, ",", pos))
-            pos += 1
-        elif ch == "$":
-            m = _FEATURE_RE.match(source, pos)
-            if not m:
-                raise SyntaxError(
-                    f"Invalid feature reference at position {pos}: "
-                    f"{source[pos:pos+20]!r}"
-                )
-            tokens.append(Token(TokenType.FEATURE, m.group(), pos))
-            pos = m.end()
-        elif ch == "-" or ch == "." or ch.isdigit():
-            # Could be a negative number or just a number.
-            # Disambiguate: a minus is part of a number only if
-            #   (a) it's the very first token, OR
-            #   (b) the preceding token is LPAREN or COMMA
-            if ch == "-":
-                prev_tok = tokens[-1] if tokens else None
-                is_unary_minus = (
-                    prev_tok is None
-                    or prev_tok.type in (TokenType.LPAREN, TokenType.COMMA)
-                )
-                if not is_unary_minus:
-                    raise SyntaxError(
-                        f"Unexpected '-' at position {pos}. "
-                        f"Subtraction should use the Sub() operator."
-                    )
-            m = _NUMBER_RE.match(source, pos)
-            if not m:
-                raise SyntaxError(
-                    f"Invalid number at position {pos}: "
-                    f"{source[pos:pos+20]!r}"
-                )
-            tokens.append(Token(TokenType.NUMBER, m.group(), pos))
-            pos = m.end()
-        elif ch.isalpha() or ch == "_":
-            m = _IDENT_RE.match(source, pos)
-            if not m:
-                raise SyntaxError(
-                    f"Invalid identifier at position {pos}: "
-                    f"{source[pos:pos+20]!r}"
-                )
-            tokens.append(Token(TokenType.IDENT, m.group(), pos))
-            pos = m.end()
-        else:
-            raise SyntaxError(
-                f"Unexpected character {ch!r} at position {pos} in: "
-                f"{source!r}"
-            )
-
-    tokens.append(Token(TokenType.EOF, "", length))
-    return tokens
-
-
-# ---------------------------------------------------------------------------
-# Recursive descent parser
-# ---------------------------------------------------------------------------
-
-class Parser:
-    """Recursive-descent parser that converts a token stream to a ``Node``.
-
-    The parser consumes tokens left-to-right, building the expression tree
-    in a single pass.
-    """
-
-    def __init__(self, tokens: List[Token], source: str) -> None:
-        self.tokens = tokens
-        self.source = source
-        self.pos = 0
-
-    # -- helpers ------------------------------------------------------------
-
-    def _peek(self) -> Token:
-        return self.tokens[self.pos]
-
-    def _advance(self) -> Token:
-        tok = self.tokens[self.pos]
-        self.pos += 1
-        return tok
-
-    def _expect(self, tt: TokenType) -> Token:
-        tok = self._advance()
-        if tok.type != tt:
-            raise SyntaxError(
-                f"Expected {tt.name} but got {tok.type.name} ({tok.value!r}) "
-                f"at position {tok.pos} in: {self.source!r}"
-            )
-        return tok
-
-    # -- grammar rules ------------------------------------------------------
-
-    def parse_expression(self) -> Node:
-        """Parse a single expression (the start symbol)."""
-        tok = self._peek()
-
-        if tok.type == TokenType.FEATURE:
-            return self._parse_feature()
-
-        if tok.type == TokenType.NUMBER:
-            return self._parse_number()
-
-        if tok.type == TokenType.IDENT:
-            return self._parse_function_call()
-
-        raise SyntaxError(
-            f"Unexpected token {tok.type.name} ({tok.value!r}) at position "
-            f"{tok.pos} in: {self.source!r}"
-        )
-
-    def _parse_feature(self) -> LeafNode:
-        tok = self._advance()
-        if tok.value not in FEATURE_SET:
-            raise SyntaxError(
-                f"Unknown feature '{tok.value}' at position {tok.pos}. "
-                f"Expected one of {sorted(FEATURE_SET)}."
-            )
-        return LeafNode(tok.value)
-
-    def _parse_number(self) -> ConstantNode:
-        tok = self._advance()
-        try:
-            return ConstantNode(float(tok.value))
-        except ValueError:
-            raise SyntaxError(
-                f"Invalid numeric literal {tok.value!r} at position {tok.pos}."
-            )
-
-    def _parse_function_call(self) -> Node:
-        """Parse ``Name(arg1, arg2, ..., paramN)``."""
-        name_tok = self._advance()  # IDENT
-        name = name_tok.value
-
-        # Look up operator
-        spec = OPERATOR_REGISTRY.get(name)
-        if spec is None:
-            raise SyntaxError(
-                f"Unknown operator '{name}' at position {name_tok.pos}. "
-                f"Available operators: {sorted(OPERATOR_REGISTRY.keys())}"
-            )
-
-        self._expect(TokenType.LPAREN)
-
-        # Collect arguments (mix of sub-expressions and trailing numeric params)
-        args: List[Node] = []
-        raw_args: List = []  # (Node | float) to separate children from params
-
-        if self._peek().type != TokenType.RPAREN:
-            raw_args.append(self._parse_arg())
-            while self._peek().type == TokenType.COMMA:
-                self._advance()  # consume comma
-                raw_args.append(self._parse_arg())
-
-        self._expect(TokenType.RPAREN)
-
-        # Separate expression children from trailing numeric parameters.
-        # Strategy: the first ``spec.arity`` arguments that are Nodes are
-        # the children.  Remaining numeric values fill param slots in order.
-        children: List[Node] = []
-        trailing_numbers: List[float] = []
-
-        for arg in raw_args:
-            if isinstance(arg, Node) and len(children) < spec.arity:
-                children.append(arg)
-            elif isinstance(arg, (int, float)):
-                trailing_numbers.append(float(arg))
-            elif isinstance(arg, Node):
-                # Extra node arguments beyond arity — could be a ConstantNode
-                # that the user passed as a positional param (e.g. 0.0001).
-                if isinstance(arg, ConstantNode):
-                    trailing_numbers.append(arg.value)
-                else:
-                    children.append(arg)
-            else:
-                trailing_numbers.append(float(arg))
-
-        # Validate arity
-        if len(children) != spec.arity:
-            raise SyntaxError(
-                f"Operator '{name}' expects {spec.arity} expression "
-                f"argument(s) but got {len(children)} at position "
-                f"{name_tok.pos}."
-            )
-
-        # Map trailing numbers to parameter names
-        params: Dict[str, float] = {}
-        for i, pname in enumerate(spec.param_names):
-            if i < len(trailing_numbers):
-                params[pname] = trailing_numbers[i]
-
-        return OperatorNode(spec, children, params)
-
-    def _parse_arg(self):
-        """Parse a single argument inside a function call.
-
-        Returns either a ``Node`` (for sub-expressions) or a bare ``float``
-        for numeric literals that might be operator parameters.
-        """
-        tok = self._peek()
-
-        if tok.type == TokenType.NUMBER:
-            # Peek ahead: if this number is followed by COMMA or RPAREN it
-            # could be a trailing parameter.  We still return a ConstantNode
-            # and let the caller decide.
-            num_tok = self._advance()
-            val = float(num_tok.value)
-            # If the next token is LPAREN, that's weird — just return as
-            # constant.
-            return ConstantNode(val)
-
-        return self.parse_expression()
-
-
-# ---------------------------------------------------------------------------
-# Public API
-# ---------------------------------------------------------------------------
-
-def parse(source: str) -> ExpressionTree:
-    """Parse a factor formula string into an ``ExpressionTree``.
-
-    Parameters
-    ----------
-    source : str
-        A formula in the FactorMiner DSL, e.g.
-        ``"Neg(CsRank(Div(Sub($close, $vwap), $vwap)))"``.
-
-    Returns
-    -------
-    ExpressionTree
-
-    Raises
-    ------
-    SyntaxError
-        If the formula is malformed or references unknown operators / features.
-
-    Examples
-    --------
-    >>> tree = parse("Neg($close)")
-    >>> tree.to_string()
-    'Neg($close)'
-    """
-    tokens = tokenize(source.strip())
-    parser = Parser(tokens, source)
-    root = parser.parse_expression()
-
-    # Ensure we consumed everything
-    remaining = parser._peek()
-    if remaining.type != TokenType.EOF:
-        raise SyntaxError(
-            f"Unexpected trailing content at position {remaining.pos}: "
-            f"{remaining.value!r} in: {source!r}"
-        )
-
-    return ExpressionTree(root)
-
-
-def try_parse(source: str) -> Optional[ExpressionTree]:
-    """Like :func:`parse` but returns ``None`` on failure instead of raising."""
-    try:
-        return parse(source)
-    except (SyntaxError, KeyError, ValueError):
-        return None
diff --git a/src/factorminer/factorminer/core/provenance.py b/src/factorminer/factorminer/core/provenance.py
deleted file mode 100644
index 93e4d91..0000000
--- a/src/factorminer/factorminer/core/provenance.py
+++ /dev/null
@@ -1,241 +0,0 @@
-"""Run and factor provenance helpers for mining sessions.
-
-This module keeps provenance data compact, JSON-safe, and stable across
-save/load boundaries.
-"""
-
-from __future__ import annotations
-
-from dataclasses import asdict, dataclass, field, is_dataclass
-from datetime import datetime
-import hashlib
-import json
-from typing import Any, Dict, List, Mapping, Optional, Sequence
-
-import numpy as np
-
-
-def _json_safe(value: Any) -> Any:
-    """Recursively convert common scientific Python objects into JSON-safe data."""
-    if is_dataclass(value):
-        return _json_safe(asdict(value))
-    if isinstance(value, np.ndarray):
-        return value.tolist()
-    if isinstance(value, np.generic):
-        return value.item()
-    if isinstance(value, Mapping):
-        return {str(k): _json_safe(v) for k, v in value.items()}
-    if isinstance(value, (list, tuple)):
-        return [_json_safe(v) for v in value]
-    return value
-
-
-def stable_digest(payload: Any) -> str:
-    """Compute a stable SHA256 digest for a JSON-serializable payload."""
-    normalized = _json_safe(payload)
-    blob = json.dumps(normalized, sort_keys=True, separators=(",", ":"), default=str)
-    return hashlib.sha256(blob.encode("utf-8")).hexdigest()
-
-
-def _compact_reference_list(entries: Any, limit: int = 8) -> List[str]:
-    """Normalize a mixed list of factor references into readable strings."""
-    if not entries:
-        return []
-
-    if isinstance(entries, (str, Mapping)):
-        iterable: Sequence[Any] = [entries]
-    else:
-        iterable = list(entries)
-
-    values: List[str] = []
-    seen: set[str] = set()
-    for entry in iterable[:limit]:
-        text = ""
-        if isinstance(entry, str):
-            text = entry.strip()
-        elif isinstance(entry, Mapping):
-            name = str(entry.get("name", "")).strip()
-            formula = str(entry.get("formula", "")).strip()
-            category = str(entry.get("category", "")).strip()
-            if name and formula:
-                text = f"{name}: {formula}"
-            elif name and category:
-                text = f"{name} [{category}]"
-            elif name:
-                text = name
-            elif formula:
-                text = formula
-        elif entry is not None:
-            text = str(entry).strip()
-
-        if text and text not in seen:
-            values.append(text)
-            seen.add(text)
-    return values
-
-
-def _compact_memory_signal(memory_signal: Optional[Mapping[str, Any]]) -> Dict[str, Any]:
-    """Keep only the most useful pieces of memory context."""
-    if not memory_signal:
-        return {}
-
-    return {
-        "library_state": _json_safe(memory_signal.get("library_state", {})),
-        "recommended_directions": _compact_reference_list(
-            memory_signal.get("recommended_directions", [])
-        ),
-        "forbidden_directions": _compact_reference_list(
-            memory_signal.get("forbidden_directions", [])
-        ),
-        "insight_count": len(memory_signal.get("insights", []) or []),
-        "semantic_neighbors": _compact_reference_list(
-            memory_signal.get("semantic_neighbors", [])
-        ),
-        "semantic_duplicates": _compact_reference_list(
-            memory_signal.get("semantic_duplicates", [])
-        ),
-        "semantic_gaps": _compact_reference_list(
-            memory_signal.get("semantic_gaps", [])
-        ),
-        "complementary_patterns": _compact_reference_list(
-            memory_signal.get("complementary_patterns", [])
-        ),
-    }
-
-
-@dataclass
-class RunManifest:
-    """Serializable description of a mining run."""
-
-    manifest_version: str = "1.0"
-    run_id: str = ""
-    session_id: str = ""
-    loop_type: str = "ralph"
-    benchmark_mode: str = "paper"
-    created_at: str = ""
-    updated_at: str = ""
-    iteration: int = 0
-    library_size: int = 0
-    output_dir: str = ""
-    config_digest: str = ""
-    config_summary: Dict[str, Any] = field(default_factory=dict)
-    dataset_summary: Dict[str, Any] = field(default_factory=dict)
-    phase2_features: List[str] = field(default_factory=list)
-    target_stack: List[str] = field(default_factory=list)
-    artifact_paths: Dict[str, str] = field(default_factory=dict)
-    notes: List[str] = field(default_factory=list)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return _json_safe(asdict(self))
-
-
-@dataclass
-class FactorProvenance:
-    """Serializable provenance payload attached to an admitted factor."""
-
-    manifest_version: str = "1.0"
-    run_id: str = ""
-    session_id: str = ""
-    loop_type: str = "ralph"
-    created_at: str = ""
-    iteration: int = 0
-    batch_number: int = 0
-    candidate_rank: int = 0
-    factor_name: str = ""
-    formula: str = ""
-    factor_category: str = ""
-    factor_id: int = 0
-    generator_family: str = ""
-    memory_summary: Dict[str, Any] = field(default_factory=dict)
-    library_snapshot: Dict[str, Any] = field(default_factory=dict)
-    evaluation: Dict[str, Any] = field(default_factory=dict)
-    admission: Dict[str, Any] = field(default_factory=dict)
-    phase2: Dict[str, Any] = field(default_factory=dict)
-    target_stack: List[str] = field(default_factory=list)
-    research_metrics: Dict[str, Any] = field(default_factory=dict)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return _json_safe(asdict(self))
-
-
-def build_run_manifest(
-    *,
-    run_id: str,
-    session_id: str,
-    loop_type: str,
-    benchmark_mode: str,
-    created_at: str,
-    updated_at: str,
-    iteration: int,
-    library_size: int,
-    output_dir: str,
-    config_summary: Mapping[str, Any],
-    dataset_summary: Mapping[str, Any],
-    phase2_features: Sequence[str],
-    target_stack: Sequence[str],
-    artifact_paths: Optional[Mapping[str, str]] = None,
-    notes: Optional[Sequence[str]] = None,
-) -> RunManifest:
-    """Build a run manifest from the live loop state."""
-    return RunManifest(
-        run_id=run_id,
-        session_id=session_id,
-        loop_type=loop_type,
-        benchmark_mode=benchmark_mode,
-        created_at=created_at,
-        updated_at=updated_at,
-        iteration=iteration,
-        library_size=library_size,
-        output_dir=output_dir,
-        config_digest=stable_digest(config_summary),
-        config_summary=_json_safe(dict(config_summary)),
-        dataset_summary=_json_safe(dict(dataset_summary)),
-        phase2_features=list(phase2_features),
-        target_stack=list(target_stack),
-        artifact_paths=_json_safe(dict(artifact_paths or {})),
-        notes=list(notes or []),
-    )
-
-
-def build_factor_provenance(
-    *,
-    run_manifest: Mapping[str, Any],
-    factor_name: str,
-    formula: str,
-    factor_category: str,
-    factor_id: int,
-    iteration: int,
-    batch_number: int,
-    candidate_rank: int,
-    generator_family: str,
-    memory_signal: Optional[Mapping[str, Any]],
-    library_state: Optional[Mapping[str, Any]],
-    evaluation: Mapping[str, Any],
-    admission: Mapping[str, Any],
-    phase2: Optional[Mapping[str, Any]] = None,
-    target_stack: Optional[Sequence[str]] = None,
-    research_metrics: Optional[Mapping[str, Any]] = None,
-) -> FactorProvenance:
-    """Build per-factor provenance from the current mining context."""
-    manifest = dict(run_manifest)
-    return FactorProvenance(
-        run_id=str(manifest.get("run_id", "")),
-        session_id=str(manifest.get("session_id", "")),
-        loop_type=str(manifest.get("loop_type", "ralph")),
-        created_at=str(datetime.now().isoformat()),
-        iteration=iteration,
-        batch_number=batch_number,
-        candidate_rank=candidate_rank,
-        factor_name=factor_name,
-        formula=formula,
-        factor_category=factor_category,
-        factor_id=factor_id,
-        generator_family=generator_family,
-        memory_summary=_compact_memory_signal(memory_signal),
-        library_snapshot=_json_safe(dict(library_state or {})),
-        evaluation=_json_safe(dict(evaluation)),
-        admission=_json_safe(dict(admission)),
-        phase2=_json_safe(dict(phase2 or {})),
-        target_stack=list(target_stack or manifest.get("target_stack", [])),
-        research_metrics=_json_safe(dict(research_metrics or {})),
-    )
diff --git a/src/factorminer/factorminer/core/ralph_loop.py b/src/factorminer/factorminer/core/ralph_loop.py
deleted file mode 100644
index 172af48..0000000
--- a/src/factorminer/factorminer/core/ralph_loop.py
+++ /dev/null
@@ -1,1598 +0,0 @@
-"""The Ralph Loop: self-evolving factor discovery algorithm.
-
-Implements Algorithm 1 from the FactorMiner paper.  The loop iteratively:
-  1. Retrieves memory priors from experience memory  -- R(M, L)
-  2. Generates candidate factors via LLM guided by memory -- G(m, L)
-  3. Evaluates candidates through a multi-stage pipeline:
-     - Stage 1: Fast IC screening on M_fast assets
-     - Stage 2: Correlation check against library L
-     - Stage 2.5: Replacement check for correlated candidates
-     - Stage 3: Intra-batch deduplication (pairwise rho < theta)
-     - Stage 4: Full validation on M_full assets + trajectory collection
-  4. Updates the factor library with admitted factors  -- L <- L + {alpha}
-  5. Evolves the experience memory with new insights   -- E(M, F(M, tau))
-
-The loop terminates when the library reaches the target size K or the
-maximum number of iterations is exhausted.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import re
-import time
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from dataclasses import asdict, dataclass, field
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.core.library_io import save_library, load_library
-from src.factorminer.factorminer.core.provenance import build_factor_provenance, build_run_manifest
-from src.factorminer.factorminer.core.parser import try_parse
-from src.factorminer.factorminer.core.session import MiningSession
-from src.factorminer.factorminer.core.types import FEATURES
-from src.factorminer.factorminer.memory.experience_memory import ExperienceMemoryManager
-from src.factorminer.factorminer.memory.memory_store import ExperienceMemory
-from src.factorminer.factorminer.memory.retrieval import retrieve_memory
-from src.factorminer.factorminer.memory.formation import form_memory
-from src.factorminer.factorminer.memory.evolution import evolve_memory
-from src.factorminer.factorminer.agent.llm_interface import LLMProvider, MockProvider
-from src.factorminer.factorminer.agent.prompt_builder import PromptBuilder
-from src.factorminer.factorminer.evaluation.metrics import (
-    compute_factor_stats,
-    compute_ic,
-    compute_ic_mean,
-    compute_ic_win_rate,
-    compute_icir,
-)
-from src.factorminer.factorminer.evaluation.research import (
-    build_score_vector,
-    compute_factor_geometry,
-    passes_research_admission,
-)
-from src.factorminer.factorminer.evaluation.runtime import SignalComputationError, compute_tree_signals
-from src.factorminer.factorminer.utils.logging import (
-    IterationRecord,
-    FactorRecord,
-    MiningSessionLogger,
-)
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Budget Tracker
-# ---------------------------------------------------------------------------
-
-@dataclass
-class BudgetTracker:
-    """Tracks resource consumption across the mining session.
-
-    Monitors LLM token usage, GPU compute time, and wall-clock time
-    so the loop can stop early when a budget is exhausted.
-    """
-
-    max_llm_calls: int = 0      # 0 = unlimited
-    max_wall_seconds: float = 0  # 0 = unlimited
-
-    # Running totals
-    llm_calls: int = 0
-    llm_prompt_tokens: int = 0
-    llm_completion_tokens: int = 0
-    compute_seconds: float = 0.0
-    wall_start: float = field(default_factory=time.time)
-
-    def record_llm_call(
-        self,
-        prompt_tokens: int = 0,
-        completion_tokens: int = 0,
-    ) -> None:
-        self.llm_calls += 1
-        self.llm_prompt_tokens += prompt_tokens
-        self.llm_completion_tokens += completion_tokens
-
-    def record_compute(self, seconds: float) -> None:
-        self.compute_seconds += seconds
-
-    @property
-    def wall_elapsed(self) -> float:
-        return time.time() - self.wall_start
-
-    @property
-    def total_tokens(self) -> int:
-        return self.llm_prompt_tokens + self.llm_completion_tokens
-
-    def is_exhausted(self) -> bool:
-        """True if any budget limit has been reached."""
-        if self.max_llm_calls > 0 and self.llm_calls >= self.max_llm_calls:
-            return True
-        if self.max_wall_seconds > 0 and self.wall_elapsed >= self.max_wall_seconds:
-            return True
-        return False
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            "llm_calls": self.llm_calls,
-            "llm_prompt_tokens": self.llm_prompt_tokens,
-            "llm_completion_tokens": self.llm_completion_tokens,
-            "total_tokens": self.total_tokens,
-            "compute_seconds": round(self.compute_seconds, 2),
-            "wall_elapsed_seconds": round(self.wall_elapsed, 2),
-        }
-
-
-# ---------------------------------------------------------------------------
-# Candidate evaluation result
-# ---------------------------------------------------------------------------
-
-@dataclass
-class EvaluationResult:
-    """Result of evaluating a single candidate factor."""
-
-    factor_name: str
-    formula: str
-    parse_ok: bool = False
-    ic_mean: float = 0.0
-    icir: float = 0.0
-    ic_win_rate: float = 0.0
-    max_correlation: float = 0.0
-    correlated_with: str = ""
-    admitted: bool = False
-    replaced: Optional[int] = None   # ID of replaced factor, if any
-    rejection_reason: str = ""
-    stage_passed: int = 0  # 0=parse/IC fail, 1=IC pass, 2=corr pass, 3=dedup pass, 4=admitted
-    signals: Optional[np.ndarray] = None
-    target_stats: Dict[str, dict] = field(default_factory=dict)
-    research_score: float = 0.0
-    research_lcb: float = 0.0
-    residual_ic: float = 0.0
-    projection_loss: float = 0.0
-    effective_rank_gain: float = 0.0
-    score_vector: Optional[dict[str, Any]] = None
-
-
-# ---------------------------------------------------------------------------
-# Factor Generator: wraps LLM + prompt builder + output parser
-# ---------------------------------------------------------------------------
-
-class FactorGenerator:
-    """Generates candidate factors using LLM guided by memory priors."""
-
-    def __init__(
-        self,
-        llm_provider: Optional[LLMProvider] = None,
-        prompt_builder: Optional[PromptBuilder] = None,
-    ) -> None:
-        self.llm = llm_provider or MockProvider()
-        self.prompt_builder = prompt_builder or PromptBuilder()
-
-    def generate_batch(
-        self,
-        memory_signal: Dict[str, Any],
-        library_state: Dict[str, Any],
-        batch_size: int = 40,
-    ) -> List[Tuple[str, str]]:
-        """Generate a batch of candidate factors.
-
-        Returns
-        -------
-        list of (name, formula) tuples
-        """
-        user_prompt = self.prompt_builder.build_user_prompt(
-            memory_signal, library_state, batch_size
-        )
-        raw_response = self.llm.generate(
-            system_prompt=self.prompt_builder.system_prompt,
-            user_prompt=user_prompt,
-        )
-        return self._parse_response(raw_response)
-
-    @staticmethod
-    def _parse_response(raw: str) -> List[Tuple[str, str]]:
-        """Parse LLM output into (name, formula) pairs.
-
-        Expected format per line:
-            <number>. <name>: <formula>
-        """
-        candidates: List[Tuple[str, str]] = []
-        for line in raw.strip().splitlines():
-            line = line.strip()
-            if not line:
-                continue
-            # Match patterns like "1. factor_name: Formula(...)"
-            m = re.match(
-                r"^\d+\.\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*(.+)$",
-                line,
-            )
-            if m:
-                name = m.group(1).strip()
-                formula = m.group(2).strip()
-                candidates.append((name, formula))
-        return candidates
-
-
-# ---------------------------------------------------------------------------
-# Validation Pipeline (lightweight orchestrator)
-# ---------------------------------------------------------------------------
-
-class ValidationPipeline:
-    """Multi-stage evaluation pipeline for candidate factors.
-
-    Implements the full 4-stage evaluation from the paper:
-      Stage 1: Fast IC screening on M_fast assets  -> C1
-      Stage 2: Correlation check against library L  -> C2 (+ replacement for C1\\C2)
-      Stage 3: Intra-batch deduplication (pairwise rho < theta)  -> C3
-      Stage 4: Full validation on M_full assets + trajectory collection
-    """
-
-    def __init__(
-        self,
-        data_tensor: np.ndarray,
-        returns: np.ndarray,
-        target_panels: Optional[Dict[str, np.ndarray]] = None,
-        target_horizons: Optional[Dict[str, int]] = None,
-        library: Optional[FactorLibrary] = None,
-        ic_threshold: float = 0.04,
-        icir_threshold: float = 0.5,
-        replacement_ic_min: float = 0.10,
-        replacement_ic_ratio: float = 1.3,
-        fast_screen_assets: int = 100,
-        num_workers: int = 1,
-        research_config: Any = None,
-        benchmark_mode: str = "paper",
-    ) -> None:
-        self.data_tensor = data_tensor  # (M, T, F)
-        self.returns = returns  # (M, T)
-        self.target_panels = target_panels or {"paper": returns}
-        self.target_horizons = target_horizons or {"paper": 1}
-        self.library = library or FactorLibrary(
-            correlation_threshold=0.5,
-            ic_threshold=ic_threshold,
-        )
-        self.ic_threshold = ic_threshold
-        self.icir_threshold = icir_threshold
-        self.replacement_ic_min = replacement_ic_min
-        self.replacement_ic_ratio = replacement_ic_ratio
-        self.fast_screen_assets = fast_screen_assets
-        self.num_workers = num_workers
-        self.signal_failure_policy = "reject"
-        self.research_config = research_config
-        self.benchmark_mode = benchmark_mode
-
-        # Pre-compute the fast-screen asset subset indices
-        M = returns.shape[0]
-        if fast_screen_assets > 0 and fast_screen_assets < M:
-            rng = np.random.RandomState(0)
-            self._fast_indices = rng.choice(M, fast_screen_assets, replace=False)
-            self._fast_indices.sort()
-        else:
-            self._fast_indices = np.arange(M)
-
-    def evaluate_candidate(
-        self,
-        name: str,
-        formula: str,
-        fast_screen: bool = True,
-    ) -> EvaluationResult:
-        """Evaluate a single candidate through the full pipeline.
-
-        Parameters
-        ----------
-        name : str
-            Candidate factor name.
-        formula : str
-            DSL formula string.
-        fast_screen : bool
-            If True, Stage 1 uses M_fast assets only.  If False, uses all.
-        """
-        result = EvaluationResult(factor_name=name, formula=formula)
-
-        # Stage 0: Parse
-        tree = try_parse(formula)
-        if tree is None:
-            result.rejection_reason = "Parse failure"
-            result.stage_passed = 0
-            return result
-        result.parse_ok = True
-
-        # Stage 1: Compute signals and fast IC screening
-        try:
-            signals = self._compute_signals(tree)
-        except SignalComputationError as exc:
-            result.rejection_reason = f"Signal computation error: {exc}"
-            result.stage_passed = 0
-            return result
-
-        if signals is None or np.all(np.isnan(signals)):
-            result.rejection_reason = "All-NaN signals"
-            result.stage_passed = 0
-            return result
-
-        result.signals = signals
-
-        # Fast IC screen on M_fast asset subset
-        if fast_screen and len(self._fast_indices) < signals.shape[0]:
-            fast_signals = signals[self._fast_indices, :]
-            fast_returns = self.returns[self._fast_indices, :]
-            fast_stats = compute_factor_stats(fast_signals, fast_returns)
-            fast_ic = fast_stats["ic_abs_mean"]
-
-            if fast_ic < self.ic_threshold:
-                result.ic_mean = fast_ic
-                result.rejection_reason = (
-                    f"Fast-screen IC {fast_ic:.4f} < threshold {self.ic_threshold}"
-                )
-                result.stage_passed = 0
-                return result
-
-        # Full IC statistics on all assets
-        stats = compute_factor_stats(signals, self.returns)
-        result.ic_mean = stats["ic_abs_mean"]
-        result.icir = stats["icir"]
-        result.ic_win_rate = stats["ic_win_rate"]
-        result.target_stats = {"paper": stats}
-
-        if self.target_panels:
-            for target_name, target_returns in self.target_panels.items():
-                if target_name == "paper":
-                    continue
-                result.target_stats[target_name] = compute_factor_stats(signals, target_returns)
-
-        score_vector_obj = None
-        if self._research_enabled():
-            library_signals = [factor.signals for factor in self.library.list_factors() if factor.signals is not None]
-            geometry = compute_factor_geometry(signals, self.returns, library_signals)
-            score_vector_obj = build_score_vector(
-                result.target_stats,
-                self.target_horizons,
-                self.research_config,
-                geometry,
-            )
-            result.score_vector = score_vector_obj.to_dict()
-            result.research_score = score_vector_obj.primary_score
-            result.research_lcb = score_vector_obj.lower_confidence_bound
-            result.residual_ic = score_vector_obj.geometry.residual_ic
-            result.projection_loss = score_vector_obj.geometry.projection_loss
-            result.effective_rank_gain = score_vector_obj.geometry.effective_rank_gain
-
-        # Stage 1 gate: IC threshold (full data)
-        quality_gate = result.ic_mean
-        quality_label = "IC"
-        if self._research_enabled():
-            quality_gate = result.research_score
-            quality_label = "Research score"
-
-        if quality_gate < self.ic_threshold:
-            result.rejection_reason = (
-                f"{quality_label} {quality_gate:.4f} < threshold {self.ic_threshold}"
-            )
-            result.stage_passed = 0
-            return result
-        if result.icir < self.icir_threshold:
-            result.rejection_reason = (
-                f"ICIR {result.icir:.4f} < threshold {self.icir_threshold}"
-            )
-            result.stage_passed = 0
-            return result
-        result.stage_passed = 1
-
-        if self._research_enabled():
-            admitted, reason = passes_research_admission(
-                score_vector_obj,
-                self.research_config,
-                self.library.correlation_threshold,
-            )
-            result.max_correlation = result.score_vector["geometry"]["max_abs_correlation"]
-            if admitted:
-                result.admitted = True
-                result.stage_passed = 3
-                return result
-            result.stage_passed = 2
-            result.rejection_reason = reason
-            replace_id, replace_reason = self._research_replacement(result)
-            if replace_id is not None:
-                result.admitted = True
-                result.replaced = replace_id
-                result.rejection_reason = replace_reason
-                result.stage_passed = 3
-            return result
-
-        # Stage 2: Correlation check against library (admission)
-        admitted, reason = self.library.check_admission(
-            result.ic_mean, signals
-        )
-        if admitted:
-            result.admitted = True
-            result.stage_passed = 3
-            if self.library.size > 0:
-                result.max_correlation = self.library._max_correlation_with_library(
-                    signals
-                )
-            return result
-
-        result.stage_passed = 2
-
-        # Stage 2.5: Replacement check for candidates that failed admission
-        should_replace, replace_id, replace_reason = self.library.check_replacement(
-            result.ic_mean,
-            signals,
-            ic_min=self.replacement_ic_min,
-            ic_ratio=self.replacement_ic_ratio,
-        )
-        if should_replace and replace_id is not None:
-            result.admitted = True
-            result.replaced = replace_id
-            result.max_correlation = self.library._max_correlation_with_library(
-                signals
-            )
-            result.stage_passed = 3
-            return result
-
-        # Rejected by correlation
-        result.rejection_reason = reason
-        if self.library.size > 0:
-            result.max_correlation = self.library._max_correlation_with_library(
-                signals
-            )
-        return result
-
-    def _research_enabled(self) -> bool:
-        return bool(
-            self.research_config is not None
-            and getattr(self.research_config, "enabled", False)
-            and self.benchmark_mode == "research"
-        )
-
-    def _research_replacement(self, result: EvaluationResult) -> tuple[Optional[int], str]:
-        if result.score_vector is None or self.library.size == 0:
-            return None, result.rejection_reason
-
-        conflicting: list[tuple[int, float]] = []
-        for factor in self.library.list_factors():
-            if factor.signals is None:
-                continue
-            corr = self.library._compute_correlation_vectorized(result.signals, factor.signals)
-            if corr >= self.library.correlation_threshold:
-                conflicting.append((factor.id, corr))
-        if len(conflicting) != 1:
-            return None, result.rejection_reason
-
-        target_id, _ = conflicting[0]
-        target_factor = self.library.get_factor(target_id)
-        target_score = float(target_factor.research_metrics.get("primary_score", target_factor.ic_mean))
-        if result.research_score < max(self.replacement_ic_min, self.replacement_ic_ratio * target_score):
-            return None, (
-                f"Research replacement score {result.research_score:.4f} "
-                f"not strong enough to replace factor {target_id} ({target_score:.4f})"
-            )
-        return target_id, f"Research replacement over factor {target_id}"
-
-    def evaluate_batch(
-        self, candidates: List[Tuple[str, str]]
-    ) -> List[EvaluationResult]:
-        """Evaluate a batch through all stages including intra-batch dedup.
-
-        Stage 1-2.5 are run per-candidate (optionally in parallel).
-        Stage 3 (dedup) runs on all admitted candidates together.
-        """
-        # Stage 1 + 2 + 2.5: per-candidate evaluation
-        if self.num_workers > 1:
-            results = self._evaluate_parallel(candidates)
-        else:
-            results = []
-            for name, formula in candidates:
-                result = self.evaluate_candidate(name, formula)
-                results.append(result)
-
-        # Stage 3: Intra-batch deduplication
-        results = self._deduplicate_batch(results)
-
-        return results
-
-    def _evaluate_parallel(
-        self, candidates: List[Tuple[str, str]]
-    ) -> List[EvaluationResult]:
-        """Evaluate candidates using a thread pool.
-
-        Note: uses threads rather than processes because signals arrays
-        are large and sharing via processes would require serialization.
-        """
-        from concurrent.futures import ThreadPoolExecutor
-
-        results: List[Optional[EvaluationResult]] = [None] * len(candidates)
-
-        def _eval(idx: int, name: str, formula: str) -> Tuple[int, EvaluationResult]:
-            return idx, self.evaluate_candidate(name, formula)
-
-        with ThreadPoolExecutor(max_workers=self.num_workers) as pool:
-            futures = [
-                pool.submit(_eval, i, name, formula)
-                for i, (name, formula) in enumerate(candidates)
-            ]
-            for future in as_completed(futures):
-                idx, result = future.result()
-                results[idx] = result
-
-        return [r for r in results if r is not None]
-
-    def _deduplicate_batch(
-        self, results: List[EvaluationResult]
-    ) -> List[EvaluationResult]:
-        """Stage 3: Remove intra-batch duplicates among admitted candidates.
-
-        For candidates that passed Stages 1-2, check pairwise correlation
-        within the batch.  If two admitted candidates are correlated above
-        theta, keep the one with higher IC and reject the other.
-        """
-        admitted_indices = [
-            i for i, r in enumerate(results)
-            if r.admitted and r.signals is not None
-        ]
-
-        if len(admitted_indices) <= 1:
-            return results
-
-        # Compute pairwise correlations among admitted candidates
-        admitted_signals = [results[i].signals for i in admitted_indices]
-        corr_threshold = self.library.correlation_threshold
-
-        # Greedy dedup: iterate in order of descending IC, keep non-correlated
-        admitted_by_ic = sorted(
-            admitted_indices,
-            key=lambda i: (
-                results[i].research_score if self._research_enabled() else results[i].ic_mean
-            ),
-            reverse=True,
-        )
-
-        kept_indices: List[int] = []
-        kept_signals: List[np.ndarray] = []
-
-        for idx in admitted_by_ic:
-            r = results[idx]
-            is_correlated = False
-
-            for kept_sig in kept_signals:
-                corr = self.library._compute_correlation_vectorized(
-                    r.signals, kept_sig
-                )
-                if corr >= corr_threshold:
-                    is_correlated = True
-                    break
-
-            if is_correlated:
-                # Reject this candidate from the batch due to intra-batch dup
-                results[idx] = EvaluationResult(
-                    factor_name=r.factor_name,
-                    formula=r.formula,
-                    parse_ok=r.parse_ok,
-                    ic_mean=r.ic_mean,
-                    icir=r.icir,
-                    ic_win_rate=r.ic_win_rate,
-                    max_correlation=r.max_correlation,
-                    correlated_with=r.correlated_with,
-                    admitted=False,
-                    replaced=None,
-                    rejection_reason="Intra-batch deduplication (correlated with higher-IC batch member)",
-                    stage_passed=2,
-                    signals=r.signals,
-                )
-            else:
-                kept_indices.append(idx)
-                kept_signals.append(r.signals)
-
-        dedup_rejected = len(admitted_indices) - len(kept_indices)
-        if dedup_rejected > 0:
-            logger.debug(
-                "Intra-batch dedup: rejected %d/%d admitted candidates",
-                dedup_rejected, len(admitted_indices),
-            )
-
-        return results
-
-    def _build_data_dict(self) -> Dict[str, np.ndarray]:
-        """Convert data_tensor to a dict mapping feature names to (M, T) arrays.
-
-        Handles two formats:
-          - dict: already maps ``"$close"`` etc. to ``(M, T)`` arrays.
-          - np.ndarray of shape ``(M, T, F)``: sliced along the last axis
-            using the canonical ``FEATURES`` ordering.
-        """
-        if isinstance(self.data_tensor, dict):
-            return self.data_tensor
-
-        # (M, T, F) numpy array — map each feature slice
-        data_dict: Dict[str, np.ndarray] = {}
-        n_features = self.data_tensor.shape[2] if self.data_tensor.ndim == 3 else 0
-        for i, feat_name in enumerate(FEATURES):
-            if i < n_features:
-                data_dict[feat_name] = self.data_tensor[:, :, i]
-        return data_dict
-
-    def _compute_signals(self, tree) -> Optional[np.ndarray]:
-        """Compute factor signals from expression tree on the data tensor.
-
-        Evaluates the parsed expression tree against the market data using
-        the tree's own ``evaluate()`` method which dispatches through the
-        numpy operator implementations under the configured failure policy.
-        """
-        data_dict = self._build_data_dict()
-        return compute_tree_signals(
-            tree,
-            data_dict,
-            self.returns.shape,
-            signal_failure_policy=self.signal_failure_policy,
-        )
-
-
-# ---------------------------------------------------------------------------
-# Mining Reporter
-# ---------------------------------------------------------------------------
-
-class MiningReporter:
-    """Lightweight reporter that logs batch results to a JSONL file."""
-
-    def __init__(self, output_dir: str = "./output") -> None:
-        self.output_dir = Path(output_dir)
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-        self._log_path = self.output_dir / "mining_batches.jsonl"
-
-    def log_batch(self, iteration: int, **stats: Any) -> None:
-        """Append a batch record to the JSONL log."""
-        record = {"iteration": iteration, "timestamp": time.time()}
-        record.update(stats)
-        with open(self._log_path, "a") as f:
-            f.write(json.dumps(record, default=str) + "\n")
-
-    def export_library(
-        self, library: FactorLibrary, path: Optional[str] = None
-    ) -> str:
-        """Export the factor library to JSON."""
-        if path is None:
-            path = str(self.output_dir / "factor_library.json")
-        factors = [f.to_dict() for f in library.list_factors()]
-        diagnostics = library.get_diagnostics()
-        payload = {
-            "factors": factors,
-            "diagnostics": diagnostics,
-            "exported_at": datetime.now().isoformat(),
-        }
-        with open(path, "w") as f:
-            json.dump(payload, f, indent=2, default=str)
-        return path
-
-
-# ---------------------------------------------------------------------------
-# The Ralph Loop
-# ---------------------------------------------------------------------------
-
-class RalphLoop:
-    """Self-Evolving Factor Discovery via the Ralph Loop paradigm.
-
-    The Ralph Loop iteratively:
-      1. Retrieves memory priors from experience memory  -- R(M, L)
-      2. Generates candidate factors via LLM guided by memory  -- G(m, L)
-      3. Evaluates candidates through multi-stage pipeline  -- V(alpha)
-      4. Updates the factor library with admitted factors  -- L <- L + {alpha}
-      5. Evolves the experience memory with new insights  -- E(M, F(M, tau))
-
-    This implements Algorithm 1 from the FactorMiner paper.
-    """
-
-    def __init__(
-        self,
-        config: Any,
-        data_tensor: np.ndarray,
-        returns: np.ndarray,
-        llm_provider: Optional[LLMProvider] = None,
-        memory: Optional[ExperienceMemory] = None,
-        library: Optional[FactorLibrary] = None,
-        checkpoint_interval: int = 1,
-    ) -> None:
-        """Initialize the Ralph Loop.
-
-        Parameters
-        ----------
-        config : MiningConfig
-            Mining configuration (from core.config or utils.config).
-        data_tensor : np.ndarray
-            Market data tensor D in R^(M x T x F).
-        returns : np.ndarray
-            Forward returns array R in R^(M x T).
-        llm_provider : LLMProvider, optional
-            LLM provider for factor generation.  Defaults to MockProvider.
-        memory : ExperienceMemory, optional
-            Pre-populated experience memory.  Defaults to empty memory.
-        library : FactorLibrary, optional
-            Pre-populated factor library.  Defaults to empty library.
-        checkpoint_interval : int
-            Save a checkpoint every N iterations.  Set to 0 to disable
-            automatic checkpointing.  Default is 1 (every iteration).
-        """
-        self.config = config
-        self.data_tensor = data_tensor
-        self.returns = returns
-        self.checkpoint_interval = checkpoint_interval
-
-        # Core components
-        self.library = library or FactorLibrary(
-            correlation_threshold=getattr(config, "correlation_threshold", 0.5),
-            ic_threshold=getattr(config, "ic_threshold", 0.04),
-        )
-        self.memory = memory or ExperienceMemory()
-        self.memory_manager: Optional[ExperienceMemoryManager] = None
-        self.generator = FactorGenerator(
-            llm_provider=llm_provider,
-            prompt_builder=PromptBuilder(),
-        )
-        self.pipeline = ValidationPipeline(
-            data_tensor=data_tensor,
-            returns=returns,
-            target_panels=getattr(config, "target_panels", None),
-            target_horizons=getattr(config, "target_horizons", None),
-            library=self.library,
-            ic_threshold=getattr(config, "ic_threshold", 0.04),
-            icir_threshold=getattr(config, "icir_threshold", 0.5),
-            replacement_ic_min=getattr(config, "replacement_ic_min", 0.10),
-            replacement_ic_ratio=getattr(config, "replacement_ic_ratio", 1.3),
-            fast_screen_assets=getattr(config, "fast_screen_assets", 100),
-            num_workers=getattr(config, "num_workers", 1),
-            research_config=getattr(config, "research", None),
-            benchmark_mode=getattr(config, "benchmark_mode", "paper"),
-        )
-        self.pipeline.signal_failure_policy = getattr(
-            config, "signal_failure_policy", "reject"
-        )
-        self.reporter = MiningReporter(
-            getattr(config, "output_dir", "./output")
-        )
-        self.budget = BudgetTracker()
-        self.signal_failure_policy = getattr(config, "signal_failure_policy", "reject")
-
-        # Session state
-        self.iteration = 0
-        self._session: Optional[MiningSession] = None
-        self._session_logger: Optional[MiningSessionLogger] = None
-        self._run_manifest: Dict[str, Any] = {}
-
-    # ------------------------------------------------------------------
-    # Main loop
-    # ------------------------------------------------------------------
-
-    def run(
-        self,
-        target_size: Optional[int] = None,
-        max_iterations: Optional[int] = None,
-        callback: Optional[Callable[[int, Dict[str, Any]], None]] = None,
-        resume: bool = False,
-    ) -> FactorLibrary:
-        """Run the complete mining loop.
-
-        Parameters
-        ----------
-        target_size : int, optional
-            Target library size K.  Defaults to config value (110).
-        max_iterations : int, optional
-            Maximum iterations before stopping.  Defaults to config value.
-        callback : callable, optional
-            Called after each iteration with (iteration_number, stats_dict).
-        resume : bool
-            If True, attempt to load the latest checkpoint from the output
-            directory before starting the loop.  Default is False.
-
-        Returns
-        -------
-        FactorLibrary
-            The constructed factor library L.
-        """
-        target_size = target_size or getattr(
-            self.config, "target_library_size", 110
-        )
-        max_iterations = max_iterations or getattr(
-            self.config, "max_iterations", 200
-        )
-        batch_size = getattr(self.config, "batch_size", 40)
-        output_dir = getattr(self.config, "output_dir", "./output")
-
-        # Resume from existing checkpoint if requested
-        if resume:
-            checkpoint_dir = Path(output_dir) / "checkpoint"
-            if checkpoint_dir.exists():
-                self.load_session(str(checkpoint_dir))
-                logger.info(
-                    "Resuming from iteration %d with %d factors",
-                    self.iteration,
-                    self.library.size,
-                )
-
-        # Initialize session
-        if self._session is None:
-            session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
-            self._session = MiningSession(
-                session_id=session_id,
-                config=self._serialize_config(),
-                output_dir=output_dir,
-            )
-
-        self._refresh_run_manifest(
-            output_dir=output_dir,
-            artifact_paths={
-                "output_dir": output_dir,
-                "checkpoint_dir": str(Path(output_dir) / "checkpoint"),
-            },
-        )
-        self._persist_run_manifest(Path(output_dir) / "run_manifest.json")
-
-        # Initialize session logger
-        self._session_logger = MiningSessionLogger(output_dir)
-        self._session_logger.log_session_start({
-            "target_library_size": target_size,
-            "batch_size": batch_size,
-            "max_iterations": max_iterations,
-            "resumed_from_iteration": self.iteration if resume else 0,
-        })
-        self._session_logger.start_progress(max_iterations)
-
-        loop_start = time.time()
-
-        if not hasattr(self, "budget") or self.budget is None:
-            self.budget = BudgetTracker()
-        self.budget.wall_start = time.time()
-
-        try:
-            while (
-                self.library.size < target_size
-                and self.iteration < max_iterations
-            ):
-                # Check budget BEFORE starting a new iteration
-                if self.budget.is_exhausted():
-                    logger.info("Budget exhausted — stopping loop")
-                    break
-
-                self.iteration += 1
-                stats = self._run_iteration(batch_size)
-
-                # Record in session
-                self._session.record_iteration(stats)
-
-                # Callback
-                if callback:
-                    callback(self.iteration, stats)
-
-                logger.info(
-                    "Iteration %d: Library size=%d, Admitted=%d, "
-                    "Yield=%.1f%%, AvgCorr=%.3f",
-                    self.iteration,
-                    stats["library_size"],
-                    stats["admitted"],
-                    stats["yield_rate"] * 100,
-                    stats.get("avg_correlation", 0),
-                )
-
-                # Periodic checkpoint
-                if (
-                    self.checkpoint_interval > 0
-                    and self.iteration % self.checkpoint_interval == 0
-                ):
-                    self._checkpoint()
-
-            if self.budget.is_exhausted():
-                logger.info("Budget exhausted: %s", self.budget.to_dict())
-
-        except KeyboardInterrupt:
-            logger.warning("Mining interrupted by user at iteration %d", self.iteration)
-            if self._session:
-                self._session.status = "interrupted"
-            # Save checkpoint on interrupt so session can be resumed
-            self._checkpoint()
-        finally:
-            elapsed = time.time() - loop_start
-            if self._session_logger:
-                self._session_logger.log_session_end(self.library.size, elapsed)
-            self._refresh_run_manifest(
-                output_dir=output_dir,
-                artifact_paths={
-                    "output_dir": output_dir,
-                    "checkpoint_dir": str(Path(output_dir) / "checkpoint"),
-                    "library": str(Path(output_dir) / "factor_library.json"),
-                    "session": str(Path(output_dir) / "session.json"),
-                    "run_manifest": str(Path(output_dir) / "run_manifest.json"),
-                    "session_log": str(Path(output_dir) / "session_log.json"),
-                },
-            )
-            self._persist_run_manifest(Path(output_dir) / "run_manifest.json")
-            if self._session:
-                self._session.finalize()
-                self._session.save()
-
-        # Final export
-        lib_path = self.reporter.export_library(self.library)
-        logger.info("Factor library exported to %s", lib_path)
-
-        return self.library
-
-    # ------------------------------------------------------------------
-    # Single iteration
-    # ------------------------------------------------------------------
-
-    def _run_iteration(self, batch_size: int) -> Dict[str, Any]:
-        """Execute one iteration of the Ralph Loop.
-
-        Returns
-        -------
-        dict
-            Iteration statistics.
-        """
-        t0 = time.time()
-
-        # Step 1: Memory Retrieval -- R(M, L)
-        library_state = self.library.get_state_summary()
-        memory_signal = retrieve_memory(
-            self.memory,
-            library_state=library_state,
-        )
-
-        # Step 2: Guided Generation -- G(m, L)
-        t_gen = time.time()
-        candidates = self.generator.generate_batch(
-            memory_signal=memory_signal,
-            library_state=library_state,
-            batch_size=batch_size,
-        )
-        self.budget.record_llm_call()
-
-        if not candidates:
-            logger.warning(
-                "Iteration %d: generator produced 0 candidates", self.iteration
-            )
-            return self._empty_stats()
-
-        # Step 3: Multi-Stage Evaluation -- V(alpha) for each candidate
-        results = self.pipeline.evaluate_batch(candidates)
-
-        # Step 4: Library Update -- L <- L + admitted factors
-        admitted_results = self._update_library(results)
-
-        provenance_library_state = {
-            **library_state,
-            "diagnostics": self.library.get_diagnostics(),
-        }
-
-        self._attach_factor_provenance(
-            admitted_results,
-            library_state=provenance_library_state,
-            memory_signal=memory_signal,
-            phase2_summary={},
-            generator_family=self._generator_family(),
-        )
-
-        # Step 5: Memory Evolution -- E(M, F(M, tau))
-        trajectory = self._build_trajectory(results)
-        formed = form_memory(self.memory, trajectory, self.iteration)
-        self.memory = evolve_memory(self.memory, formed)
-
-        # Build stats
-        elapsed = time.time() - t0
-        self.budget.record_compute(elapsed)
-        stats = self._compute_stats(results, admitted_results, elapsed)
-
-        # Log to reporter and session logger
-        # stats already contains 'iteration', so pass it without keyword arg
-        self.reporter.log_batch(**stats)
-        if self._session_logger:
-            ic_values = [r.ic_mean for r in results if r.parse_ok]
-            record = IterationRecord(
-                iteration=self.iteration,
-                candidates_generated=len(candidates),
-                ic_passed=stats["ic_passed"],
-                correlation_passed=stats["corr_passed"],
-                admitted=stats["admitted"],
-                rejected=len(candidates) - stats["admitted"],
-                replaced=stats["replaced"],
-                library_size=self.library.size,
-                best_ic=max(ic_values) if ic_values else 0.0,
-                mean_ic=float(np.mean(ic_values)) if ic_values else 0.0,
-                elapsed_seconds=elapsed,
-            )
-            self._session_logger.log_iteration(record)
-
-            # Log individual factor records
-            for r in results:
-                factor_rec = FactorRecord(
-                    expression=r.formula,
-                    ic=r.ic_mean if r.parse_ok else None,
-                    icir=r.icir if r.parse_ok else None,
-                    max_correlation=r.max_correlation if r.parse_ok else None,
-                    admitted=r.admitted,
-                    rejection_reason=r.rejection_reason or None,
-                    replaced_factor=str(r.replaced) if r.replaced else None,
-                )
-                self._session_logger.log_factor(factor_rec)
-
-        return stats
-
-    # ------------------------------------------------------------------
-    # Library update
-    # ------------------------------------------------------------------
-
-    def _update_library(
-        self, results: List[EvaluationResult]
-    ) -> List[EvaluationResult]:
-        """Admit passing factors into the library and handle replacements.
-
-        Returns the list of admitted results.
-        """
-        admitted: List[EvaluationResult] = []
-
-        for result in results:
-            if not result.admitted:
-                continue
-
-            # Handle replacement
-            if result.replaced is not None:
-                old_id = result.replaced
-                new_factor = Factor(
-                    id=0,  # Will be reassigned by library
-                    name=result.factor_name,
-                    formula=result.formula,
-                    category=self._infer_category(result.formula),
-                    ic_mean=result.ic_mean,
-                    icir=result.icir,
-                    ic_win_rate=result.ic_win_rate,
-                    max_correlation=result.max_correlation,
-                    batch_number=self.iteration,
-                    signals=result.signals,
-                    research_metrics=result.score_vector or {},
-                )
-                try:
-                    self.library.replace_factor(old_id, new_factor)
-                    admitted.append(result)
-                    logger.info(
-                        "Replaced factor %d with '%s' (IC=%.4f)",
-                        old_id, result.factor_name, result.ic_mean,
-                    )
-                except KeyError:
-                    logger.warning(
-                        "Failed to replace factor %d (already removed?)", old_id
-                    )
-            else:
-                # Direct admission
-                factor = Factor(
-                    id=0,  # Will be reassigned
-                    name=result.factor_name,
-                    formula=result.formula,
-                    category=self._infer_category(result.formula),
-                    ic_mean=result.ic_mean,
-                    icir=result.icir,
-                    ic_win_rate=result.ic_win_rate,
-                    max_correlation=result.max_correlation,
-                    batch_number=self.iteration,
-                    signals=result.signals,
-                    research_metrics=result.score_vector or {},
-                )
-                self.library.admit_factor(factor)
-                admitted.append(result)
-
-        return admitted
-
-    # ------------------------------------------------------------------
-    # Trajectory builder for memory formation
-    # ------------------------------------------------------------------
-
-    def _build_trajectory(
-        self, results: List[EvaluationResult]
-    ) -> List[Dict[str, Any]]:
-        """Build mining trajectory tau for memory formation.
-
-        Converts evaluation results into the dict format expected by
-        ``form_memory``.
-        """
-        trajectory: List[Dict[str, Any]] = []
-        for r in results:
-            entry: Dict[str, Any] = {
-                "factor_id": r.factor_name,
-                "formula": r.formula,
-                "ic": r.ic_mean,
-                "icir": r.icir,
-                "max_correlation": r.max_correlation,
-                "correlated_with": r.correlated_with,
-                "admitted": r.admitted,
-                "rejection_reason": r.rejection_reason,
-            }
-            trajectory.append(entry)
-        return trajectory
-
-    # ------------------------------------------------------------------
-    # Statistics helpers
-    # ------------------------------------------------------------------
-
-    def _compute_stats(
-        self,
-        results: List[EvaluationResult],
-        admitted: List[EvaluationResult],
-        elapsed: float,
-    ) -> Dict[str, Any]:
-        """Compute per-iteration statistics."""
-        n_candidates = len(results)
-        diagnostics = self.library.get_diagnostics()
-
-        # Count dedup rejections (stage_passed==2 with dedup reason)
-        dedup_rejected = sum(
-            1 for r in results
-            if not r.admitted
-            and "deduplication" in r.rejection_reason.lower()
-        )
-
-        return {
-            "iteration": self.iteration,
-            "candidates": n_candidates,
-            "parse_ok": sum(1 for r in results if r.parse_ok),
-            "ic_passed": sum(1 for r in results if r.stage_passed >= 1),
-            "corr_passed": sum(1 for r in results if r.stage_passed >= 2),
-            "dedup_rejected": dedup_rejected,
-            "admitted": len(admitted),
-            "replaced": sum(1 for r in admitted if r.replaced is not None),
-            "yield_rate": len(admitted) / max(n_candidates, 1),
-            "library_size": self.library.size,
-            "avg_correlation": diagnostics.get("avg_correlation", 0),
-            "max_correlation": diagnostics.get("max_correlation", 0),
-            "elapsed_seconds": elapsed,
-            "budget": self.budget.to_dict(),
-        }
-
-    def _empty_stats(self) -> Dict[str, Any]:
-        """Return empty stats dict for iterations with no candidates."""
-        return {
-            "iteration": self.iteration,
-            "candidates": 0,
-            "parse_ok": 0,
-            "ic_passed": 0,
-            "corr_passed": 0,
-            "dedup_rejected": 0,
-            "admitted": 0,
-            "replaced": 0,
-            "yield_rate": 0.0,
-            "library_size": self.library.size,
-            "avg_correlation": 0.0,
-            "max_correlation": 0.0,
-            "elapsed_seconds": 0.0,
-            "budget": self.budget.to_dict(),
-        }
-
-    # ------------------------------------------------------------------
-    # Category inference
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _infer_category(formula: str) -> str:
-        """Infer factor category from formula structure.
-
-        Uses operator presence heuristics to classify factors into broad
-        categories aligned with the paper's taxonomy.
-        """
-        formula_upper = formula.upper()
-
-        # Extract operators and normalize to uppercase for matching
-        ops_raw = re.findall(r"([A-Za-z][a-zA-Z]+)\(", formula)
-        ops = {o.upper() for o in ops_raw}
-
-        if ops & {"SKEW", "KURT"}:
-            return "Higher-Moment"
-        if ops & {"CORR", "COV", "BETA"} and "$VOLUME" in formula_upper:
-            return "PV-Correlation"
-        if ops & {"IFELSE", "GREATER", "LESS", "OR", "AND"}:
-            return "Regime-Conditional"
-        if ops & {"TSLINREG", "TSLINREGSLOPE", "TSLINREGRESID", "RESID"}:
-            return "Regression"
-        if ops & {"EMA", "DEMA", "KAMA", "HMA", "WMA", "SMA"}:
-            return "Smoothing"
-        if "$VWAP" in formula_upper:
-            return "VWAP"
-        if "$AMT" in formula_upper:
-            return "Amount"
-        if ops & {"DELTA", "DELAY", "RETURN", "LOGRETURN"}:
-            return "Momentum"
-        if ops & {"STD", "VAR"}:
-            return "Volatility"
-        if ops & {"TSMAX", "TSMIN", "TSARGMAX", "TSARGMIN", "TSRANK"}:
-            return "Extrema"
-        if ops & {"CSRANK", "CSZSCORE", "CSDEMEAN"}:
-            return "Cross-Sectional"
-
-        return "Other"
-
-    # ------------------------------------------------------------------
-    # Session persistence (save / resume)
-    # ------------------------------------------------------------------
-
-    def save_session(self, path: Optional[str] = None) -> str:
-        """Save the full mining session state for resume.
-
-        Saves the factor library (via ``save_library``), experience memory,
-        budget tracker state, session metadata, and the loop state to a
-        ``checkpoint`` directory inside the output directory.
-
-        Parameters
-        ----------
-        path : str, optional
-            Directory for the checkpoint.  Defaults to
-            ``{output_dir}/checkpoint``.
-
-        Returns
-        -------
-        str
-            Path to the saved checkpoint directory.
-        """
-        if path is not None:
-            checkpoint_dir = Path(path)
-            # If caller passed a dir that doesn't end with "checkpoint*",
-            # nest inside it for backward compatibility
-            if not checkpoint_dir.name.startswith("checkpoint"):
-                checkpoint_dir = checkpoint_dir / f"checkpoint_iter{self.iteration}"
-        else:
-            output_dir = getattr(self.config, "output_dir", "./output")
-            checkpoint_dir = Path(output_dir) / "checkpoint"
-        checkpoint_dir.mkdir(parents=True, exist_ok=True)
-
-        # Save library using library_io (JSON + optional signal cache)
-        lib_base = str(checkpoint_dir / "library")
-        save_library(self.library, lib_base, save_signals=True)
-
-        # Save memory using ExperienceMemoryManager if available,
-        # otherwise fall back to raw ExperienceMemory serialization
-        mem_path = str(checkpoint_dir / "memory.json")
-        if self.memory_manager is not None:
-            self.memory_manager.save(mem_path)
-        else:
-            with open(mem_path, "w") as f:
-                json.dump(self.memory.to_dict(), f, indent=2, default=str)
-
-        # Save session metadata
-        if self._session:
-            self._session.library_path = lib_base
-            self._session.memory_path = mem_path
-            self._refresh_run_manifest(
-                output_dir=str(checkpoint_dir.parent),
-                artifact_paths={
-                    "library": f"{lib_base}.json",
-                    "memory": mem_path,
-                    "session": str(checkpoint_dir / "session.json"),
-                    "run_manifest": str(checkpoint_dir / "run_manifest.json"),
-                    "loop_state": str(checkpoint_dir / "loop_state.json"),
-                },
-            )
-            self._persist_run_manifest(checkpoint_dir / "run_manifest.json")
-            self._session.save(checkpoint_dir / "session.json")
-
-        # Save loop state (iteration counter + budget tracker)
-        loop_state: Dict[str, Any] = {
-            "iteration": self.iteration,
-            "library_size": self.library.size,
-            "memory_version": self.memory.version,
-            "budget": {
-                "llm_calls": self.budget.llm_calls,
-                "llm_prompt_tokens": self.budget.llm_prompt_tokens,
-                "llm_completion_tokens": self.budget.llm_completion_tokens,
-                "compute_seconds": self.budget.compute_seconds,
-                "max_llm_calls": self.budget.max_llm_calls,
-                "max_wall_seconds": self.budget.max_wall_seconds,
-            },
-        }
-        with open(checkpoint_dir / "loop_state.json", "w") as f:
-            json.dump(loop_state, f, indent=2)
-
-        logger.info("Session saved to %s", checkpoint_dir)
-        return str(checkpoint_dir)
-
-    def load_session(self, path: str) -> None:
-        """Resume a mining session from a saved checkpoint.
-
-        Restores the factor library (via ``load_library``), experience
-        memory, budget tracker state, iteration counter, and session
-        metadata from the checkpoint directory.
-
-        Parameters
-        ----------
-        path : str
-            Path to the checkpoint directory.
-        """
-        checkpoint_dir = Path(path)
-
-        # Load loop state (iteration counter + budget)
-        loop_state_path = checkpoint_dir / "loop_state.json"
-        if loop_state_path.exists():
-            with open(loop_state_path) as f:
-                loop_state = json.load(f)
-            self.iteration = loop_state.get("iteration", 0)
-
-            # Restore budget tracker state
-            budget_data = loop_state.get("budget", {})
-            if budget_data:
-                self.budget.llm_calls = budget_data.get(
-                    "llm_calls", self.budget.llm_calls
-                )
-                self.budget.llm_prompt_tokens = budget_data.get(
-                    "llm_prompt_tokens", self.budget.llm_prompt_tokens
-                )
-                self.budget.llm_completion_tokens = budget_data.get(
-                    "llm_completion_tokens", self.budget.llm_completion_tokens
-                )
-                self.budget.compute_seconds = budget_data.get(
-                    "compute_seconds", self.budget.compute_seconds
-                )
-                self.budget.max_llm_calls = budget_data.get(
-                    "max_llm_calls", self.budget.max_llm_calls
-                )
-                self.budget.max_wall_seconds = budget_data.get(
-                    "max_wall_seconds", self.budget.max_wall_seconds
-                )
-
-            logger.info(
-                "Resuming from iteration %d (library=%d)",
-                self.iteration,
-                loop_state.get("library_size", 0),
-            )
-
-        # Load memory
-        mem_path = checkpoint_dir / "memory.json"
-        if mem_path.exists():
-            if self.memory_manager is not None:
-                self.memory_manager.load(mem_path)
-                self.memory = self.memory_manager.memory
-            else:
-                with open(mem_path) as f:
-                    mem_data = json.load(f)
-                self.memory = ExperienceMemory.from_dict(mem_data)
-            logger.info(
-                "Loaded memory (version=%d, %d success, %d forbidden, %d insights)",
-                self.memory.version,
-                len(self.memory.success_patterns),
-                len(self.memory.forbidden_directions),
-                len(self.memory.insights),
-            )
-
-        # Load library using library_io (supports signals + correlation matrix)
-        lib_json_path = checkpoint_dir / "library.json"
-        if lib_json_path.exists():
-            lib_base = str(checkpoint_dir / "library")
-            loaded_library = load_library(lib_base)
-            # Merge into current library (preserving thresholds from config)
-            self.library.factors = loaded_library.factors
-            self.library._next_id = loaded_library._next_id
-            self.library._id_to_index = loaded_library._id_to_index
-            self.library.correlation_matrix = loaded_library.correlation_matrix
-            # Update the pipeline reference so it uses the restored library
-            self.pipeline.library = self.library
-            logger.info("Loaded library with %d factors", self.library.size)
-
-        # Load session metadata
-        session_path = checkpoint_dir / "session.json"
-        if session_path.exists():
-            self._session = MiningSession.load(session_path)
-            self._session.status = "running"
-            self._run_manifest = dict(self._session.run_manifest or {})
-
-        if not self._run_manifest:
-            run_manifest_path = checkpoint_dir / "run_manifest.json"
-            if run_manifest_path.exists():
-                with open(run_manifest_path) as f:
-                    self._run_manifest = json.load(f)
-
-    @classmethod
-    def resume_from(
-        cls,
-        checkpoint_path: str,
-        config: Any,
-        data_tensor: np.ndarray,
-        returns: np.ndarray,
-        llm_provider: Optional[LLMProvider] = None,
-        **kwargs: Any,
-    ) -> "RalphLoop":
-        """Create a RalphLoop and restore state from a checkpoint.
-
-        Parameters
-        ----------
-        checkpoint_path : str
-            Path to the checkpoint directory.
-        config, data_tensor, returns, llm_provider
-            Same as ``__init__``.
-
-        Returns
-        -------
-        RalphLoop
-            A loop ready to call ``run()`` that continues from the checkpoint.
-        """
-        loop = cls(
-            config=config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=llm_provider,
-            **kwargs,
-        )
-        loop.load_session(checkpoint_path)
-        return loop
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    def _checkpoint(self) -> None:
-        """Save a periodic checkpoint."""
-        try:
-            self.save_session()
-        except Exception as exc:
-            logger.warning("Checkpoint failed: %s", exc)
-
-    def _serialize_config(self) -> Dict[str, Any]:
-        """Serialize config to a JSON-compatible dict."""
-        try:
-            if hasattr(self.config, "to_dict"):
-                return self.config.to_dict()
-            return asdict(self.config)
-        except (TypeError, AttributeError):
-            # Fallback: extract known attributes
-            attrs = [
-                "target_library_size", "batch_size", "max_iterations",
-                "ic_threshold", "icir_threshold", "correlation_threshold",
-                "replacement_ic_min", "replacement_ic_ratio", "output_dir",
-            ]
-            return {
-                attr: getattr(self.config, attr, None)
-                for attr in attrs
-                if getattr(self.config, attr, None) is not None
-            }
-
-    def _loop_type(self) -> str:
-        """Label the loop for provenance and manifests."""
-        return "ralph"
-
-    def _phase2_features(self) -> List[str]:
-        """Phase 2 feature flags used by the current loop."""
-        return []
-
-    def _refresh_run_manifest(
-        self,
-        *,
-        output_dir: str,
-        artifact_paths: Optional[Dict[str, str]] = None,
-    ) -> Dict[str, Any]:
-        """Build and cache the current run manifest."""
-        if self._session is None:
-            return {}
-
-        config_summary = self._serialize_config()
-        dataset_summary = {
-            "data_tensor_shape": list(self.data_tensor.shape),
-            "returns_shape": list(self.returns.shape),
-            "memory_version": self.memory.version,
-            "library_size": self.library.size,
-            "library_diagnostics": self.library.get_diagnostics(),
-        }
-        if isinstance(self.config, dict):
-            benchmark_mode = str(self.config.get("benchmark_mode", "paper"))
-            target_stack = list(self.config.get("target_stack", []))
-        else:
-            benchmark_mode = str(getattr(self.config, "benchmark_mode", "paper"))
-            target_stack = list(
-                getattr(self.config, "target_stack", [])
-                or []
-            )
-
-        pipeline_targets = getattr(self.pipeline, "target_panels", None) or {}
-        if pipeline_targets:
-            target_stack = [
-                name
-                for name in pipeline_targets.keys()
-                if name and name != "paper"
-            ] or target_stack
-
-        manifest = build_run_manifest(
-            run_id=self._session.session_id,
-            session_id=self._session.session_id,
-            loop_type=self._loop_type(),
-            benchmark_mode=benchmark_mode,
-            created_at=self._session.start_time,
-            updated_at=datetime.now().isoformat(),
-            iteration=self.iteration,
-            library_size=self.library.size,
-            output_dir=output_dir,
-            config_summary=config_summary,
-            dataset_summary=dataset_summary,
-            phase2_features=self._phase2_features(),
-            target_stack=target_stack,
-            artifact_paths=artifact_paths or {},
-            notes=[],
-        )
-        self._run_manifest = manifest.to_dict()
-        return self._run_manifest
-
-    def _persist_run_manifest(self, path: Path) -> None:
-        """Write the current run manifest to disk and mirror it into the session."""
-        if self._session is None:
-            return
-
-        path.parent.mkdir(parents=True, exist_ok=True)
-        if not self._run_manifest:
-            self._refresh_run_manifest(
-                output_dir=str(path.parent.parent),
-                artifact_paths={"run_manifest": str(path)},
-            )
-        self._run_manifest.setdefault("artifact_paths", {})["run_manifest"] = str(path)
-        with open(path, "w") as f:
-            json.dump(self._run_manifest, f, indent=2, default=str)
-
-        self._session.run_manifest_path = str(path)
-        self._session.run_manifest = self._run_manifest
-
-    def _attach_factor_provenance(
-        self,
-        admitted_results: List[EvaluationResult],
-        *,
-        library_state: Dict[str, Any],
-        memory_signal: Dict[str, Any],
-        phase2_summary: Dict[str, Any],
-        generator_family: Optional[str] = None,
-    ) -> None:
-        """Stamp provenance onto library factors that survived admission."""
-        if not admitted_results or self._session is None:
-            return
-
-        run_manifest = self._run_manifest or self._refresh_run_manifest(
-            output_dir=getattr(self.config, "output_dir", "./output"),
-            artifact_paths={},
-        )
-
-        for rank, result in enumerate(admitted_results, start=1):
-            if not result.admitted:
-                continue
-
-            factor = None
-            for candidate in reversed(self.library.list_factors()):
-                if (
-                    candidate.name == result.factor_name
-                    and candidate.formula == result.formula
-                ):
-                    factor = candidate
-                    break
-            if factor is None:
-                continue
-
-            factor.provenance = build_factor_provenance(
-                run_manifest=run_manifest,
-                factor_name=factor.name,
-                formula=factor.formula,
-                factor_category=factor.category,
-                factor_id=factor.id,
-                iteration=self.iteration,
-                batch_number=factor.batch_number,
-                candidate_rank=rank,
-                generator_family=generator_family or self._generator_family(),
-                memory_signal=memory_signal,
-                library_state=library_state,
-                evaluation={
-                    "ic_mean": factor.ic_mean,
-                    "icir": factor.icir,
-                    "ic_win_rate": factor.ic_win_rate,
-                    "max_correlation": factor.max_correlation,
-                    "research_metrics": factor.research_metrics,
-                },
-                admission={
-                    "admitted": True,
-                    "stage_passed": result.stage_passed,
-                    "replaced": result.replaced,
-                    "correlated_with": result.correlated_with,
-                    "rejection_reason": result.rejection_reason,
-                },
-                phase2=phase2_summary,
-                target_stack=run_manifest.get("target_stack", []),
-                research_metrics=factor.research_metrics,
-            ).to_dict()
-
-    def _generator_family(self) -> str:
-        """Return the active candidate generator label for provenance."""
-        return self.generator.__class__.__name__
diff --git a/src/factorminer/factorminer/core/session.py b/src/factorminer/factorminer/core/session.py
deleted file mode 100644
index f18fede..0000000
--- a/src/factorminer/factorminer/core/session.py
+++ /dev/null
@@ -1,187 +0,0 @@
-"""Mining session management with persistence and resume support.
-
-A ``MiningSession`` wraps the state that must survive across process
-restarts: session metadata, per-iteration statistics, timing, and paths
-to serialized artifacts (library, memory).
-"""
-
-from __future__ import annotations
-
-import json
-import time
-from dataclasses import asdict, dataclass, field
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
-
-
-@dataclass
-class MiningSession:
-    """Manages a complete mining session with persistence.
-
-    Parameters
-    ----------
-    session_id : str
-        Unique identifier for this session (e.g. timestamp or UUID).
-    config : dict
-        Serialized mining configuration (kept as dict for JSON compat).
-    output_dir : str
-        Directory for all session artifacts.
-    """
-
-    session_id: str
-    config: Dict[str, Any] = field(default_factory=dict)
-    output_dir: str = "./output"
-    start_time: str = ""
-    end_time: str = ""
-    iterations: List[Dict[str, Any]] = field(default_factory=list)
-    library_path: str = ""
-    memory_path: str = ""
-    run_manifest_path: str = ""
-    run_manifest: Dict[str, Any] = field(default_factory=dict)
-    status: str = "running"  # running | completed | interrupted
-
-    def __post_init__(self) -> None:
-        if not self.start_time:
-            self.start_time = datetime.now().isoformat()
-
-    # ------------------------------------------------------------------
-    # Iteration tracking
-    # ------------------------------------------------------------------
-
-    def record_iteration(self, stats: Dict[str, Any]) -> None:
-        """Append iteration statistics to the session log."""
-        stats = dict(stats)
-        stats.setdefault("timestamp", datetime.now().isoformat())
-        self.iterations.append(stats)
-
-    @property
-    def total_iterations(self) -> int:
-        return len(self.iterations)
-
-    @property
-    def last_library_size(self) -> int:
-        if not self.iterations:
-            return 0
-        return self.iterations[-1].get("library_size", 0)
-
-    # ------------------------------------------------------------------
-    # Serialization
-    # ------------------------------------------------------------------
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Serialize session state to a JSON-compatible dictionary."""
-        return {
-            "session_id": self.session_id,
-            "config": self.config,
-            "output_dir": self.output_dir,
-            "start_time": self.start_time,
-            "end_time": self.end_time,
-            "status": self.status,
-            "total_iterations": self.total_iterations,
-            "last_library_size": self.last_library_size,
-            "library_path": self.library_path,
-            "memory_path": self.memory_path,
-            "run_manifest_path": self.run_manifest_path,
-            "run_manifest": self.run_manifest,
-            "iterations": self.iterations,
-        }
-
-    def save(self, path: Optional[Union[str, Path]] = None) -> str:
-        """Save session state to a JSON file.
-
-        Parameters
-        ----------
-        path : str or Path, optional
-            Explicit save path.  Defaults to ``{output_dir}/session.json``.
-
-        Returns
-        -------
-        str
-            The path the session was saved to.
-        """
-        if path is None:
-            save_dir = Path(self.output_dir)
-            save_dir.mkdir(parents=True, exist_ok=True)
-            path = save_dir / "session.json"
-        else:
-            path = Path(path)
-            path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(path, "w") as f:
-            json.dump(self.to_dict(), f, indent=2, default=str)
-        return str(path)
-
-    @classmethod
-    def load(cls, path: Union[str, Path]) -> "MiningSession":
-        """Load session from a JSON file.
-
-        Parameters
-        ----------
-        path : str or Path
-            Path to a session JSON file.
-
-        Returns
-        -------
-        MiningSession
-        """
-        path = Path(path)
-        with open(path) as f:
-            data = json.load(f)
-
-        return cls(
-            session_id=data["session_id"],
-            config=data.get("config", {}),
-            output_dir=data.get("output_dir", "./output"),
-            start_time=data.get("start_time", ""),
-            end_time=data.get("end_time", ""),
-            iterations=data.get("iterations", []),
-            library_path=data.get("library_path", ""),
-            memory_path=data.get("memory_path", ""),
-            run_manifest_path=data.get("run_manifest_path", ""),
-            run_manifest=data.get("run_manifest", {}),
-            status=data.get("status", "interrupted"),
-        )
-
-    # ------------------------------------------------------------------
-    # Summary
-    # ------------------------------------------------------------------
-
-    def get_summary(self) -> Dict[str, Any]:
-        """Session summary statistics."""
-        total_candidates = sum(
-            it.get("candidates", 0) for it in self.iterations
-        )
-        total_admitted = sum(
-            it.get("admitted", 0) for it in self.iterations
-        )
-        total_replaced = sum(
-            it.get("replaced", 0) for it in self.iterations
-        )
-
-        # Compute elapsed time
-        elapsed = 0.0
-        if self.start_time:
-            start = datetime.fromisoformat(self.start_time)
-            end_str = self.end_time or datetime.now().isoformat()
-            end = datetime.fromisoformat(end_str)
-            elapsed = (end - start).total_seconds()
-
-        return {
-            "session_id": self.session_id,
-            "status": self.status,
-            "total_iterations": self.total_iterations,
-            "total_candidates": total_candidates,
-            "total_admitted": total_admitted,
-            "total_replaced": total_replaced,
-            "overall_yield_rate": (
-                total_admitted / total_candidates if total_candidates > 0 else 0.0
-            ),
-            "final_library_size": self.last_library_size,
-            "elapsed_seconds": elapsed,
-        }
-
-    def finalize(self) -> None:
-        """Mark the session as completed and record end time."""
-        self.end_time = datetime.now().isoformat()
-        self.status = "completed"
diff --git a/src/factorminer/factorminer/core/types.py b/src/factorminer/factorminer/core/types.py
deleted file mode 100644
index 4291252..0000000
--- a/src/factorminer/factorminer/core/types.py
+++ /dev/null
@@ -1,269 +0,0 @@
-"""Type system for the FactorMiner operator library.
-
-Defines operator categories, signatures, specifications, and the canonical
-set of raw market-data feature names used as leaf nodes in expression trees.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from enum import Enum, auto
-from typing import Any, Dict, List, Optional, Tuple
-
-
-# ---------------------------------------------------------------------------
-# Enumerations
-# ---------------------------------------------------------------------------
-
-class OperatorType(Enum):
-    """High-level category for every operator."""
-    ARITHMETIC = auto()
-    STATISTICAL = auto()
-    TIMESERIES = auto()
-    CROSS_SECTIONAL = auto()
-    SMOOTHING = auto()
-    REGRESSION = auto()
-    LOGICAL = auto()
-    AUTO_INVENTED = auto()
-
-
-class SignatureType(Enum):
-    """Describes how an operator maps inputs to outputs.
-
-    TIME_SERIES_TO_TIME_SERIES  – rolling / lookback along the time axis
-    CROSS_SECTION_TO_CROSS_SECTION – operates across stocks at each point
-    ELEMENT_WISE – pointwise on array(s), no window or cross-section logic
-    REDUCE_TIME – collapses the time axis (e.g. cumulative sum)
-    """
-    TIME_SERIES_TO_TIME_SERIES = auto()
-    CROSS_SECTION_TO_CROSS_SECTION = auto()
-    ELEMENT_WISE = auto()
-    REDUCE_TIME = auto()
-
-
-# ---------------------------------------------------------------------------
-# Operator specification
-# ---------------------------------------------------------------------------
-
-@dataclass(frozen=True)
-class OperatorSpec:
-    """Immutable descriptor for a single operator in the library.
-
-    Parameters
-    ----------
-    name : str
-        Canonical name used in DSL strings (e.g. ``"Add"``).
-    arity : int
-        Number of *expression* children (1 = unary, 2 = binary, 3 = ternary).
-    category : OperatorType
-        Broad category of the operator.
-    signature : SignatureType
-        How the operator maps inputs to outputs.
-    param_names : tuple[str, ...]
-        Names of extra numeric parameters (e.g. ``("window",)``).
-    param_defaults : dict[str, float]
-        Default value for each parameter when omitted.
-    param_ranges : dict[str, tuple[float, float]]
-        Valid (inclusive) range for each parameter.
-    description : str
-        Short human-readable description.
-    """
-    name: str
-    arity: int
-    category: OperatorType
-    signature: SignatureType
-    param_names: Tuple[str, ...] = ()
-    param_defaults: Dict[str, float] = field(default_factory=dict)
-    param_ranges: Dict[str, Tuple[float, float]] = field(default_factory=dict)
-    description: str = ""
-
-
-# ---------------------------------------------------------------------------
-# Canonical feature names (leaf nodes)
-# ---------------------------------------------------------------------------
-
-FEATURES: List[str] = [
-    "$open",
-    "$high",
-    "$low",
-    "$close",
-    "$volume",
-    "$amt",
-    "$vwap",
-    "$returns",
-]
-
-FEATURE_SET: frozenset = frozenset(FEATURES)
-
-
-# ---------------------------------------------------------------------------
-# Complete operator library  (60+ operators)
-# ---------------------------------------------------------------------------
-
-def _window_params(
-    default: int = 10,
-    lo: int = 2,
-    hi: int = 250,
-) -> Tuple[Tuple[str, ...], Dict[str, float], Dict[str, Tuple[float, float]]]:
-    """Helper returning standard (window,) parameter triple."""
-    return (
-        ("window",),
-        {"window": float(default)},
-        {"window": (float(lo), float(hi))},
-    )
-
-
-def _build_operator_registry() -> Dict[str, OperatorSpec]:
-    """Construct the full operator registry.
-
-    Returns a mapping from canonical operator name to its ``OperatorSpec``.
-    """
-    registry: Dict[str, OperatorSpec] = {}
-
-    def _reg(
-        name: str,
-        arity: int,
-        cat: OperatorType,
-        sig: SignatureType,
-        param_names: Tuple[str, ...] = (),
-        param_defaults: Optional[Dict[str, float]] = None,
-        param_ranges: Optional[Dict[str, Tuple[float, float]]] = None,
-        desc: str = "",
-    ) -> None:
-        registry[name] = OperatorSpec(
-            name=name,
-            arity=arity,
-            category=cat,
-            signature=sig,
-            param_names=param_names,
-            param_defaults=param_defaults or {},
-            param_ranges=param_ranges or {},
-            description=desc,
-        )
-
-    EW = SignatureType.ELEMENT_WISE
-    TS = SignatureType.TIME_SERIES_TO_TIME_SERIES
-    CS = SignatureType.CROSS_SECTION_TO_CROSS_SECTION
-    RT = SignatureType.REDUCE_TIME
-
-    A = OperatorType.ARITHMETIC
-    S = OperatorType.STATISTICAL
-    T = OperatorType.TIMESERIES
-    X = OperatorType.CROSS_SECTIONAL
-    SM = OperatorType.SMOOTHING
-    R = OperatorType.REGRESSION
-    L = OperatorType.LOGICAL
-
-    wp = _window_params
-
-    # ---- Arithmetic (element-wise) ----------------------------------------
-    _reg("Add", 2, A, EW, desc="x + y")
-    _reg("Sub", 2, A, EW, desc="x - y")
-    _reg("Mul", 2, A, EW, desc="x * y")
-    _reg("Div", 2, A, EW, desc="x / y (safe division)")
-    _reg("Neg", 1, A, EW, desc="-x")
-    _reg("Abs", 1, A, EW, desc="|x|")
-    _reg("Sign", 1, A, EW, desc="sign(x)")
-    _reg("Log", 1, A, EW, desc="log(1 + |x|) * sign(x)")
-    _reg("Sqrt", 1, A, EW, desc="sqrt(|x|) * sign(x)")
-    _reg("Square", 1, A, EW, desc="x^2")
-    _reg("Pow", 2, A, EW, desc="x^y")
-    _reg("Max", 2, A, EW, desc="element-wise max(x, y)")
-    _reg("Min", 2, A, EW, desc="element-wise min(x, y)")
-    _reg("Clip", 1, A, EW,
-         param_names=("lower", "upper"),
-         param_defaults={"lower": -3.0, "upper": 3.0},
-         param_ranges={"lower": (-10.0, 10.0), "upper": (-10.0, 10.0)},
-         desc="clip(x, lower, upper)")
-    _reg("Inv", 1, A, EW, desc="1 / x (safe)")
-
-    # ---- Statistical (rolling window) -------------------------------------
-    _reg("Mean", 1, S, TS, *wp(10), desc="rolling mean")
-    _reg("Std", 1, S, TS, *wp(10), desc="rolling std dev")
-    _reg("Var", 1, S, TS, *wp(10), desc="rolling variance")
-    _reg("Skew", 1, S, TS, *wp(20), desc="rolling skewness")
-    _reg("Kurt", 1, S, TS, *wp(20), desc="rolling kurtosis")
-    _reg("Median", 1, S, TS, *wp(10), desc="rolling median")
-    _reg("Sum", 1, S, TS, *wp(10), desc="rolling sum")
-    _reg("Prod", 1, S, TS, *wp(10), desc="rolling product")
-    _reg("TsMax", 1, S, TS, *wp(10), desc="rolling max")
-    _reg("TsMin", 1, S, TS, *wp(10), desc="rolling min")
-    _reg("TsArgMax", 1, S, TS, *wp(10), desc="rolling argmax")
-    _reg("TsArgMin", 1, S, TS, *wp(10), desc="rolling argmin")
-    _reg("TsRank", 1, S, TS, *wp(10), desc="rolling rank of latest value")
-    _reg("Quantile", 1, S, TS,
-         param_names=("window", "q"),
-         param_defaults={"window": 10.0, "q": 0.5},
-         param_ranges={"window": (2.0, 250.0), "q": (0.0, 1.0)},
-         desc="rolling quantile")
-    _reg("CountNaN", 1, S, TS, *wp(10), desc="rolling count of NaN")
-    _reg("CountNotNaN", 1, S, TS, *wp(10), desc="rolling count of non-NaN")
-
-    # ---- Time-series operators --------------------------------------------
-    _reg("Delta", 1, T, TS, *wp(1, 1, 60), desc="x[t] - x[t-d]")
-    _reg("Delay", 1, T, TS, *wp(1, 1, 60), desc="x[t-d]")
-    _reg("Return", 1, T, TS, *wp(1, 1, 60), desc="x[t]/x[t-d] - 1")
-    _reg("LogReturn", 1, T, TS, *wp(1, 1, 60), desc="log(x[t]/x[t-d])")
-    _reg("Corr", 2, T, TS, *wp(10), desc="rolling correlation")
-    _reg("Cov", 2, T, TS, *wp(10), desc="rolling covariance")
-    _reg("Beta", 2, T, TS, *wp(10), desc="rolling regression beta")
-    _reg("Resid", 2, T, TS, *wp(10), desc="rolling regression residual")
-    _reg("WMA", 1, T, TS, *wp(10), desc="weighted moving average (linear)")
-    _reg("Decay", 1, T, TS, *wp(10), desc="exponentially decaying sum")
-    _reg("CumSum", 1, T, RT, desc="cumulative sum along time")
-    _reg("CumProd", 1, T, RT, desc="cumulative product along time")
-    _reg("CumMax", 1, T, RT, desc="cumulative max along time")
-    _reg("CumMin", 1, T, RT, desc="cumulative min along time")
-
-    # ---- Smoothing --------------------------------------------------------
-    _reg("EMA", 1, SM, TS, *wp(10), desc="exponential moving average")
-    _reg("DEMA", 1, SM, TS, *wp(10), desc="double EMA")
-    _reg("SMA", 1, SM, TS, *wp(10), desc="simple moving average")
-    _reg("KAMA", 1, SM, TS, *wp(10), desc="Kaufman adaptive moving average")
-    _reg("HMA", 1, SM, TS, *wp(10), desc="Hull moving average")
-
-    # ---- Cross-sectional --------------------------------------------------
-    _reg("CsRank", 1, X, CS, desc="cross-sectional rank (percentile)")
-    _reg("CsZScore", 1, X, CS, desc="cross-sectional z-score")
-    _reg("CsDemean", 1, X, CS, desc="x - cross-sectional mean")
-    _reg("CsScale", 1, X, CS, desc="scale to unit L1 norm cross-sectionally")
-    _reg("CsNeutralize", 1, X, CS, desc="industry-neutralize")
-    _reg("CsQuantile", 1, X, CS,
-         param_names=("n_bins",),
-         param_defaults={"n_bins": 5.0},
-         param_ranges={"n_bins": (2.0, 20.0)},
-         desc="cross-sectional quantile bin")
-
-    # ---- Regression -------------------------------------------------------
-    _reg("TsLinReg", 1, R, TS, *wp(20), desc="rolling linear-regression fitted value")
-    _reg("TsLinRegSlope", 1, R, TS, *wp(20), desc="rolling linear-regression slope")
-    _reg("TsLinRegIntercept", 1, R, TS, *wp(20), desc="rolling linear-regression intercept")
-    _reg("TsLinRegResid", 1, R, TS, *wp(20), desc="rolling linear-regression residual")
-
-    # ---- Logical / conditional --------------------------------------------
-    _reg("IfElse", 3, L, EW, desc="if cond > 0 then x else y")
-    _reg("Greater", 2, L, EW, desc="1.0 where x > y else 0.0")
-    _reg("GreaterEqual", 2, L, EW, desc="1.0 where x >= y else 0.0")
-    _reg("Less", 2, L, EW, desc="1.0 where x < y else 0.0")
-    _reg("LessEqual", 2, L, EW, desc="1.0 where x <= y else 0.0")
-    _reg("Equal", 2, L, EW, desc="1.0 where x == y else 0.0")
-    _reg("Ne", 2, L, EW, desc="1.0 where x != y else 0.0")
-    _reg("And", 2, L, EW, desc="logical and")
-    _reg("Or", 2, L, EW, desc="logical or")
-    _reg("Not", 1, L, EW, desc="logical not")
-
-    return registry
-
-
-OPERATOR_REGISTRY: Dict[str, OperatorSpec] = _build_operator_registry()
-"""Global mapping from operator name to its ``OperatorSpec``."""
-
-
-def get_operator(name: str) -> OperatorSpec:
-    """Look up an operator by name, raising ``KeyError`` if unknown."""
-    if name not in OPERATOR_REGISTRY:
-        raise KeyError(
-            f"Unknown operator '{name}'. "
-            f"Available: {sorted(OPERATOR_REGISTRY.keys())}"
-        )
-    return OPERATOR_REGISTRY[name]
diff --git a/src/factorminer/factorminer/data/__init__.py b/src/factorminer/factorminer/data/__init__.py
deleted file mode 100644
index f06f07b..0000000
--- a/src/factorminer/factorminer/data/__init__.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""FactorMiner data pipeline: loading, preprocessing, and tensor construction."""
-
-from src.factorminer.factorminer.data.loader import (
-    OHLCV_COLUMNS,
-    REQUIRED_COLUMNS,
-    load_market_data,
-    load_multiple,
-    to_numpy,
-)
-from src.factorminer.factorminer.data.mock_data import (
-    MockConfig,
-    generate_mock_data,
-    generate_with_halts,
-)
-from src.factorminer.factorminer.data.preprocessor import (
-    PreprocessConfig,
-    compute_derived_features,
-    compute_returns,
-    compute_vwap,
-    cross_sectional_standardise,
-    fill_missing,
-    flag_halts,
-    mask_halts,
-    preprocess,
-    quality_check,
-    winsorise,
-)
-from src.factorminer.factorminer.data.tensor_builder import (
-    DEFAULT_FEATURES,
-    TargetSpec,
-    TensorConfig,
-    TensorDataset,
-    build_pipeline,
-    build_tensor,
-    compute_target,
-    compute_targets,
-    sample_assets,
-    temporal_split,
-)
-
-__all__ = [
-    # loader
-    "OHLCV_COLUMNS",
-    "REQUIRED_COLUMNS",
-    "load_market_data",
-    "load_multiple",
-    "to_numpy",
-    # mock_data
-    "MockConfig",
-    "generate_mock_data",
-    "generate_with_halts",
-    # preprocessor
-    "PreprocessConfig",
-    "compute_derived_features",
-    "compute_returns",
-    "compute_vwap",
-    "cross_sectional_standardise",
-    "fill_missing",
-    "flag_halts",
-    "mask_halts",
-    "preprocess",
-    "quality_check",
-    "winsorise",
-    # tensor_builder
-    "DEFAULT_FEATURES",
-    "TargetSpec",
-    "TensorConfig",
-    "TensorDataset",
-    "build_pipeline",
-    "build_tensor",
-    "compute_target",
-    "compute_targets",
-    "sample_assets",
-    "temporal_split",
-]
diff --git a/src/factorminer/factorminer/data/loader.py b/src/factorminer/factorminer/data/loader.py
deleted file mode 100644
index 6064ce7..0000000
--- a/src/factorminer/factorminer/data/loader.py
+++ /dev/null
@@ -1,244 +0,0 @@
-"""Market data loader supporting multiple formats and asset universes.
-
-Loads OHLCV + amount data from CSV, Parquet, and HDF5 files. Supports
-A-share universes (CSI500, CSI1000, HS300) and Binance crypto data.
-Expected schema: datetime, asset_id, open, high, low, close, volume, amount.
-
-The loader also accepts a small set of common aliases used by broker/data-vendor
-exports, such as ``code``/``ticker`` for ``asset_id`` and ``amt`` for
-``amount``.
-"""
-
-from __future__ import annotations
-
-import logging
-from pathlib import Path
-from typing import Literal, Optional, Sequence, Union
-
-import numpy as np
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-# Canonical column ordering
-REQUIRED_COLUMNS = [
-    "datetime",
-    "asset_id",
-    "open",
-    "high",
-    "low",
-    "close",
-    "volume",
-    "amount",
-]
-
-OHLCV_COLUMNS = ["open", "high", "low", "close", "volume", "amount"]
-
-COLUMN_ALIASES = {
-    "datetime": ["timestamp", "date", "time", "trade_date"],
-    "asset_id": ["ticker", "symbol", "code", "stock_code", "ts_code", "instrument"],
-    "open": ["open_price"],
-    "high": ["high_price"],
-    "low": ["low_price"],
-    "close": ["close_price", "price"],
-    "volume": ["vol"],
-    "amount": ["amt", "turnover", "value", "traded_amount"],
-}
-
-# Well-known universe identifiers
-UNIVERSE_ALIASES = {
-    "csi500": "CSI500",
-    "csi1000": "CSI1000",
-    "hs300": "HS300",
-    "binance": "Binance",
-}
-
-FileFormat = Literal["csv", "parquet", "hdf5"]
-
-
-def _infer_format(path: Path) -> FileFormat:
-    suffix = path.suffix.lower()
-    mapping = {
-        ".csv": "csv",
-        ".parquet": "parquet",
-        ".pq": "parquet",
-        ".h5": "hdf5",
-        ".hdf5": "hdf5",
-    }
-    fmt = mapping.get(suffix)
-    if fmt is None:
-        raise ValueError(f"Cannot infer format from extension '{suffix}'. "
-                         f"Supported: {list(mapping.keys())}")
-    return fmt  # type: ignore[return-value]
-
-
-def _read_file(
-    path: Path,
-    fmt: FileFormat,
-    hdf_key: str = "data",
-) -> pd.DataFrame:
-    """Read a single data file into a DataFrame."""
-    if fmt == "csv":
-        df = pd.read_csv(path)
-    elif fmt == "parquet":
-        df = pd.read_parquet(path)
-    elif fmt == "hdf5":
-        df = pd.read_hdf(path, key=hdf_key)
-    else:
-        raise ValueError(f"Unsupported format: {fmt}")
-    return df
-
-
-def _validate_columns(df: pd.DataFrame, path: Path) -> pd.DataFrame:
-    """Ensure required columns are present and normalise names."""
-    cols_lower = {c.lower().strip(): c for c in df.columns}
-    rename_map: dict[str, str] = {}
-    missing: list[str] = []
-    for req in REQUIRED_COLUMNS:
-        if req in df.columns:
-            continue
-        candidates = [req, *COLUMN_ALIASES.get(req, [])]
-        matched = None
-        for candidate in candidates:
-            original = cols_lower.get(candidate.lower().strip())
-            if original is not None:
-                matched = original
-                break
-        if matched is None:
-            missing.append(req)
-            continue
-        rename_map[matched] = req
-    if missing:
-        raise ValueError(
-            f"File {path} is missing required columns: {missing}. "
-            f"Found: {list(df.columns)}"
-        )
-    if rename_map:
-        df = df.rename(columns=rename_map)
-    return df
-
-
-def _coerce_types(df: pd.DataFrame) -> pd.DataFrame:
-    """Ensure numeric types for OHLCV columns and datetime index."""
-    df["datetime"] = pd.to_datetime(df["datetime"])
-    df["asset_id"] = df["asset_id"].astype(str)
-    for col in OHLCV_COLUMNS:
-        df[col] = pd.to_numeric(df[col], errors="coerce")
-    return df
-
-
-def load_market_data(
-    path: Union[str, Path],
-    fmt: Optional[FileFormat] = None,
-    universe: Optional[str] = None,
-    asset_ids: Optional[Sequence[str]] = None,
-    start: Optional[str] = None,
-    end: Optional[str] = None,
-    hdf_key: str = "data",
-) -> pd.DataFrame:
-    """Load market data from a single file.
-
-    Parameters
-    ----------
-    path : str or Path
-        File path to the data source.
-    fmt : str, optional
-        File format (``"csv"``, ``"parquet"``, ``"hdf5"``). Inferred from
-        the file extension when *None*.
-    universe : str, optional
-        Asset universe filter (e.g. ``"CSI500"``). Only assets belonging to
-        the universe are kept. Requires an ``"universe"`` column in the data.
-    asset_ids : sequence of str, optional
-        Explicit list of asset identifiers to retain.
-    start, end : str, optional
-        ISO-formatted datetime strings for temporal filtering.
-    hdf_key : str
-        HDF5 dataset key (default ``"data"``).
-
-    Returns
-    -------
-    pd.DataFrame
-        Sorted DataFrame with columns from :data:`REQUIRED_COLUMNS` plus any
-        extras present in the source file.
-    """
-    path = Path(path)
-    if not path.exists():
-        raise FileNotFoundError(f"Data file not found: {path}")
-
-    if fmt is None:
-        fmt = _infer_format(path)
-
-    logger.info("Loading %s from %s", fmt, path)
-    df = _read_file(path, fmt, hdf_key=hdf_key)
-    df = _validate_columns(df, path)
-    df = _coerce_types(df)
-
-    # Universe filter
-    if universe is not None:
-        canon = UNIVERSE_ALIASES.get(universe.lower(), universe)
-        if "universe" in df.columns:
-            df = df[df["universe"] == canon]
-            logger.info("Filtered to universe %s: %d rows", canon, len(df))
-        else:
-            logger.warning(
-                "Universe filter '%s' requested but no 'universe' column found; "
-                "filter skipped.",
-                canon,
-            )
-
-    # Explicit asset filter
-    if asset_ids is not None:
-        asset_set = set(str(a) for a in asset_ids)
-        df = df[df["asset_id"].isin(asset_set)]
-
-    # Temporal filter
-    if start is not None:
-        df = df[df["datetime"] >= pd.Timestamp(start)]
-    if end is not None:
-        df = df[df["datetime"] <= pd.Timestamp(end)]
-
-    df = df.sort_values(["datetime", "asset_id"]).reset_index(drop=True)
-    logger.info("Loaded %d rows, %d assets", len(df), df["asset_id"].nunique())
-    return df
-
-
-def load_multiple(
-    paths: Sequence[Union[str, Path]],
-    fmt: Optional[FileFormat] = None,
-    **kwargs,
-) -> pd.DataFrame:
-    """Load and concatenate market data from multiple files.
-
-    All keyword arguments are forwarded to :func:`load_market_data`.
-    """
-    frames: list[pd.DataFrame] = []
-    for p in paths:
-        frames.append(load_market_data(p, fmt=fmt, **kwargs))
-    if not frames:
-        raise ValueError("No files provided to load_multiple")
-    df = pd.concat(frames, ignore_index=True)
-    df = df.sort_values(["datetime", "asset_id"]).reset_index(drop=True)
-    return df
-
-
-def to_numpy(
-    df: pd.DataFrame,
-    columns: Optional[Sequence[str]] = None,
-) -> np.ndarray:
-    """Convert a DataFrame to a numpy array of the specified columns.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Market data DataFrame.
-    columns : sequence of str, optional
-        Columns to include.  Defaults to :data:`OHLCV_COLUMNS`.
-
-    Returns
-    -------
-    np.ndarray
-        2-D float64 array of shape ``(n_rows, n_columns)``.
-    """
-    if columns is None:
-        columns = OHLCV_COLUMNS
-    return df[list(columns)].to_numpy(dtype=np.float64)
diff --git a/src/factorminer/factorminer/data/mock_data.py b/src/factorminer/factorminer/data/mock_data.py
deleted file mode 100644
index 64a892e..0000000
--- a/src/factorminer/factorminer/data/mock_data.py
+++ /dev/null
@@ -1,323 +0,0 @@
-"""Generate realistic synthetic market data for testing FactorMiner.
-
-Produces multi-asset OHLCV data with:
-- Volume clustering (GARCH-like)
-- Volatility clustering
-- Cross-sectional correlation via a common market factor
-- Planted alpha signals for validating factor discovery
-- OHLC consistency guarantees: low <= open,close <= high
-"""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass
-from typing import Literal, Optional
-
-import numpy as np
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-Frequency = Literal["10min", "30min", "1h", "1d"]
-
-_FREQ_MAP = {
-    "10min": "10min",
-    "30min": "30min",
-    "1h": "1h",
-    "1d": "1D",
-}
-
-
-@dataclass
-class MockConfig:
-    """Configuration for synthetic data generation.
-
-    Attributes
-    ----------
-    num_assets : int
-        Number of assets (M).
-    num_periods : int
-        Number of time bars (T) per asset.
-    frequency : str
-        Bar frequency: ``"10min"``, ``"30min"``, ``"1h"``, ``"1d"``.
-    start_date : str
-        Start datetime in ISO format.
-    base_price : float
-        Initial price level around which assets are generated.
-    annual_vol : float
-        Annualised volatility for the diffusion process.
-    market_factor_weight : float
-        Weight of the common market factor in returns (0-1).
-        Higher values increase cross-sectional correlation.
-    vol_persistence : float
-        GARCH(1,1) persistence parameter for volatility clustering (0-1).
-    volume_mean : float
-        Mean daily volume per asset.
-    volume_persistence : float
-        AR(1) coefficient for volume clustering (0-1).
-    plant_alpha : bool
-        Whether to inject planted alpha signals.
-    alpha_strength : float
-        Signal-to-noise ratio of the planted alpha.
-    alpha_assets_frac : float
-        Fraction of assets that carry the planted signal.
-    seed : int
-        Random seed for reproducibility.
-    universe : str or None
-        Universe label to include in the output.
-    """
-
-    num_assets: int = 50
-    num_periods: int = 1000
-    frequency: Frequency = "10min"
-    start_date: str = "2024-01-02 09:30:00"
-    base_price: float = 50.0
-    annual_vol: float = 0.25
-    market_factor_weight: float = 0.3
-    vol_persistence: float = 0.9
-    volume_mean: float = 1_000_000.0
-    volume_persistence: float = 0.85
-    plant_alpha: bool = True
-    alpha_strength: float = 0.02
-    alpha_assets_frac: float = 0.2
-    seed: int = 42
-    universe: Optional[str] = None
-
-
-def _bars_per_year(freq: Frequency) -> float:
-    """Approximate number of bars in a trading year."""
-    trading_days = 252
-    bars_per_day = {
-        "10min": 24,   # 4h session / 10min
-        "30min": 8,
-        "1h": 4,
-        "1d": 1,
-    }
-    return trading_days * bars_per_day[freq]
-
-
-def _generate_timestamps(
-    start: str,
-    num_periods: int,
-    freq: Frequency,
-) -> pd.DatetimeIndex:
-    """Create a business-aware timestamp index.
-
-    For intraday frequencies the index skips weekends and only covers
-    a simplified trading session (09:30 - 15:00 for 10min/30min bars).
-    """
-    pd_freq = _FREQ_MAP[freq]
-    if freq == "1d":
-        ts = pd.bdate_range(start=start, periods=num_periods, freq="B")
-    else:
-        # Generate enough intraday bars, then trim to num_periods
-        days_needed = (num_periods // 24) + 10  # generous overestimate
-        day_range = pd.bdate_range(start=start, periods=days_needed, freq="B")
-        bars: list[pd.Timestamp] = []
-        for day in day_range:
-            session_start = day.replace(hour=9, minute=30, second=0)
-            session_end = day.replace(hour=15, minute=0, second=0)
-            day_bars = pd.date_range(session_start, session_end, freq=pd_freq)
-            # Exclude the exact session end for cleaner bars
-            day_bars = day_bars[day_bars < session_end]
-            bars.extend(day_bars.tolist())
-            if len(bars) >= num_periods:
-                break
-        ts = pd.DatetimeIndex(bars[:num_periods])
-    return ts
-
-
-def generate_mock_data(config: Optional[MockConfig] = None) -> pd.DataFrame:
-    """Generate synthetic multi-asset OHLCV + amount data.
-
-    Parameters
-    ----------
-    config : MockConfig, optional
-        Generation parameters.  Uses defaults when *None*.
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with columns: datetime, asset_id, open, high, low,
-        close, volume, amount.  Optionally includes ``universe``.
-    """
-    if config is None:
-        config = MockConfig()
-
-    rng = np.random.default_rng(config.seed)
-    M = config.num_assets
-    T = config.num_periods
-
-    logger.info("Generating mock data: %d assets x %d periods @ %s", M, T, config.frequency)
-
-    timestamps = _generate_timestamps(config.start_date, T, config.frequency)
-    T = len(timestamps)  # may be shorter if we ran out of session bars
-
-    # Per-bar volatility (annualised -> per-bar)
-    bar_vol = config.annual_vol / np.sqrt(_bars_per_year(config.frequency))
-
-    # ---------------------------------------------------------------
-    # Common market factor (drives cross-sectional correlation)
-    # ---------------------------------------------------------------
-    market_returns = rng.normal(0, bar_vol, size=T)
-
-    # ---------------------------------------------------------------
-    # Per-asset paths
-    # ---------------------------------------------------------------
-    asset_ids = [f"ASSET_{i:04d}" for i in range(M)]
-
-    # Storage
-    all_open = np.empty((M, T))
-    all_high = np.empty((M, T))
-    all_low = np.empty((M, T))
-    all_close = np.empty((M, T))
-    all_volume = np.empty((M, T))
-    all_amount = np.empty((M, T))
-
-    # Planted alpha: select a subset of assets
-    n_alpha = max(1, int(M * config.alpha_assets_frac))
-    alpha_assets = set(rng.choice(M, size=n_alpha, replace=False).tolist()) if config.plant_alpha else set()
-
-    for i in range(M):
-        # Initial price with some dispersion
-        p0 = config.base_price * np.exp(rng.normal(0, 0.3))
-
-        # GARCH-like stochastic volatility
-        sigma = np.empty(T)
-        sigma[0] = bar_vol
-        for t in range(1, T):
-            sigma[t] = (
-                bar_vol * (1 - config.vol_persistence)
-                + config.vol_persistence * sigma[t - 1]
-                + rng.normal(0, bar_vol * 0.1)
-            )
-            sigma[t] = max(sigma[t], bar_vol * 0.2)  # floor
-
-        # Idiosyncratic returns
-        idio = rng.normal(0, 1, size=T) * sigma
-
-        # Combine with market factor
-        w = config.market_factor_weight
-        returns = w * market_returns + (1 - w) * idio
-
-        # Plant alpha signal: small positive drift in returns
-        if i in alpha_assets:
-            # Signal: positive drift correlated with lagged volume momentum
-            alpha_drift = config.alpha_strength * bar_vol
-            returns += alpha_drift
-
-        # Cumulative price path (close prices)
-        log_price = np.log(p0) + np.cumsum(returns)
-        close = np.exp(log_price)
-
-        # Generate intra-bar OHLC from close
-        # Open = previous close + small gap noise
-        open_ = np.empty(T)
-        open_[0] = p0
-        open_[1:] = close[:-1] * np.exp(rng.normal(0, bar_vol * 0.1, size=T - 1))
-
-        # Intra-bar high/low
-        intra_range = np.abs(rng.normal(0, sigma * 0.5, size=T))
-        mid = (open_ + close) / 2
-        high = np.maximum(open_, close) + intra_range
-        low = np.minimum(open_, close) - intra_range
-        low = np.maximum(low, mid * 0.9)  # prevent negative or absurd lows
-
-        # Enforce OHLC consistency
-        high = np.maximum(high, np.maximum(open_, close))
-        low = np.minimum(low, np.minimum(open_, close))
-        low = np.maximum(low, 0.01)  # price floor
-
-        # Volume: AR(1) with log-normal noise
-        log_vol = np.empty(T)
-        log_vol_mean = np.log(config.volume_mean)
-        log_vol[0] = log_vol_mean + rng.normal(0, 0.5)
-        for t in range(1, T):
-            log_vol[t] = (
-                log_vol_mean * (1 - config.volume_persistence)
-                + config.volume_persistence * log_vol[t - 1]
-                + rng.normal(0, 0.3)
-            )
-        volume = np.exp(log_vol).astype(np.float64)
-
-        # Amount = volume * vwap (approximate vwap as midpoint)
-        vwap_est = (high + low + close) / 3
-        amount = volume * vwap_est
-
-        all_open[i] = open_
-        all_high[i] = high
-        all_low[i] = low
-        all_close[i] = close
-        all_volume[i] = np.round(volume)
-        all_amount[i] = amount
-
-    # ---------------------------------------------------------------
-    # Assemble DataFrame
-    # ---------------------------------------------------------------
-    records = []
-    for i in range(M):
-        asset_df = pd.DataFrame({
-            "datetime": timestamps,
-            "asset_id": asset_ids[i],
-            "open": all_open[i],
-            "high": all_high[i],
-            "low": all_low[i],
-            "close": all_close[i],
-            "volume": all_volume[i],
-            "amount": all_amount[i],
-        })
-        records.append(asset_df)
-
-    df = pd.concat(records, ignore_index=True)
-
-    if config.universe is not None:
-        df["universe"] = config.universe
-
-    df = df.sort_values(["datetime", "asset_id"]).reset_index(drop=True)
-
-    logger.info(
-        "Generated %d rows: %d assets x %d periods, planted alpha in %d assets",
-        len(df),
-        M,
-        T,
-        len(alpha_assets),
-    )
-    return df
-
-
-def generate_with_halts(
-    config: Optional[MockConfig] = None,
-    halt_fraction: float = 0.01,
-) -> pd.DataFrame:
-    """Generate mock data with simulated trading halts.
-
-    A fraction of (asset, time) pairs are converted to halt bars:
-    open = high = low = close = last valid close, volume = 0, amount = 0.
-
-    Parameters
-    ----------
-    config : MockConfig, optional
-        Generation parameters.
-    halt_fraction : float
-        Fraction of bars to convert to halts.
-    """
-    df = generate_mock_data(config)
-    if config is None:
-        config = MockConfig()
-    rng = np.random.default_rng(config.seed + 1)
-
-    n = len(df)
-    n_halt = int(n * halt_fraction)
-    halt_idx = rng.choice(n, size=n_halt, replace=False)
-
-    df.loc[halt_idx, "volume"] = 0
-    df.loc[halt_idx, "amount"] = 0
-    # Flatten OHLC to close (simulating last traded price)
-    halt_price = df.loc[halt_idx, "close"]
-    df.loc[halt_idx, "open"] = halt_price
-    df.loc[halt_idx, "high"] = halt_price
-    df.loc[halt_idx, "low"] = halt_price
-
-    logger.info("Injected %d halt bars (%.2f%%)", n_halt, 100 * halt_fraction)
-    return df
diff --git a/src/factorminer/factorminer/data/preprocessor.py b/src/factorminer/factorminer/data/preprocessor.py
deleted file mode 100644
index 3a2b429..0000000
--- a/src/factorminer/factorminer/data/preprocessor.py
+++ /dev/null
@@ -1,364 +0,0 @@
-"""Data preprocessing pipeline for FactorMiner.
-
-Handles derived feature computation, missing data imputation, trading halt
-detection, cross-sectional standardisation, winsorisation, and quality checks.
-"""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass, field
-from typing import Optional, Sequence
-
-import numpy as np
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class PreprocessConfig:
-    """Configuration for the preprocessing pipeline.
-
-    Attributes
-    ----------
-    winsor_lower : float
-        Lower percentile for winsorisation (0-100).
-    winsor_upper : float
-        Upper percentile for winsorisation (0-100).
-    min_nonnan_ratio : float
-        Minimum fraction of non-NaN values required per cross-section
-        for a time step to be kept.
-    ffill_limit : int or None
-        Maximum number of consecutive NaN values to forward-fill within
-        each intraday session.
-    cross_fill_method : str
-        Cross-sectional fill method after forward fill.
-        ``"median"`` or ``"mean"``.
-    standardise : bool
-        Whether to apply cross-sectional z-score standardisation.
-    halt_volume_threshold : float
-        Volume below this value flags a bar as a trading halt.
-    features_to_standardise : list of str
-        Column names subject to standardisation and winsorisation.
-    """
-
-    winsor_lower: float = 1.0
-    winsor_upper: float = 99.0
-    min_nonnan_ratio: float = 0.5
-    ffill_limit: Optional[int] = None
-    cross_fill_method: str = "median"
-    standardise: bool = True
-    halt_volume_threshold: float = 0.0
-    features_to_standardise: list[str] = field(default_factory=lambda: [
-        "open", "high", "low", "close", "volume", "amount", "vwap", "returns",
-    ])
-
-
-# ---------------------------------------------------------------------------
-# Derived features
-# ---------------------------------------------------------------------------
-
-def compute_vwap(df: pd.DataFrame) -> pd.DataFrame:
-    """Add ``vwap`` column: amount / volume.  NaN when volume is zero."""
-    df = df.copy()
-    df["vwap"] = np.where(
-        df["volume"] > 0,
-        df["amount"] / df["volume"],
-        np.nan,
-    )
-    return df
-
-
-def compute_returns(df: pd.DataFrame) -> pd.DataFrame:
-    """Add ``returns`` column: close-to-close percentage change per asset.
-
-    Returns are computed as ``close[t] / close[t-1] - 1`` within each asset.
-    The first observation per asset is NaN.
-    """
-    df = df.copy()
-    df = df.sort_values(["asset_id", "datetime"])
-    df["returns"] = df.groupby("asset_id")["close"].pct_change()
-    return df
-
-
-def compute_derived_features(df: pd.DataFrame) -> pd.DataFrame:
-    """Compute all derived features (vwap and returns)."""
-    df = compute_vwap(df)
-    df = compute_returns(df)
-    return df
-
-
-# ---------------------------------------------------------------------------
-# Trading halt handling
-# ---------------------------------------------------------------------------
-
-def flag_halts(
-    df: pd.DataFrame,
-    volume_threshold: float = 0.0,
-) -> pd.DataFrame:
-    """Add boolean ``is_halt`` column.
-
-    A bar is considered a trading halt when:
-    - Volume is exactly zero (or below *volume_threshold*), **and**
-    - open == high == low == close (no price movement).
-    """
-    df = df.copy()
-    zero_volume = df["volume"] <= volume_threshold
-    flat_price = (
-        (df["open"] == df["high"])
-        & (df["high"] == df["low"])
-        & (df["low"] == df["close"])
-    )
-    df["is_halt"] = zero_volume & flat_price
-    n_halt = df["is_halt"].sum()
-    if n_halt > 0:
-        logger.info("Flagged %d halt bars (%.2f%%)", n_halt, 100 * n_halt / len(df))
-    return df
-
-
-def mask_halts(df: pd.DataFrame) -> pd.DataFrame:
-    """Set OHLCV and derived columns to NaN for halted bars."""
-    if "is_halt" not in df.columns:
-        return df
-    df = df.copy()
-    mask = df["is_halt"]
-    cols_to_nan = [
-        c for c in ["open", "high", "low", "close", "volume", "amount", "vwap", "returns"]
-        if c in df.columns
-    ]
-    df.loc[mask, cols_to_nan] = np.nan
-    return df
-
-
-# ---------------------------------------------------------------------------
-# Missing data handling
-# ---------------------------------------------------------------------------
-
-def _extract_date(dt_series: pd.Series) -> pd.Series:
-    """Return the date component of a datetime series."""
-    return dt_series.dt.date
-
-
-def fill_missing(
-    df: pd.DataFrame,
-    ffill_limit: Optional[int] = None,
-    cross_fill_method: str = "median",
-    columns: Optional[Sequence[str]] = None,
-) -> pd.DataFrame:
-    """Fill missing values using a two-stage strategy.
-
-    Stage 1 – Forward fill within each (asset, date) group so that NaNs
-    from halts / gaps are filled from the last valid intraday observation.
-
-    Stage 2 – Cross-sectional fill: remaining NaNs in each time step are
-    replaced with the cross-sectional median (or mean).
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Must contain ``datetime`` and ``asset_id`` columns.
-    ffill_limit : int or None
-        Max consecutive NaN values to forward-fill.
-    cross_fill_method : str
-        ``"median"`` or ``"mean"`` for the cross-sectional stage.
-    columns : sequence of str, optional
-        Columns to fill.  Defaults to numeric columns.
-    """
-    df = df.copy()
-    if columns is None:
-        columns = df.select_dtypes(include=[np.number]).columns.tolist()
-    columns = [c for c in columns if c in df.columns]
-
-    # Stage 1: forward fill within (asset, date)
-    df["_date"] = _extract_date(df["datetime"])
-    for col in columns:
-        df[col] = df.groupby(["asset_id", "_date"])[col].transform(
-            lambda s: s.ffill(limit=ffill_limit)
-        )
-
-    # Stage 2: cross-sectional fill per datetime
-    if cross_fill_method == "median":
-        agg_func = "median"
-    elif cross_fill_method == "mean":
-        agg_func = "mean"
-    else:
-        raise ValueError(f"Unknown cross_fill_method: {cross_fill_method}")
-
-    for col in columns:
-        cross_vals = df.groupby("datetime")[col].transform(agg_func)
-        df[col] = df[col].fillna(cross_vals)
-
-    df = df.drop(columns=["_date"])
-    return df
-
-
-# ---------------------------------------------------------------------------
-# Winsorisation
-# ---------------------------------------------------------------------------
-
-def winsorise(
-    df: pd.DataFrame,
-    columns: Sequence[str],
-    lower: float = 1.0,
-    upper: float = 99.0,
-) -> pd.DataFrame:
-    """Clip values in *columns* to the [lower, upper] percentile range
-    computed cross-sectionally at each time step.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Must contain a ``datetime`` column.
-    columns : sequence of str
-        Columns to winsorise.
-    lower, upper : float
-        Percentile bounds (0-100).
-    """
-    df = df.copy()
-    columns = [c for c in columns if c in df.columns]
-
-    for col in columns:
-        lo = df.groupby("datetime")[col].transform(
-            lambda s: np.nanpercentile(s, lower) if s.notna().any() else np.nan
-        )
-        hi = df.groupby("datetime")[col].transform(
-            lambda s: np.nanpercentile(s, upper) if s.notna().any() else np.nan
-        )
-        df[col] = df[col].clip(lower=lo, upper=hi)
-    return df
-
-
-# ---------------------------------------------------------------------------
-# Cross-sectional standardisation
-# ---------------------------------------------------------------------------
-
-def cross_sectional_standardise(
-    df: pd.DataFrame,
-    columns: Sequence[str],
-) -> pd.DataFrame:
-    """Z-score standardise *columns* cross-sectionally at each time step.
-
-    ``x_std = (x - mean) / std`` where mean and std are computed across
-    all assets at the same datetime.  Groups with std == 0 are set to 0.
-    """
-    df = df.copy()
-    columns = [c for c in columns if c in df.columns]
-
-    for col in columns:
-        grp = df.groupby("datetime")[col]
-        mu = grp.transform("mean")
-        sigma = grp.transform("std")
-        sigma = sigma.replace(0, np.nan)
-        df[col] = (df[col] - mu) / sigma
-        df[col] = df[col].fillna(0.0)
-    return df
-
-
-# ---------------------------------------------------------------------------
-# Quality checks
-# ---------------------------------------------------------------------------
-
-def quality_check(
-    df: pd.DataFrame,
-    min_nonnan_ratio: float = 0.5,
-    columns: Optional[Sequence[str]] = None,
-) -> pd.DataFrame:
-    """Drop time steps where the fraction of non-NaN values across assets
-    is below *min_nonnan_ratio*.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Market data with ``datetime`` and ``asset_id``.
-    min_nonnan_ratio : float
-        Minimum fraction (0-1) of assets with valid data at each time step.
-    columns : sequence of str, optional
-        Columns to check.  Defaults to OHLCV columns.
-
-    Returns
-    -------
-    pd.DataFrame
-        Filtered DataFrame with low-coverage time steps removed.
-    """
-    if columns is None:
-        columns = [c for c in ["open", "high", "low", "close", "volume"] if c in df.columns]
-
-    n_assets = df["asset_id"].nunique()
-    if n_assets == 0:
-        return df
-
-    # Count non-NaN per datetime
-    checks = df.groupby("datetime")[list(columns)].apply(
-        lambda g: g.notna().all(axis=1).sum() / n_assets
-    )
-    valid_dts = checks[checks >= min_nonnan_ratio].index
-    before = df["datetime"].nunique()
-    df = df[df["datetime"].isin(valid_dts)]
-    after = df["datetime"].nunique()
-    if before > after:
-        logger.info(
-            "Quality check removed %d/%d time steps (min_nonnan_ratio=%.2f)",
-            before - after,
-            before,
-            min_nonnan_ratio,
-        )
-    return df.reset_index(drop=True)
-
-
-# ---------------------------------------------------------------------------
-# Full pipeline
-# ---------------------------------------------------------------------------
-
-def preprocess(
-    df: pd.DataFrame,
-    config: Optional[PreprocessConfig] = None,
-) -> pd.DataFrame:
-    """Run the full preprocessing pipeline.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Raw market data with at least the columns: datetime, asset_id,
-        open, high, low, close, volume, amount.
-    config : PreprocessConfig, optional
-        Pipeline configuration.  Uses defaults when *None*.
-
-    Returns
-    -------
-    pd.DataFrame
-        Preprocessed DataFrame with derived features, cleaned and
-        standardised values.
-    """
-    if config is None:
-        config = PreprocessConfig()
-
-    logger.info("Preprocessing %d rows ...", len(df))
-
-    # 1. Derive features
-    df = compute_derived_features(df)
-
-    # 2. Flag and mask trading halts
-    df = flag_halts(df, volume_threshold=config.halt_volume_threshold)
-    df = mask_halts(df)
-
-    # 3. Fill missing data
-    df = fill_missing(
-        df,
-        ffill_limit=config.ffill_limit,
-        cross_fill_method=config.cross_fill_method,
-    )
-
-    # 4. Quality check
-    df = quality_check(df, min_nonnan_ratio=config.min_nonnan_ratio)
-
-    # 5. Winsorise
-    feat_cols = [c for c in config.features_to_standardise if c in df.columns]
-    df = winsorise(df, columns=feat_cols, lower=config.winsor_lower, upper=config.winsor_upper)
-
-    # 6. Cross-sectional standardisation
-    if config.standardise:
-        df = cross_sectional_standardise(df, columns=feat_cols)
-
-    logger.info("Preprocessing complete: %d rows, %d columns", len(df), len(df.columns))
-    return df
diff --git a/src/factorminer/factorminer/data/tensor_builder.py b/src/factorminer/factorminer/data/tensor_builder.py
deleted file mode 100644
index 03cdee8..0000000
--- a/src/factorminer/factorminer/data/tensor_builder.py
+++ /dev/null
@@ -1,505 +0,0 @@
-"""Build the data tensor D in R^(M x T x F) for FactorMiner.
-
-Converts preprocessed panel data into dense 3-D arrays indexed by
-(assets, time_periods, features).  Supports numpy and optional torch backends.
-"""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass, field
-from typing import Literal, Optional, Sequence, Tuple, Union
-
-import numpy as np
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-# Default feature ordering matching the paper specification
-DEFAULT_FEATURES: list[str] = [
-    "open", "high", "low", "close", "volume", "amount", "vwap", "returns",
-]
-
-Backend = Literal["numpy", "torch", "cupy"]
-
-
-@dataclass(frozen=True)
-class TargetSpec:
-    """Definition of one aligned forward-return target."""
-
-    name: str
-    entry_delay_bars: int
-    holding_bars: int
-    price_pair: str = "open_to_close"
-    return_transform: str = "simple"
-
-    @property
-    def column_name(self) -> str:
-        return "target" if self.name == "paper" else f"target_{self.name}"
-
-
-@dataclass
-class TensorConfig:
-    """Configuration for tensor construction.
-
-    Attributes
-    ----------
-    features : list of str
-        Ordered feature columns to include in the tensor.
-    backend : str
-        ``"numpy"``, ``"torch"``, or ``"cupy"``.
-    dtype : str
-        Numeric dtype string (e.g. ``"float32"``).
-    train_end : str or None
-        Inclusive upper bound for the training period (ISO datetime).
-    test_start : str or None
-        Inclusive lower bound for the test period (ISO datetime).
-    m_fast : int or None
-        Number of assets for the fast screening subset.  When *None*,
-        no fast subset is produced.
-    seed : int
-        Random seed for reproducible asset sampling.
-    target_column : str
-        Name of the column holding the target variable (created by
-        :func:`compute_target`).
-    """
-
-    features: list[str] = field(default_factory=lambda: list(DEFAULT_FEATURES))
-    backend: Backend = "numpy"
-    dtype: str = "float32"
-    train_end: Optional[str] = None
-    test_start: Optional[str] = None
-    m_fast: Optional[int] = None
-    seed: int = 42
-    target_column: str = "target"
-    target_columns: list[str] = field(default_factory=list)
-    default_target: str = "target"
-
-
-# ---------------------------------------------------------------------------
-# Target variable
-# ---------------------------------------------------------------------------
-
-def compute_target(df: pd.DataFrame) -> pd.DataFrame:
-    """Compute the target: next-bar open-to-close return.
-
-    For each asset the target at time *t* is defined as::
-
-        target[t] = close[t+1] / open[t+1] - 1
-
-    The last bar of each asset has a NaN target.
-    """
-    return compute_targets(
-        df,
-        [
-            TargetSpec(
-                name="paper",
-                entry_delay_bars=1,
-                holding_bars=1,
-                price_pair="open_to_close",
-                return_transform="simple",
-            )
-        ],
-    )
-
-
-def compute_targets(
-    df: pd.DataFrame,
-    target_specs: Sequence[TargetSpec],
-) -> pd.DataFrame:
-    """Compute one or more named forward-return targets on the same panel."""
-    df = df.sort_values(["asset_id", "datetime"]).copy()
-
-    for spec in target_specs:
-        start_col, end_col, start_offset, end_offset = _resolve_target_offsets(spec)
-        start_values = df.groupby("asset_id")[start_col].shift(-start_offset)
-        end_values = df.groupby("asset_id")[end_col].shift(-end_offset)
-        if spec.return_transform == "log":
-            df[spec.column_name] = np.log(end_values / start_values)
-        else:
-            df[spec.column_name] = end_values / start_values - 1.0
-
-    return df
-
-
-def _resolve_target_offsets(spec: TargetSpec) -> tuple[str, str, int, int]:
-    """Map a target spec to start/end price columns and offsets."""
-    if spec.entry_delay_bars < 0 or spec.holding_bars < 0:
-        raise ValueError("TargetSpec entry_delay_bars and holding_bars must be >= 0")
-
-    if spec.price_pair == "open_to_close":
-        if spec.holding_bars < 1:
-            raise ValueError("open_to_close targets require holding_bars >= 1")
-        return (
-            "open",
-            "close",
-            spec.entry_delay_bars,
-            spec.entry_delay_bars + spec.holding_bars - 1,
-        )
-    if spec.price_pair == "close_to_close":
-        if spec.holding_bars < 1:
-            raise ValueError("close_to_close targets require holding_bars >= 1")
-        return (
-            "close",
-            "close",
-            spec.entry_delay_bars,
-            spec.entry_delay_bars + spec.holding_bars,
-        )
-    if spec.price_pair == "open_to_open":
-        if spec.holding_bars < 1:
-            raise ValueError("open_to_open targets require holding_bars >= 1")
-        return (
-            "open",
-            "open",
-            spec.entry_delay_bars,
-            spec.entry_delay_bars + spec.holding_bars,
-        )
-    if spec.price_pair == "close_to_open":
-        if spec.holding_bars < 1:
-            raise ValueError("close_to_open targets require holding_bars >= 1")
-        return (
-            "close",
-            "open",
-            spec.entry_delay_bars,
-            spec.entry_delay_bars + spec.holding_bars,
-        )
-    raise ValueError(f"Unknown TargetSpec price_pair: {spec.price_pair}")
-
-
-# ---------------------------------------------------------------------------
-# Tensor construction helpers
-# ---------------------------------------------------------------------------
-
-def _to_backend(arr: np.ndarray, backend: Backend, dtype: str):
-    """Convert a numpy array to the requested backend."""
-    np_dtype = getattr(np, dtype, np.float32)
-    arr = arr.astype(np_dtype)
-
-    if backend == "numpy":
-        return arr
-
-    if backend == "torch":
-        try:
-            import torch
-        except ImportError as exc:
-            raise ImportError(
-                "PyTorch is required for backend='torch'. "
-                "Install with: pip install torch"
-            ) from exc
-        torch_dtype = getattr(torch, dtype, torch.float32)
-        return torch.from_numpy(arr).to(torch_dtype)
-
-    if backend == "cupy":
-        try:
-            import cupy  # type: ignore[import-untyped]
-        except ImportError as exc:
-            raise ImportError(
-                "CuPy is required for backend='cupy'. "
-                "Install with: pip install cupy"
-            ) from exc
-        return cupy.asarray(arr, dtype=dtype)
-
-    raise ValueError(f"Unknown backend: {backend}")
-
-
-def _build_3d(
-    df: pd.DataFrame,
-    asset_ids: np.ndarray,
-    timestamps: np.ndarray,
-    columns: Sequence[str],
-) -> np.ndarray:
-    """Pivot panel data into a dense (M, T, F) numpy array."""
-    M = len(asset_ids)
-    T = len(timestamps)
-    F = len(columns)
-    tensor = np.full((M, T, F), np.nan, dtype=np.float64)
-
-    asset_map = {a: i for i, a in enumerate(asset_ids)}
-    time_map = {t: j for j, t in enumerate(timestamps)}
-
-    df_idx = df.copy()
-    df_idx["_ai"] = df_idx["asset_id"].map(asset_map)
-    df_idx["_ti"] = df_idx["datetime"].map(time_map)
-    df_idx = df_idx.dropna(subset=["_ai", "_ti"])
-    df_idx["_ai"] = df_idx["_ai"].astype(int)
-    df_idx["_ti"] = df_idx["_ti"].astype(int)
-
-    values = df_idx[list(columns)].to_numpy(dtype=np.float64)
-    tensor[df_idx["_ai"].values, df_idx["_ti"].values, :] = values
-
-    return tensor
-
-
-# ---------------------------------------------------------------------------
-# Public API
-# ---------------------------------------------------------------------------
-
-@dataclass
-class TensorDataset:
-    """Container for the built tensor and associated metadata.
-
-    Attributes
-    ----------
-    data : array-like
-        Feature tensor of shape ``(M, T, F)``.
-    target : array-like or None
-        Target array of shape ``(M, T)``.
-    asset_ids : np.ndarray
-        Asset identifier for each row in the first axis.
-    timestamps : np.ndarray
-        Datetime for each position in the second axis.
-    feature_names : list of str
-        Feature name for each slice in the third axis.
-    """
-
-    data: object  # np.ndarray | torch.Tensor | cupy.ndarray
-    target: object  # same type or None
-    asset_ids: np.ndarray
-    timestamps: np.ndarray
-    feature_names: list[str]
-    targets: dict[str, object] = field(default_factory=dict)
-    default_target: str = "target"
-
-
-def build_tensor(
-    df: pd.DataFrame,
-    config: Optional[TensorConfig] = None,
-) -> TensorDataset:
-    """Build a dense 3-D tensor from preprocessed panel data.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Preprocessed market data.  Must include ``datetime``, ``asset_id``,
-        and all columns listed in ``config.features``.
-    config : TensorConfig, optional
-        Build configuration.  Uses defaults when *None*.
-
-    Returns
-    -------
-    TensorDataset
-        Dense tensor and metadata.
-    """
-    if config is None:
-        config = TensorConfig()
-
-    # Validate required feature columns
-    missing = [f for f in config.features if f not in df.columns]
-    if missing:
-        raise ValueError(f"DataFrame is missing feature columns: {missing}")
-
-    # Sorted unique axes
-    asset_ids = np.sort(df["asset_id"].unique())
-    timestamps = np.sort(df["datetime"].unique())
-
-    logger.info(
-        "Building tensor: %d assets x %d time steps x %d features",
-        len(asset_ids),
-        len(timestamps),
-        len(config.features),
-    )
-
-    data_np = _build_3d(df, asset_ids, timestamps, config.features)
-
-    # Target
-    resolved_target_columns = list(config.target_columns or [config.target_column])
-    target_arrays_np: dict[str, np.ndarray] = {}
-    for target_column in resolved_target_columns:
-        if target_column not in df.columns:
-            continue
-        target_np = _build_3d(df, asset_ids, timestamps, [target_column])
-        target_arrays_np[target_column] = target_np[:, :, 0]
-
-    target_np: Optional[np.ndarray] = None
-    default_target_name = config.default_target
-    if target_arrays_np:
-        target_np = target_arrays_np.get(default_target_name)
-        if target_np is None:
-            first_target = next(iter(target_arrays_np))
-            target_np = target_arrays_np[first_target]
-            default_target_name = first_target
-
-    data = _to_backend(data_np, config.backend, config.dtype)
-    target = _to_backend(target_np, config.backend, config.dtype) if target_np is not None else None
-    targets = {
-        name: _to_backend(target_arr, config.backend, config.dtype)
-        for name, target_arr in target_arrays_np.items()
-    }
-
-    return TensorDataset(
-        data=data,
-        target=target,
-        asset_ids=asset_ids,
-        timestamps=timestamps,
-        feature_names=list(config.features),
-        targets=targets,
-        default_target=default_target_name,
-    )
-
-
-# ---------------------------------------------------------------------------
-# Temporal split
-# ---------------------------------------------------------------------------
-
-def temporal_split(
-    ds: TensorDataset,
-    train_end: Optional[str] = None,
-    test_start: Optional[str] = None,
-) -> Tuple[TensorDataset, TensorDataset]:
-    """Split a :class:`TensorDataset` into train and test sets along time.
-
-    Parameters
-    ----------
-    ds : TensorDataset
-        Full dataset.
-    train_end : str, optional
-        Inclusive upper bound for training timestamps.
-    test_start : str, optional
-        Inclusive lower bound for test timestamps.  When *None* defaults to
-        the bar immediately after *train_end*.
-
-    Returns
-    -------
-    tuple of TensorDataset
-        ``(train, test)`` datasets.
-    """
-    ts = pd.to_datetime(ds.timestamps)
-
-    if train_end is not None:
-        train_mask = ts <= pd.Timestamp(train_end)
-    else:
-        # Default: first 80%
-        split_idx = int(len(ts) * 0.8)
-        train_mask = np.arange(len(ts)) < split_idx
-
-    if test_start is not None:
-        test_mask = ts >= pd.Timestamp(test_start)
-    else:
-        test_mask = ~train_mask
-
-    def _slice(mask):
-        idx = np.where(mask)[0]
-        # np arrays: index along axis 1 (time)
-        d = ds.data
-        t = ds.target
-        targets = ds.targets
-
-        # Handle different backends
-        if hasattr(d, "numpy"):
-            # torch tensor
-            d_slice = d[:, idx, :]
-            t_slice = t[:, idx] if t is not None else None
-        elif hasattr(d, "get"):
-            # cupy
-            d_slice = d[:, idx, :]
-            t_slice = t[:, idx] if t is not None else None
-        else:
-            # numpy
-            d_slice = d[:, idx, :]
-            t_slice = t[:, idx] if t is not None else None
-
-        return TensorDataset(
-            data=d_slice,
-            target=t_slice,
-            targets={
-                name: target[:, idx] if target is not None else None
-                for name, target in targets.items()
-            },
-            default_target=ds.default_target,
-            asset_ids=ds.asset_ids,
-            timestamps=ds.timestamps[idx],
-            feature_names=ds.feature_names,
-        )
-
-    return _slice(train_mask), _slice(test_mask)
-
-
-# ---------------------------------------------------------------------------
-# Asset subset sampling
-# ---------------------------------------------------------------------------
-
-def sample_assets(
-    ds: TensorDataset,
-    m: int,
-    seed: int = 42,
-) -> TensorDataset:
-    """Return a random subset of *m* assets from *ds*.
-
-    Parameters
-    ----------
-    ds : TensorDataset
-        Full dataset.
-    m : int
-        Number of assets to sample.
-    seed : int
-        Random seed for reproducibility.
-
-    Returns
-    -------
-    TensorDataset
-        Subset with *m* assets.
-    """
-    rng = np.random.default_rng(seed)
-    M = len(ds.asset_ids)
-    if m >= M:
-        logger.warning("Requested m=%d >= total assets %d; returning all", m, M)
-        return ds
-
-    idx = np.sort(rng.choice(M, size=m, replace=False))
-    d = ds.data
-    t = ds.target
-    targets = ds.targets
-
-    if hasattr(d, "numpy"):
-        d_sub = d[idx, :, :]
-        t_sub = t[idx, :] if t is not None else None
-    elif hasattr(d, "get"):
-        d_sub = d[idx, :, :]
-        t_sub = t[idx, :] if t is not None else None
-    else:
-        d_sub = d[idx, :, :]
-        t_sub = t[idx, :] if t is not None else None
-
-    return TensorDataset(
-        data=d_sub,
-        target=t_sub,
-        targets={
-            name: target[idx, :] if target is not None else None
-            for name, target in targets.items()
-        },
-        default_target=ds.default_target,
-        asset_ids=ds.asset_ids[idx],
-        timestamps=ds.timestamps,
-        feature_names=ds.feature_names,
-    )
-
-
-def build_pipeline(
-    df: pd.DataFrame,
-    config: Optional[TensorConfig] = None,
-) -> Union[TensorDataset, Tuple[TensorDataset, TensorDataset]]:
-    """End-to-end: compute target, build tensor, optionally split.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Preprocessed market data.
-    config : TensorConfig, optional
-        Configuration.
-
-    Returns
-    -------
-    TensorDataset or tuple
-        If ``config.train_end`` or ``config.test_start`` is set, returns
-        ``(train, test)``; otherwise the full dataset.
-    """
-    if config is None:
-        config = TensorConfig()
-
-    df = compute_target(df)
-    ds = build_tensor(df, config)
-
-    if config.train_end is not None or config.test_start is not None:
-        return temporal_split(ds, train_end=config.train_end, test_start=config.test_start)
-
-    return ds
diff --git a/src/factorminer/factorminer/evaluation/__init__.py b/src/factorminer/factorminer/evaluation/__init__.py
deleted file mode 100644
index 7bc9f69..0000000
--- a/src/factorminer/factorminer/evaluation/__init__.py
+++ /dev/null
@@ -1,169 +0,0 @@
-"""Multi-stage factor evaluation and validation pipeline."""
-
-from src.factorminer.factorminer.evaluation.admission import (
-    AdmissionDecision,
-    StockThresholds,
-    check_admission,
-    check_replacement,
-)
-from src.factorminer.factorminer.evaluation.correlation import (
-    IncrementalCorrelationMatrix,
-    batch_spearman_correlation,
-    batch_spearman_pairwise,
-    compute_correlation_batch,
-)
-from src.factorminer.factorminer.evaluation.metrics import (
-    compute_factor_stats,
-    compute_ic,
-    compute_ic_mean,
-    compute_ic_vectorized,
-    compute_ic_win_rate,
-    compute_icir,
-    compute_pairwise_correlation,
-    compute_quintile_returns,
-    compute_turnover,
-)
-from src.factorminer.factorminer.evaluation.pipeline import (
-    CandidateFactor,
-    EvaluationResult,
-    FactorLibraryView,
-    PipelineConfig,
-    ValidationPipeline,
-    run_evaluation_pipeline,
-)
-from src.factorminer.factorminer.evaluation.combination import FactorCombiner
-from src.factorminer.factorminer.evaluation.selection import FactorSelector
-from src.factorminer.factorminer.evaluation.portfolio import PortfolioBacktester
-from src.factorminer.factorminer.evaluation.backtest import (
-    SplitWindow,
-    DrawdownResult,
-    train_test_split,
-    rolling_splits,
-    compute_ic_series,
-    compute_rolling_ic,
-    compute_cumulative_ic,
-    compute_ic_stats,
-    factor_return_attribution,
-    compute_drawdown,
-    compute_sharpe_ratio,
-    compute_calmar_ratio,
-)
-from src.factorminer.factorminer.evaluation.regime import (
-    MarketRegime,
-    RegimeConfig,
-    RegimeClassification,
-    RegimeDetector,
-    RegimeICResult,
-    RegimeAwareEvaluator,
-)
-from src.factorminer.factorminer.evaluation.capacity import (
-    CapacityConfig,
-    CapacityEstimate,
-    CapacityEstimator,
-    MarketImpactEstimate,
-    MarketImpactModel,
-    NetCostResult,
-)
-from src.factorminer.factorminer.evaluation.causal import (
-    CausalConfig,
-    CausalTestResult,
-    CausalValidator,
-)
-from src.factorminer.factorminer.evaluation.significance import (
-    BootstrapCIResult,
-    BootstrapICTester,
-    DeflatedSharpeCalculator,
-    DeflatedSharpeResult,
-    FDRController,
-    FDRResult,
-    SignificanceConfig,
-    check_significance,
-)
-from src.factorminer.factorminer.evaluation.research import (
-    FactorGeometryDiagnostics,
-    FactorScoreVector,
-    build_score_vector,
-    compute_factor_geometry,
-    passes_research_admission,
-    run_research_model_suite,
-)
-
-__all__ = [
-    # metrics
-    "compute_ic",
-    "compute_ic_vectorized",
-    "compute_icir",
-    "compute_ic_mean",
-    "compute_ic_win_rate",
-    "compute_pairwise_correlation",
-    "compute_factor_stats",
-    "compute_quintile_returns",
-    "compute_turnover",
-    # correlation
-    "batch_spearman_correlation",
-    "batch_spearman_pairwise",
-    "compute_correlation_batch",
-    "IncrementalCorrelationMatrix",
-    # admission
-    "check_admission",
-    "check_replacement",
-    "AdmissionDecision",
-    "StockThresholds",
-    # pipeline
-    "CandidateFactor",
-    "EvaluationResult",
-    "FactorLibraryView",
-    "PipelineConfig",
-    "ValidationPipeline",
-    "run_evaluation_pipeline",
-    # combination / selection / backtest
-    "FactorCombiner",
-    "FactorSelector",
-    "PortfolioBacktester",
-    "SplitWindow",
-    "DrawdownResult",
-    "train_test_split",
-    "rolling_splits",
-    "compute_ic_series",
-    "compute_rolling_ic",
-    "compute_cumulative_ic",
-    "compute_ic_stats",
-    "factor_return_attribution",
-    "compute_drawdown",
-    "compute_sharpe_ratio",
-    "compute_calmar_ratio",
-    # regime
-    "MarketRegime",
-    "RegimeConfig",
-    "RegimeClassification",
-    "RegimeDetector",
-    "RegimeICResult",
-    "RegimeAwareEvaluator",
-    # capacity
-    "CapacityConfig",
-    "CapacityEstimate",
-    "CapacityEstimator",
-    "MarketImpactEstimate",
-    "MarketImpactModel",
-    "NetCostResult",
-    # causal
-    "CausalConfig",
-    "CausalTestResult",
-    "CausalValidator",
-    # significance
-    "BootstrapCIResult",
-    "BootstrapICTester",
-    "DeflatedSharpeCalculator",
-    "DeflatedSharpeResult",
-    "FDRController",
-    "FDRResult",
-    "SignificanceConfig",
-    "check_significance",
-    # research
-    "FactorGeometryDiagnostics",
-    "FactorScoreVector",
-    "compute_factor_geometry",
-    "build_score_vector",
-    "passes_research_admission",
-    "run_research_model_suite",
-]
diff --git a/src/factorminer/factorminer/evaluation/admission.py b/src/factorminer/factorminer/evaluation/admission.py
deleted file mode 100644
index 941de66..0000000
--- a/src/factorminer/factorminer/evaluation/admission.py
+++ /dev/null
@@ -1,221 +0,0 @@
-"""Admission rules for the factor library.
-
-Implements the decision logic for whether a candidate factor should be
-admitted to the library, replace an existing factor, or be rejected.
-
-Admission Rule (Eq. 10):
-    Admit alpha if |IC(alpha)| >= tau_IC  AND  max_{g in L} |rho(alpha, g)| < theta
-
-Replacement Rule (Eq. 11):
-    Replace g with alpha if:
-        |IC(alpha)| >= 0.10  AND
-        |IC(alpha)| >= 1.3 * |IC(g)|  AND
-        |{g in L : |rho(alpha, g)| >= theta}| == 1
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
-
-
-@dataclass
-class AdmissionDecision:
-    """Result of an admission check for a candidate factor."""
-
-    admitted: bool
-    replaced_factor_id: Optional[str] = None
-    rejection_reason: Optional[str] = None
-    max_correlation: float = 0.0
-    correlated_with: Optional[str] = None
-    decision_type: str = "rejected"  # "admitted", "replacement", "rejected"
-
-
-def check_admission(
-    ic_abs: float,
-    max_corr: float,
-    correlated_with: Optional[str],
-    ic_threshold: float = 0.04,
-    correlation_threshold: float = 0.5,
-) -> AdmissionDecision:
-    """Standard admission check (Eq. 10).
-
-    Parameters
-    ----------
-    ic_abs : float
-        Absolute IC of the candidate.
-    max_corr : float
-        Maximum absolute correlation with any library factor.
-    correlated_with : str or None
-        ID of the most correlated library factor.
-    ic_threshold : float
-        Minimum |IC| for admission (tau_IC).
-    correlation_threshold : float
-        Maximum allowed correlation (theta).
-
-    Returns
-    -------
-    AdmissionDecision
-    """
-    if ic_abs < ic_threshold:
-        return AdmissionDecision(
-            admitted=False,
-            rejection_reason=f"IC too low: |IC|={ic_abs:.4f} < {ic_threshold}",
-            max_correlation=max_corr,
-            correlated_with=correlated_with,
-            decision_type="rejected",
-        )
-
-    if max_corr >= correlation_threshold:
-        return AdmissionDecision(
-            admitted=False,
-            rejection_reason=(
-                f"Too correlated: max|rho|={max_corr:.4f} >= {correlation_threshold} "
-                f"(with {correlated_with})"
-            ),
-            max_correlation=max_corr,
-            correlated_with=correlated_with,
-            decision_type="rejected",
-        )
-
-    return AdmissionDecision(
-        admitted=True,
-        max_correlation=max_corr,
-        correlated_with=correlated_with,
-        decision_type="admitted",
-    )
-
-
-def check_replacement(
-    candidate_ic_abs: float,
-    max_corr: float,
-    correlated_with: Optional[str],
-    library_ic_map: Dict[str, float],
-    correlation_map: Dict[str, float],
-    replacement_ic_min: float = 0.10,
-    replacement_ic_ratio: float = 1.3,
-    correlation_threshold: float = 0.5,
-) -> AdmissionDecision:
-    """Replacement admission check (Eq. 11).
-
-    A candidate that failed the standard correlation check may still
-    replace an existing library factor if it is sufficiently stronger
-    and only conflicts with exactly one factor.
-
-    Parameters
-    ----------
-    candidate_ic_abs : float
-        Absolute IC of the candidate.
-    max_corr : float
-        Max absolute correlation with any library factor.
-    correlated_with : str or None
-        ID of the most correlated library factor.
-    library_ic_map : dict
-        Mapping from library factor ID to its absolute IC.
-    correlation_map : dict
-        Mapping from library factor ID to correlation with the candidate.
-    replacement_ic_min : float
-        Minimum |IC| for replacement consideration.
-    replacement_ic_ratio : float
-        Required ratio IC(candidate) / IC(existing).
-    correlation_threshold : float
-        Correlation threshold (theta) for determining conflicts.
-
-    Returns
-    -------
-    AdmissionDecision
-    """
-    # Must meet minimum IC for replacement
-    if candidate_ic_abs < replacement_ic_min:
-        return AdmissionDecision(
-            admitted=False,
-            rejection_reason=(
-                f"IC too low for replacement: |IC|={candidate_ic_abs:.4f} < {replacement_ic_min}"
-            ),
-            max_correlation=max_corr,
-            correlated_with=correlated_with,
-            decision_type="rejected",
-        )
-
-    # Find all factors above the correlation threshold
-    conflicting: List[str] = [
-        fid for fid, corr in correlation_map.items()
-        if abs(corr) >= correlation_threshold
-    ]
-
-    # Must conflict with exactly one factor
-    if len(conflicting) != 1:
-        return AdmissionDecision(
-            admitted=False,
-            rejection_reason=(
-                f"Replacement requires exactly 1 correlated factor, found {len(conflicting)}"
-            ),
-            max_correlation=max_corr,
-            correlated_with=correlated_with,
-            decision_type="rejected",
-        )
-
-    target_id = conflicting[0]
-    target_ic = library_ic_map.get(target_id, 0.0)
-
-    # Candidate must be sufficiently stronger
-    if target_ic > 0 and candidate_ic_abs < replacement_ic_ratio * target_ic:
-        return AdmissionDecision(
-            admitted=False,
-            rejection_reason=(
-                f"Not strong enough to replace {target_id}: "
-                f"|IC|={candidate_ic_abs:.4f} < {replacement_ic_ratio} * {target_ic:.4f}"
-            ),
-            max_correlation=max_corr,
-            correlated_with=correlated_with,
-            decision_type="rejected",
-        )
-
-    return AdmissionDecision(
-        admitted=True,
-        replaced_factor_id=target_id,
-        max_correlation=max_corr,
-        correlated_with=correlated_with,
-        decision_type="replacement",
-    )
-
-
-# ---------------------------------------------------------------------------
-# Stock-level thresholds (configurable)
-# ---------------------------------------------------------------------------
-
-@dataclass
-class StockThresholds:
-    """Default thresholds for A-share stock factor evaluation."""
-
-    ic_abs_min: float = 0.05
-    icir_abs_min: float = 0.5
-    ic_win_rate_min: float = 0.50
-    max_turnover: float = 0.8
-    min_monotonicity: float = 0.0
-
-    def passes(
-        self,
-        ic_abs: float,
-        icir_abs: float,
-        ic_win_rate: float = 1.0,
-        turnover: float = 0.0,
-        monotonicity: float = 1.0,
-    ) -> Tuple[bool, Optional[str]]:
-        """Check if a factor meets all stock-level thresholds.
-
-        Returns
-        -------
-        tuple of (passes, rejection_reason)
-        """
-        if ic_abs < self.ic_abs_min:
-            return False, f"|IC|={ic_abs:.4f} < {self.ic_abs_min}"
-        if icir_abs < self.icir_abs_min:
-            return False, f"|ICIR|={icir_abs:.4f} < {self.icir_abs_min}"
-        if ic_win_rate < self.ic_win_rate_min:
-            return False, f"IC win rate={ic_win_rate:.4f} < {self.ic_win_rate_min}"
-        if turnover > self.max_turnover:
-            return False, f"Turnover={turnover:.4f} > {self.max_turnover}"
-        if monotonicity < self.min_monotonicity:
-            return False, f"Monotonicity={monotonicity:.4f} < {self.min_monotonicity}"
-        return True, None
diff --git a/src/factorminer/factorminer/evaluation/backtest.py b/src/factorminer/factorminer/evaluation/backtest.py
deleted file mode 100644
index f889703..0000000
--- a/src/factorminer/factorminer/evaluation/backtest.py
+++ /dev/null
@@ -1,397 +0,0 @@
-"""Full backtesting utilities for factor evaluation.
-
-Provides time-series splitting, rolling and cumulative IC computation,
-factor return attribution, and drawdown analysis.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-from scipy.stats import spearmanr
-
-
-# ------------------------------------------------------------------
-# Time-series splitting
-# ------------------------------------------------------------------
-
-@dataclass
-class SplitWindow:
-    """Indices for a single train/test split."""
-    train_start: int
-    train_end: int
-    test_start: int
-    test_end: int
-
-
-def train_test_split(
-    T: int,
-    train_ratio: float = 0.7,
-) -> SplitWindow:
-    """Simple contiguous train/test split.
-
-    Parameters
-    ----------
-    T : int
-        Total number of time steps.
-    train_ratio : float
-        Fraction of data used for training (default 70%).
-
-    Returns
-    -------
-    SplitWindow
-    """
-    split = int(T * train_ratio)
-    return SplitWindow(
-        train_start=0,
-        train_end=split,
-        test_start=split,
-        test_end=T,
-    )
-
-
-def rolling_splits(
-    T: int,
-    train_window: int,
-    test_window: int,
-    step: int = 1,
-) -> List[SplitWindow]:
-    """Generate rolling-window train/test splits.
-
-    Parameters
-    ----------
-    T : int
-        Total number of time steps.
-    train_window : int
-        Size of training window.
-    test_window : int
-        Size of testing window.
-    step : int
-        Step size between consecutive windows.
-
-    Returns
-    -------
-    list of SplitWindow
-    """
-    splits: List[SplitWindow] = []
-    start = 0
-    while start + train_window + test_window <= T:
-        splits.append(SplitWindow(
-            train_start=start,
-            train_end=start + train_window,
-            test_start=start + train_window,
-            test_end=start + train_window + test_window,
-        ))
-        start += step
-    return splits
-
-
-# ------------------------------------------------------------------
-# IC computation
-# ------------------------------------------------------------------
-
-def compute_ic_series(
-    signal: np.ndarray,
-    returns: np.ndarray,
-) -> np.ndarray:
-    """Compute cross-sectional Spearman IC at each time step.
-
-    Parameters
-    ----------
-    signal : ndarray of shape (T, N)
-        Factor signal values.
-    returns : ndarray of shape (T, N)
-        Forward returns.
-
-    Returns
-    -------
-    ndarray of shape (T,)
-        IC values; NaN where computation is not possible.
-    """
-    T = signal.shape[0]
-    ics = np.full(T, np.nan)
-    for t in range(T):
-        x = signal[t]
-        y = returns[t]
-        valid = np.isfinite(x) & np.isfinite(y)
-        if valid.sum() < 5:
-            continue
-        corr, _ = spearmanr(x[valid], y[valid])
-        if np.isfinite(corr):
-            ics[t] = corr
-    return ics
-
-
-def compute_rolling_ic(
-    signal: np.ndarray,
-    returns: np.ndarray,
-    window: int = 20,
-) -> np.ndarray:
-    """Compute rolling-window average IC.
-
-    Parameters
-    ----------
-    signal : ndarray of shape (T, N)
-    returns : ndarray of shape (T, N)
-    window : int
-        Rolling window size.
-
-    Returns
-    -------
-    ndarray of shape (T,)
-        Rolling mean IC; NaN where window is insufficient.
-    """
-    ic_series = compute_ic_series(signal, returns)
-    T = len(ic_series)
-    rolling_ic = np.full(T, np.nan)
-    for t in range(window - 1, T):
-        window_ics = ic_series[t - window + 1: t + 1]
-        finite = window_ics[np.isfinite(window_ics)]
-        if len(finite) >= 1:
-            rolling_ic[t] = float(np.mean(finite))
-    return rolling_ic
-
-
-def compute_cumulative_ic(
-    signal: np.ndarray,
-    returns: np.ndarray,
-) -> np.ndarray:
-    """Compute cumulative (expanding-window) mean IC.
-
-    Parameters
-    ----------
-    signal : ndarray of shape (T, N)
-    returns : ndarray of shape (T, N)
-
-    Returns
-    -------
-    ndarray of shape (T,)
-        Expanding-window mean IC.
-    """
-    ic_series = compute_ic_series(signal, returns)
-    T = len(ic_series)
-    cumulative = np.full(T, np.nan)
-    running_sum = 0.0
-    running_count = 0
-    for t in range(T):
-        if np.isfinite(ic_series[t]):
-            running_sum += ic_series[t]
-            running_count += 1
-        if running_count > 0:
-            cumulative[t] = running_sum / running_count
-    return cumulative
-
-
-def compute_ic_stats(ic_series: np.ndarray) -> dict:
-    """Compute summary statistics for an IC series.
-
-    Parameters
-    ----------
-    ic_series : ndarray of shape (T,)
-
-    Returns
-    -------
-    dict with keys: ic_mean, ic_std, icir, ic_win_rate, ic_max, ic_min.
-    """
-    finite = ic_series[np.isfinite(ic_series)]
-    if len(finite) < 2:
-        return {
-            "ic_mean": 0.0,
-            "ic_std": 0.0,
-            "icir": 0.0,
-            "ic_win_rate": 0.0,
-            "ic_max": 0.0,
-            "ic_min": 0.0,
-        }
-    ic_mean = float(np.mean(finite))
-    ic_std = float(np.std(finite, ddof=1))
-    return {
-        "ic_mean": ic_mean,
-        "ic_std": ic_std,
-        "icir": ic_mean / ic_std if ic_std > 1e-12 else 0.0,
-        "ic_win_rate": float(np.mean(finite > 0)),
-        "ic_max": float(np.max(finite)),
-        "ic_min": float(np.min(finite)),
-    }
-
-
-# ------------------------------------------------------------------
-# Factor return attribution
-# ------------------------------------------------------------------
-
-def factor_return_attribution(
-    factor_signals: Dict[int, np.ndarray],
-    returns: np.ndarray,
-) -> Dict[int, dict]:
-    """Attribute portfolio returns to individual factors.
-
-    For each factor, computes the IC series, ICIR, and the mean return of
-    the top-quintile (Q5) minus bottom-quintile (Q1) long-short portfolio.
-
-    Parameters
-    ----------
-    factor_signals : dict[int, ndarray]
-        Mapping from factor ID to (T, N) signal array.
-    returns : ndarray of shape (T, N)
-
-    Returns
-    -------
-    dict mapping factor_id -> attribution dict with keys:
-        ic_mean, icir, ic_win_rate, ls_return
-    """
-    results: Dict[int, dict] = {}
-    for fid, signal in factor_signals.items():
-        ic_series = compute_ic_series(signal, returns)
-        stats = compute_ic_stats(ic_series)
-
-        # Compute long-short return
-        T, N = signal.shape
-        ls_returns = np.full(T, np.nan)
-        for t in range(T):
-            sig_t = signal[t]
-            ret_t = returns[t]
-            valid = np.isfinite(sig_t) & np.isfinite(ret_t)
-            n_valid = valid.sum()
-            if n_valid < 5:
-                continue
-            valid_sigs = sig_t[valid]
-            valid_rets = ret_t[valid]
-            k = max(1, n_valid // 5)
-            sorted_idx = np.argsort(valid_sigs)
-            q1_ret = np.mean(valid_rets[sorted_idx[:k]])
-            q5_ret = np.mean(valid_rets[sorted_idx[-k:]])
-            ls_returns[t] = q5_ret - q1_ret
-
-        stats["ls_return"] = float(np.nanmean(ls_returns))
-        results[fid] = stats
-    return results
-
-
-# ------------------------------------------------------------------
-# Drawdown analysis
-# ------------------------------------------------------------------
-
-@dataclass
-class DrawdownResult:
-    """Results of drawdown analysis."""
-    max_drawdown: float
-    max_drawdown_start: int
-    max_drawdown_end: int
-    drawdown_series: np.ndarray
-    recovery_periods: List[Tuple[int, int, int]]  # (start, trough, end)
-
-
-def compute_drawdown(cumulative_returns: np.ndarray) -> DrawdownResult:
-    """Compute drawdown statistics from a cumulative return series.
-
-    Parameters
-    ----------
-    cumulative_returns : ndarray of shape (T,)
-        Cumulative returns (can be from cumsum of period returns).
-
-    Returns
-    -------
-    DrawdownResult
-    """
-    cumulative_returns = np.asarray(cumulative_returns, dtype=np.float64)
-    T = len(cumulative_returns)
-
-    # Running maximum
-    running_max = np.maximum.accumulate(cumulative_returns)
-    drawdown_series = cumulative_returns - running_max
-
-    # Max drawdown
-    max_dd_idx = np.argmin(drawdown_series)
-    max_dd = float(drawdown_series[max_dd_idx])
-    # Find the peak before the max drawdown
-    peak_idx = int(np.argmax(cumulative_returns[:max_dd_idx + 1]))
-
-    # Identify recovery periods (peak -> trough -> recovery)
-    recovery_periods: List[Tuple[int, int, int]] = []
-    i = 0
-    while i < T:
-        # Find start of drawdown (where dd becomes negative)
-        if drawdown_series[i] < -1e-12:
-            start = i - 1 if i > 0 else 0
-            # Find trough
-            j = i
-            trough = i
-            while j < T and drawdown_series[j] < -1e-12:
-                if drawdown_series[j] < drawdown_series[trough]:
-                    trough = j
-                j += 1
-            end = j if j < T else T - 1
-            recovery_periods.append((start, trough, end))
-            i = j
-        else:
-            i += 1
-
-    return DrawdownResult(
-        max_drawdown=max_dd,
-        max_drawdown_start=peak_idx,
-        max_drawdown_end=max_dd_idx,
-        drawdown_series=drawdown_series,
-        recovery_periods=recovery_periods,
-    )
-
-
-def compute_sharpe_ratio(
-    returns_series: np.ndarray,
-    annualization_factor: float = 252.0,
-    risk_free_rate: float = 0.0,
-) -> float:
-    """Compute annualized Sharpe ratio.
-
-    Parameters
-    ----------
-    returns_series : ndarray of shape (T,)
-        Period returns.
-    annualization_factor : float
-        Number of periods per year (252 for daily).
-    risk_free_rate : float
-        Annualized risk-free rate.
-
-    Returns
-    -------
-    float
-        Annualized Sharpe ratio.
-    """
-    finite = returns_series[np.isfinite(returns_series)]
-    if len(finite) < 2:
-        return 0.0
-    rf_period = risk_free_rate / annualization_factor
-    excess = finite - rf_period
-    mean_excess = np.mean(excess)
-    std_excess = np.std(excess, ddof=1)
-    if std_excess < 1e-12:
-        return 0.0
-    return float(mean_excess / std_excess * np.sqrt(annualization_factor))
-
-
-def compute_calmar_ratio(
-    returns_series: np.ndarray,
-    annualization_factor: float = 252.0,
-) -> float:
-    """Compute Calmar ratio (annualized return / max drawdown).
-
-    Parameters
-    ----------
-    returns_series : ndarray of shape (T,)
-    annualization_factor : float
-
-    Returns
-    -------
-    float
-        Calmar ratio; 0 if max drawdown is zero.
-    """
-    finite = returns_series[np.isfinite(returns_series)]
-    if len(finite) < 2:
-        return 0.0
-    cumulative = np.cumsum(finite)
-    dd = compute_drawdown(cumulative)
-    if abs(dd.max_drawdown) < 1e-12:
-        return 0.0
-    annualized_return = float(np.mean(finite)) * annualization_factor
-    return annualized_return / abs(dd.max_drawdown)
diff --git a/src/factorminer/factorminer/evaluation/capacity.py b/src/factorminer/factorminer/evaluation/capacity.py
deleted file mode 100644
index 2047515..0000000
--- a/src/factorminer/factorminer/evaluation/capacity.py
+++ /dev/null
@@ -1,449 +0,0 @@
-"""Capacity-aware backtesting for alpha factors.
-
-Estimates market impact via a square-root model, evaluates net-of-cost
-IC / ICIR, and determines the maximum capital that a factor can absorb
-before its alpha degrades beyond acceptable limits.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional
-
-import numpy as np
-
-from src.factorminer.factorminer.evaluation.metrics import compute_ic, compute_icir
-
-
-# ---------------------------------------------------------------------------
-# Configuration
-# ---------------------------------------------------------------------------
-
-@dataclass
-class CapacityConfig:
-    """Configuration for capacity-aware backtesting.
-
-    Parameters
-    ----------
-    enabled : bool
-        Whether capacity estimation is active.
-    base_capital_usd : float
-        Default capital level used when none is specified explicitly.
-    capacity_levels : list[float]
-        Dollar capital levels to sweep when building the capacity curve.
-    ic_degradation_limit : float
-        Maximum fractional IC degradation (1 - |net_IC|/|gross_IC|) before
-        the factor is considered capacity-constrained.
-    net_icir_threshold : float
-        Minimum net ICIR for a factor to pass the cost-adjusted screen.
-    sigma_annual : float
-        Annualised volatility used by the square-root impact model.
-    participation_limit : float
-        Hard cap on the participation rate per asset (fraction of bar volume).
-    top_fraction : float
-        Fraction of the asset universe in the long (and short) leg.
-    trading_days_per_year : float
-        Number of trading days per calendar year.
-    bars_per_day : float
-        Number of bars (signal periods) per trading day. Default 24 assumes
-        10-minute bars over a 4-hour trading session.
-    """
-
-    enabled: bool = True
-    base_capital_usd: float = 1e8
-    capacity_levels: List[float] = field(
-        default_factory=lambda: [1e7, 5e7, 1e8, 5e8, 1e9]
-    )
-    ic_degradation_limit: float = 0.20
-    net_icir_threshold: float = 0.3
-    sigma_annual: float = 0.25
-    participation_limit: float = 0.10
-    top_fraction: float = 0.20
-    trading_days_per_year: float = 252.0
-    bars_per_day: float = 24.0
-
-
-# ---------------------------------------------------------------------------
-# Result containers
-# ---------------------------------------------------------------------------
-
-@dataclass
-class MarketImpactEstimate:
-    """Per-bar market impact estimate across the evaluation window.
-
-    Attributes
-    ----------
-    impact_bps : np.ndarray, shape (T,)
-        Estimated one-way market impact in basis points per bar.
-    participation_rate : np.ndarray, shape (T,)
-        Mean participation rate (fraction of bar volume) per bar.
-    avg_impact_bps : float
-        Time-averaged impact in basis points.
-    max_impact_bps : float
-        Maximum single-bar impact in basis points.
-    """
-
-    impact_bps: np.ndarray
-    participation_rate: np.ndarray
-    avg_impact_bps: float
-    max_impact_bps: float
-
-
-@dataclass
-class CapacityEstimate:
-    """Result of a capacity sweep for a single factor.
-
-    Attributes
-    ----------
-    factor_name : str
-        Identifier of the evaluated factor.
-    max_capacity_usd : float
-        Interpolated maximum capital (USD) before the IC degradation limit
-        is breached.  ``np.inf`` if no level breaches the limit.
-    capacity_curve : dict[float, float]
-        Mapping from capital level (USD) to IC degradation fraction.
-    break_even_cost_bps : float
-        Approximate single-leg cost (bps) at which net IC drops to zero.
-    """
-
-    factor_name: str
-    max_capacity_usd: float
-    capacity_curve: Dict[float, float]
-    break_even_cost_bps: float
-
-
-@dataclass
-class NetCostResult:
-    """Net-of-cost evaluation at a specific capital level.
-
-    Attributes
-    ----------
-    factor_name : str
-        Identifier of the evaluated factor.
-    gross_icir : float
-        ICIR computed on unadjusted returns.
-    net_icir : float
-        ICIR computed on impact-adjusted returns.
-    gross_ls_return : float
-        Mean gross long-short return per bar.
-    net_ls_return : float
-        Mean net long-short return per bar (gross minus round-trip impact).
-    estimated_capacity_usd : float
-        Capital level at which the evaluation was performed.
-    impact_estimate : MarketImpactEstimate
-        Detailed impact statistics.
-    passes_net_threshold : bool
-        ``True`` if ``net_icir >= config.net_icir_threshold``.
-    """
-
-    factor_name: str
-    gross_icir: float
-    net_icir: float
-    gross_ls_return: float
-    net_ls_return: float
-    estimated_capacity_usd: float
-    impact_estimate: MarketImpactEstimate
-    passes_net_threshold: bool
-
-
-# ---------------------------------------------------------------------------
-# Square-root market impact model
-# ---------------------------------------------------------------------------
-
-class MarketImpactModel:
-    """Square-root market impact model.
-
-    The model estimates single-leg impact as::
-
-        impact = sigma_bar * sqrt(participation_rate)
-
-    where ``sigma_bar`` is the per-bar volatility derived from the annualised
-    volatility, and the participation rate is the fraction of bar volume
-    consumed by the strategy.
-    """
-
-    def __init__(self, config: CapacityConfig | None = None) -> None:
-        self.config = config or CapacityConfig()
-        self._sigma_bar: float = self.config.sigma_annual / np.sqrt(
-            self.config.trading_days_per_year * self.config.bars_per_day
-        )
-
-    # ------------------------------------------------------------------
-    def estimate_impact(
-        self,
-        signals: np.ndarray,
-        volume: np.ndarray,
-        capital: float,
-    ) -> MarketImpactEstimate:
-        """Estimate per-bar market impact for a given capital deployment.
-
-        Parameters
-        ----------
-        signals : np.ndarray, shape (M, T)
-            Factor signal matrix (used to identify quintile membership).
-        volume : np.ndarray, shape (M, T)
-            Dollar volume per asset per bar.  Entries <= 0 are treated as
-            illiquid and assigned the participation limit.
-        capital : float
-            Total capital (USD) deployed by the strategy.
-
-        Returns
-        -------
-        MarketImpactEstimate
-        """
-        M, T = signals.shape
-        cfg = self.config
-
-        n_leg = max(int(M * cfg.top_fraction), 1)
-        per_asset_capital = capital / n_leg
-
-        participation = np.full(T, np.nan, dtype=np.float64)
-
-        for t in range(T):
-            sig_t = signals[:, t]
-            vol_t = volume[:, t]
-
-            valid_sig = ~np.isnan(sig_t)
-            if valid_sig.sum() < n_leg:
-                participation[t] = cfg.participation_limit
-                continue
-
-            # Identify top and bottom quintile assets
-            sig_filled = np.where(valid_sig, sig_t, -np.inf)
-            top_idx = np.argpartition(sig_filled, -n_leg)[-n_leg:]
-
-            # Participation rate for each selected asset
-            rates = np.empty(n_leg, dtype=np.float64)
-            for i, idx in enumerate(top_idx):
-                v = vol_t[idx]
-                if np.isnan(v) or v <= 0:
-                    rates[i] = cfg.participation_limit
-                else:
-                    rates[i] = min(per_asset_capital / v, cfg.participation_limit)
-
-            participation[t] = float(np.mean(rates))
-
-        # Impact in natural units, then convert to bps
-        impact = self._sigma_bar * np.sqrt(participation)
-        impact_bps = impact * 1e4
-
-        avg_impact = float(np.nanmean(impact_bps))
-        max_impact = float(np.nanmax(impact_bps))
-
-        return MarketImpactEstimate(
-            impact_bps=impact_bps,
-            participation_rate=participation,
-            avg_impact_bps=avg_impact,
-            max_impact_bps=max_impact,
-        )
-
-
-# ---------------------------------------------------------------------------
-# Capacity estimator
-# ---------------------------------------------------------------------------
-
-class CapacityEstimator:
-    """Evaluate factor capacity and net-of-cost performance.
-
-    Parameters
-    ----------
-    returns : np.ndarray, shape (M, T)
-        Forward returns for M assets over T bars.
-    volume : np.ndarray, shape (M, T)
-        Dollar volume for M assets over T bars.
-    config : CapacityConfig, optional
-        Configuration; uses defaults when omitted.
-    """
-
-    def __init__(
-        self,
-        returns: np.ndarray,
-        volume: np.ndarray,
-        config: CapacityConfig | None = None,
-    ) -> None:
-        self.returns = returns
-        self.volume = volume
-        self.config = config or CapacityConfig()
-        self._impact_model = MarketImpactModel(self.config)
-
-    # ------------------------------------------------------------------
-    # helpers
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _mean_ic(ic_series: np.ndarray) -> float:
-        """Mean IC ignoring NaN."""
-        valid = ic_series[~np.isnan(ic_series)]
-        return float(np.mean(valid)) if len(valid) > 0 else 0.0
-
-    def _net_returns(
-        self,
-        signals: np.ndarray,
-        impact_bps: np.ndarray,
-    ) -> np.ndarray:
-        """Compute impact-adjusted returns.
-
-        For a long-short strategy the round-trip cost is approximately
-        ``2 * impact`` (entry + exit on each leg).  We subtract the cost
-        uniformly from returns as a simple first-order approximation.
-
-        Parameters
-        ----------
-        signals : np.ndarray, shape (M, T)
-            Factor signals (unused beyond shape; cost applied uniformly).
-        impact_bps : np.ndarray, shape (T,)
-            One-way impact per bar in basis points.
-
-        Returns
-        -------
-        np.ndarray, shape (M, T)
-            Adjusted returns matrix.
-        """
-        cost = 2.0 * impact_bps / 1e4  # round-trip, fractional
-        return self.returns - cost[np.newaxis, :]
-
-    # ------------------------------------------------------------------
-    # public API
-    # ------------------------------------------------------------------
-
-    def estimate(
-        self,
-        factor_name: str,
-        signals: np.ndarray,
-    ) -> CapacityEstimate:
-        """Run a capacity sweep across configured capital levels.
-
-        Parameters
-        ----------
-        factor_name : str
-            Human-readable factor identifier.
-        signals : np.ndarray, shape (M, T)
-            Factor signal matrix.
-
-        Returns
-        -------
-        CapacityEstimate
-        """
-        gross_ic = compute_ic(signals, self.returns)
-        abs_gross_mean = abs(self._mean_ic(gross_ic))
-
-        curve: Dict[float, float] = {}
-        degradations: List[float] = []
-        capitals: List[float] = []
-
-        for cap in self.config.capacity_levels:
-            impact = self._impact_model.estimate_impact(signals, self.volume, cap)
-            net_ret = self._net_returns(signals, impact.impact_bps)
-            net_ic = compute_ic(signals, net_ret)
-            abs_net_mean = abs(self._mean_ic(net_ic))
-
-            if abs_gross_mean > 1e-12:
-                deg = 1.0 - abs_net_mean / abs_gross_mean
-            else:
-                deg = 0.0
-
-            curve[cap] = deg
-            capitals.append(cap)
-            degradations.append(deg)
-
-        # Interpolate to find capacity at the degradation limit
-        max_cap = self._interpolate_capacity(
-            capitals, degradations, self.config.ic_degradation_limit
-        )
-
-        # Break-even cost: gross IC expressed in bps
-        # If the full round-trip cost equals the gross L-S spread the alpha
-        # vanishes.  Approximate as gross_mean_ic * 10000 (IC ~ return spread).
-        break_even_bps = abs_gross_mean * 1e4
-
-        return CapacityEstimate(
-            factor_name=factor_name,
-            max_capacity_usd=max_cap,
-            capacity_curve=curve,
-            break_even_cost_bps=break_even_bps,
-        )
-
-    def net_cost_evaluation(
-        self,
-        factor_name: str,
-        signals: np.ndarray,
-        capital: Optional[float] = None,
-    ) -> NetCostResult:
-        """Evaluate a factor net of estimated market impact.
-
-        Parameters
-        ----------
-        factor_name : str
-            Factor identifier.
-        signals : np.ndarray, shape (M, T)
-            Factor signal matrix.
-        capital : float, optional
-            Capital to evaluate at; defaults to ``config.base_capital_usd``.
-
-        Returns
-        -------
-        NetCostResult
-        """
-        cap = capital if capital is not None else self.config.base_capital_usd
-
-        # Gross metrics
-        gross_ic = compute_ic(signals, self.returns)
-        gross_icir = compute_icir(gross_ic)
-
-        # Impact
-        impact = self._impact_model.estimate_impact(signals, self.volume, cap)
-
-        # Net metrics
-        net_ret = self._net_returns(signals, impact.impact_bps)
-        net_ic = compute_ic(signals, net_ret)
-        net_icir = compute_icir(net_ic)
-
-        # Gross / net long-short return (mean across time of Q5-Q1 proxy)
-        gross_ls = float(np.nanmean(self.returns.mean(axis=0)))
-        # Simplified: subtract round-trip impact from L-S return
-        net_ls = gross_ls - 2.0 * impact.avg_impact_bps / 1e4
-
-        return NetCostResult(
-            factor_name=factor_name,
-            gross_icir=gross_icir,
-            net_icir=net_icir,
-            gross_ls_return=gross_ls,
-            net_ls_return=net_ls,
-            estimated_capacity_usd=cap,
-            impact_estimate=impact,
-            passes_net_threshold=net_icir >= self.config.net_icir_threshold,
-        )
-
-    # ------------------------------------------------------------------
-    # internal
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _interpolate_capacity(
-        capitals: List[float],
-        degradations: List[float],
-        limit: float,
-    ) -> float:
-        """Linearly interpolate the capital at which degradation hits *limit*.
-
-        Returns ``np.inf`` if all tested levels are below the limit, or the
-        smallest tested level if even that exceeds the limit.
-        """
-        if not capitals:
-            return 0.0
-
-        # Find first crossing
-        for i in range(len(degradations)):
-            if degradations[i] >= limit:
-                if i == 0:
-                    return capitals[0]
-                # Linear interpolation between [i-1] and [i]
-                d0, d1 = degradations[i - 1], degradations[i]
-                c0, c1 = capitals[i - 1], capitals[i]
-                if abs(d1 - d0) < 1e-12:
-                    return c0
-                frac = (limit - d0) / (d1 - d0)
-                return c0 + frac * (c1 - c0)
-
-        # Never breached the limit
-        return float("inf")
diff --git a/src/factorminer/factorminer/evaluation/causal.py b/src/factorminer/factorminer/evaluation/causal.py
deleted file mode 100644
index 32d1715..0000000
--- a/src/factorminer/factorminer/evaluation/causal.py
+++ /dev/null
@@ -1,580 +0,0 @@
-"""Causal validation layer for alpha factor candidates.
-
-Provides Granger causality testing and intervention-based robustness
-analysis to verify that discovered factors have genuine predictive
-relationships with forward returns rather than spurious correlations.
-
-Two complementary tests are combined into a single robustness score:
-
-1. **Granger causality**: Does the factor signal Granger-cause returns
-   after controlling for existing library factors?
-2. **Intervention robustness**: Does factor IC remain stable under
-   realistic data perturbations (volume shocks, volatility shocks,
-   liquidity droughts)?
-"""
-
-from __future__ import annotations
-
-import logging
-import warnings
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-
-from src.factorminer.factorminer.evaluation.metrics import compute_ic
-
-logger = logging.getLogger(__name__)
-
-# ---------------------------------------------------------------------------
-# Configuration
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class CausalConfig:
-    """Configuration for causal validation tests."""
-
-    enabled: bool = True
-
-    # Granger causality settings
-    granger_max_lag: int = 5
-    granger_significance: float = 0.05
-
-    # Intervention test settings
-    n_interventions: int = 3
-    intervention_magnitude: float = 2.0
-    intervention_ic_threshold: float = 0.5  # min IC ratio under intervention
-
-    # Combined robustness scoring
-    robustness_threshold: float = 0.4  # min combined score for admission
-    granger_weight: float = 0.4
-    intervention_weight: float = 0.6
-
-    seed: int = 42
-
-
-# ---------------------------------------------------------------------------
-# Result container
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class CausalTestResult:
-    """Result of causal validation for a single factor."""
-
-    factor_name: str
-
-    # Granger test results
-    granger_p_value: float
-    granger_f_stat: float
-    granger_passes: bool
-
-    # Intervention test results
-    intervention_ic_ratio: float
-    intervention_passes: bool
-
-    # Combined
-    robustness_score: float  # 0-1
-    passes: bool
-
-    details: Dict[str, Any] = field(default_factory=dict)
-
-
-# ---------------------------------------------------------------------------
-# Validator
-# ---------------------------------------------------------------------------
-
-
-class CausalValidator:
-    """Validates causal relationships between factor signals and returns.
-
-    Parameters
-    ----------
-    returns : np.ndarray, shape (M, T)
-        Forward returns for M assets over T periods.
-    data_tensor : np.ndarray or None, shape (M, T, F)
-        Optional raw feature tensor used for realistic intervention
-        perturbations.  When ``None``, a noise-based fallback is used.
-    library_signals : dict
-        Mapping from factor name to its signal array (M, T).  Used as
-        controls in the Granger test.
-    config : CausalConfig
-        Configuration parameters.
-    """
-
-    def __init__(
-        self,
-        returns: np.ndarray,
-        data_tensor: Optional[np.ndarray],
-        library_signals: Dict[str, np.ndarray],
-        config: CausalConfig | None = None,
-    ) -> None:
-        self.returns = returns
-        self.data_tensor = data_tensor
-        self.library_signals = library_signals
-        self.config = config or CausalConfig()
-        self._rng = np.random.RandomState(self.config.seed)
-
-    # ------------------------------------------------------------------
-    # Public API
-    # ------------------------------------------------------------------
-
-    def validate(self, factor_name: str, signals: np.ndarray) -> CausalTestResult:
-        """Run causal validation on a single factor.
-
-        Parameters
-        ----------
-        factor_name : str
-            Human-readable identifier for logging / result tracking.
-        signals : np.ndarray, shape (M, T)
-            Factor signal matrix.
-
-        Returns
-        -------
-        CausalTestResult
-        """
-        cfg = self.config
-        details: Dict[str, Any] = {}
-        control_library = {
-            name: lib_signals
-            for name, lib_signals in self.library_signals.items()
-            if name != factor_name
-        }
-
-        # --- Granger ---
-        g_p, g_f, g_pass = self._granger_test(
-            signals, self.returns, control_library
-        )
-        details["granger"] = {
-            "p_value": g_p,
-            "f_stat": g_f,
-            "passes": g_pass,
-        }
-
-        # --- Intervention ---
-        i_ratio, i_pass = self._intervention_test(
-            signals, self.returns, self.data_tensor
-        )
-        details["intervention"] = {
-            "ic_ratio": i_ratio,
-            "passes": i_pass,
-        }
-
-        # --- Combined score ---
-        score = self._compute_robustness_score(g_pass, g_p, i_ratio, i_pass)
-        passes = score >= cfg.robustness_threshold
-        details["robustness_score"] = score
-
-        return CausalTestResult(
-            factor_name=factor_name,
-            granger_p_value=g_p,
-            granger_f_stat=g_f,
-            granger_passes=g_pass,
-            intervention_ic_ratio=i_ratio,
-            intervention_passes=i_pass,
-            robustness_score=score,
-            passes=passes,
-            details=details,
-        )
-
-    def validate_batch(
-        self, candidates: List[Tuple[str, np.ndarray]]
-    ) -> Dict[str, CausalTestResult]:
-        """Validate a batch of candidate factors.
-
-        Parameters
-        ----------
-        candidates : list of (name, signals) tuples
-            Each entry is ``(factor_name, signals_array)`` with signals
-            shaped ``(M, T)``.
-
-        Returns
-        -------
-        dict
-            Mapping from factor name to its :class:`CausalTestResult`.
-        """
-        results: Dict[str, CausalTestResult] = {}
-        for name, signals in candidates:
-            results[name] = self.validate(name, signals)
-        return results
-
-    # ------------------------------------------------------------------
-    # Granger causality test
-    # ------------------------------------------------------------------
-
-    def _granger_test(
-        self,
-        signals: np.ndarray,
-        returns: np.ndarray,
-        library_signals: Dict[str, np.ndarray],
-    ) -> Tuple[float, float, bool]:
-        """Granger causality test for factor -> returns.
-
-        Averages signals and returns across the top-20 assets (by signal
-        magnitude) to produce T-length time series, then applies
-        statsmodels Granger tests.
-
-        Returns ``(p_value, f_stat, passes)``.
-        """
-        cfg = self.config
-        M, T = signals.shape
-
-        # Minimum series length guard
-        min_length = 2 * cfg.granger_max_lag + 1
-        if T < min_length:
-            logger.warning(
-                "Time series too short for Granger test "
-                "(T=%d < %d). Passing by default.", T, min_length,
-            )
-            return 1.0, 0.0, True
-
-        # --- Aggregate to T-length series ---
-        sig_series = self._aggregate_top_assets(signals, top_k=20)
-        ret_series = self._aggregate_top_assets(returns, top_k=20)
-
-        # Handle constant or all-NaN series
-        if self._is_degenerate(sig_series) or self._is_degenerate(ret_series):
-            logger.warning(
-                "Degenerate series detected in Granger test. Passing by default."
-            )
-            return 1.0, 0.0, True
-
-        # --- Attempt statsmodels-based Granger test ---
-        try:
-            from statsmodels.tsa.stattools import grangercausalitytests  # noqa: F811
-
-            p_value, f_stat = self._run_granger_bivariate(
-                sig_series, ret_series, cfg.granger_max_lag
-            )
-
-            # Multivariate extension if library has enough factors
-            if len(library_signals) > 10:
-                p_multi, f_multi = self._run_granger_multivariate(
-                    sig_series, ret_series, library_signals, cfg.granger_max_lag
-                )
-                # Take the more conservative (higher) p-value
-                if p_multi is not None:
-                    p_value = max(p_value, p_multi)
-                    f_stat = min(f_stat, f_multi)
-
-            passes = p_value < cfg.granger_significance
-            return float(p_value), float(f_stat), bool(passes)
-
-        except ImportError:
-            logger.warning(
-                "statsmodels not available; skipping Granger test. "
-                "Install statsmodels for causal validation."
-            )
-            return 1.0, 0.0, True
-        except Exception as exc:
-            logger.warning("Granger test failed: %s. Passing by default.", exc)
-            return 1.0, 0.0, True
-
-    def _run_granger_bivariate(
-        self,
-        sig_series: np.ndarray,
-        ret_series: np.ndarray,
-        max_lag: int,
-    ) -> Tuple[float, float]:
-        """Bivariate Granger test using statsmodels."""
-        from statsmodels.tsa.stattools import grangercausalitytests
-
-        # Stack as (T, 2): [returns, signals] -- statsmodels convention
-        # tests if column 1 (signals) Granger-causes column 0 (returns)
-        data = np.column_stack([ret_series, sig_series])
-
-        # Clamp max_lag to available data
-        effective_lag = min(max_lag, len(data) // 3)
-        if effective_lag < 1:
-            return 1.0, 0.0
-
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            results = grangercausalitytests(data, maxlag=effective_lag, verbose=False)
-
-        # Find the lag with the smallest p-value (ssr_ftest)
-        best_p = 1.0
-        best_f = 0.0
-        for lag in range(1, effective_lag + 1):
-            if lag not in results:
-                continue
-            test_dict = results[lag][0]
-            f_test = test_dict.get("ssr_ftest")
-            if f_test is not None:
-                p_val = f_test[1]
-                f_val = f_test[0]
-                if p_val < best_p:
-                    best_p = p_val
-                    best_f = f_val
-
-        return float(best_p), float(best_f)
-
-    def _run_granger_multivariate(
-        self,
-        sig_series: np.ndarray,
-        ret_series: np.ndarray,
-        library_signals: Dict[str, np.ndarray],
-        max_lag: int,
-    ) -> Tuple[Optional[float], Optional[float]]:
-        """Multivariate Granger via VAR, controlling for library factors.
-
-        If the library has >10 factors the controls are PCA-reduced to
-        5 components.
-        """
-        try:
-            from statsmodels.tsa.api import VAR
-
-            # Build control matrix: average each library factor across top assets
-            control_series = []
-            for _name, lib_sig in library_signals.items():
-                cs = self._aggregate_top_assets(lib_sig, top_k=20)
-                if not self._is_degenerate(cs):
-                    control_series.append(cs)
-
-            if not control_series:
-                return None, None
-
-            controls = np.column_stack(control_series)
-
-            # PCA reduction if too many controls
-            if controls.shape[1] > 10:
-                controls = self._pca_reduce(controls, n_components=5)
-
-            # Build VAR dataset: [returns, signals, controls...]
-            var_data = np.column_stack([ret_series, sig_series, controls])
-
-            # Drop rows with NaN
-            valid_mask = ~np.any(np.isnan(var_data), axis=1)
-            var_data = var_data[valid_mask]
-
-            effective_lag = min(max_lag, len(var_data) // (3 * var_data.shape[1]))
-            if effective_lag < 1:
-                return None, None
-
-            model = VAR(var_data)
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                fitted = model.fit(maxlags=effective_lag, ic=None)
-
-            # Test Granger causality: does column 1 (signals) cause column 0 (returns)?
-            test_result = fitted.test_causality(
-                caused=0, causing=1, kind="f"
-            )
-            p_value = float(test_result.pvalue)
-            f_stat = float(test_result.test_statistic)
-
-            return p_value, f_stat
-
-        except Exception as exc:
-            logger.warning(
-                "Multivariate Granger (VAR) failed: %s. Skipping.", exc
-            )
-            return None, None
-
-    # ------------------------------------------------------------------
-    # Intervention robustness test
-    # ------------------------------------------------------------------
-
-    def _intervention_test(
-        self,
-        signals: np.ndarray,
-        returns: np.ndarray,
-        data_tensor: Optional[np.ndarray],
-    ) -> Tuple[float, bool]:
-        """Intervention-based robustness test.
-
-        Three perturbation scenarios are applied; the factor passes if
-        its IC remains above the threshold ratio in at least 2 of 3.
-
-        Returns ``(mean_ic_ratio, passes)``.
-        """
-        cfg = self.config
-
-        # Baseline IC
-        ic_orig = compute_ic(signals, returns)
-        valid_orig = ic_orig[~np.isnan(ic_orig)]
-        if len(valid_orig) < 3:
-            logger.warning("Too few valid IC periods for intervention test.")
-            return 1.0, True
-
-        mean_ic_orig = float(np.mean(np.abs(valid_orig)))
-        if mean_ic_orig < 1e-10:
-            # Zero IC baseline: interventions cannot degrade further
-            return 1.0, True
-
-        ratios: List[float] = []
-        pass_count = 0
-
-        scenarios = self._build_intervention_scenarios(
-            signals, returns, data_tensor
-        )
-
-        for scenario_name, perturbed_signals, perturbed_returns in scenarios:
-            ic_pert = compute_ic(perturbed_signals, perturbed_returns)
-            valid_pert = ic_pert[~np.isnan(ic_pert)]
-            if len(valid_pert) < 3:
-                # Not enough data after perturbation; count as pass
-                ratios.append(1.0)
-                pass_count += 1
-                continue
-
-            mean_ic_pert = float(np.mean(np.abs(valid_pert)))
-            ratio = mean_ic_pert / mean_ic_orig
-            ratios.append(ratio)
-            if ratio >= cfg.intervention_ic_threshold:
-                pass_count += 1
-
-        mean_ratio = float(np.mean(ratios)) if ratios else 1.0
-        passes = pass_count >= 2  # at least 2/3 interventions pass
-
-        return mean_ratio, passes
-
-    def _build_intervention_scenarios(
-        self,
-        signals: np.ndarray,
-        returns: np.ndarray,
-        data_tensor: Optional[np.ndarray],
-    ) -> List[Tuple[str, np.ndarray, np.ndarray]]:
-        """Construct the three intervention scenarios.
-
-        Returns a list of ``(name, perturbed_signals, perturbed_returns)``.
-        """
-        M, T = signals.shape
-        cfg = self.config
-        rng = self._rng
-
-        scenarios: List[Tuple[str, np.ndarray, np.ndarray]] = []
-
-        if data_tensor is not None and data_tensor.shape[:2] == (M, T):
-            # --- Volume shock: 2x on random 30% of periods ---
-            shock_periods = rng.choice(T, size=max(1, int(0.3 * T)), replace=False)
-            sig_vol = signals.copy()
-            sig_vol[:, shock_periods] *= cfg.intervention_magnitude
-            scenarios.append(("volume_shock", sig_vol, returns.copy()))
-
-            # --- Volatility shock: 2x noise on returns ---
-            ret_vol = returns.copy()
-            noise = rng.randn(M, T) * np.nanstd(returns) * cfg.intervention_magnitude
-            ret_vol += noise
-            scenarios.append(("volatility_shock", signals.copy(), ret_vol))
-
-            # --- Liquidity drought: zero volume on 10% of (asset, period) pairs ---
-            sig_liq = signals.copy()
-            n_pairs = max(1, int(0.1 * M * T))
-            drought_assets = rng.randint(0, M, size=n_pairs)
-            drought_periods = rng.randint(0, T, size=n_pairs)
-            sig_liq[drought_assets, drought_periods] = 0.0
-            scenarios.append(("liquidity_drought", sig_liq, returns.copy()))
-
-        else:
-            # --- Fallback: add noise directly to signals ---
-            for i, scenario_name in enumerate(
-                ["noise_shock_1", "noise_shock_2", "noise_shock_3"]
-            ):
-                sig_pert = signals.copy()
-                noise_scale = np.nanstd(signals) * cfg.intervention_magnitude
-                if noise_scale < 1e-12:
-                    noise_scale = cfg.intervention_magnitude
-                noise = rng.randn(M, T) * noise_scale * (0.5 + 0.5 * i)
-                sig_pert += noise
-                scenarios.append((scenario_name, sig_pert, returns.copy()))
-
-        return scenarios
-
-    # ------------------------------------------------------------------
-    # Robustness score
-    # ------------------------------------------------------------------
-
-    def _compute_robustness_score(
-        self,
-        granger_passes: bool,
-        granger_p: float,
-        intervention_ratio: float,
-        intervention_passes: bool,
-    ) -> float:
-        """Combine Granger and intervention results into a 0-1 score.
-
-        granger_component = 1.0 - min(p_value / significance, 1.0)
-        intervention_component = min(ic_ratio / 1.0, 1.0)
-        score = w_g * granger_component + w_i * intervention_component
-        """
-        cfg = self.config
-
-        granger_component = 1.0 - min(granger_p / cfg.granger_significance, 1.0)
-        intervention_component = min(intervention_ratio / 1.0, 1.0)
-
-        score = (
-            cfg.granger_weight * granger_component
-            + cfg.intervention_weight * intervention_component
-        )
-
-        return float(np.clip(score, 0.0, 1.0))
-
-    # ------------------------------------------------------------------
-    # Helpers
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _aggregate_top_assets(
-        matrix: np.ndarray, top_k: int = 20
-    ) -> np.ndarray:
-        """Average across the top-k assets (by mean absolute value) to
-        produce a T-length series.
-        """
-        M, T = matrix.shape
-        k = min(top_k, M)
-
-        # Mean absolute value per asset, ignoring NaN
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", category=RuntimeWarning)
-            asset_means = np.nanmean(np.abs(matrix), axis=1)
-
-        # Replace NaN means with -inf so they sort last
-        asset_means = np.where(np.isnan(asset_means), -np.inf, asset_means)
-        top_idx = np.argpartition(asset_means, -k)[-k:]
-
-        subset = matrix[top_idx, :]
-
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", category=RuntimeWarning)
-            series = np.nanmean(subset, axis=0)
-
-        # Fill remaining NaN with 0
-        series = np.where(np.isnan(series), 0.0, series)
-        return series
-
-    @staticmethod
-    def _is_degenerate(series: np.ndarray) -> bool:
-        """Check if a series is constant or all-NaN."""
-        valid = series[~np.isnan(series)]
-        if len(valid) < 3:
-            return True
-        return float(np.std(valid)) < 1e-12
-
-    @staticmethod
-    def _pca_reduce(X: np.ndarray, n_components: int = 5) -> np.ndarray:
-        """Reduce columns of X via truncated SVD (no sklearn dependency).
-
-        Parameters
-        ----------
-        X : np.ndarray, shape (T, K)
-        n_components : int
-
-        Returns
-        -------
-        np.ndarray, shape (T, n_components)
-        """
-        # Center
-        means = np.nanmean(X, axis=0)
-        X_centered = X - means
-        X_centered = np.where(np.isnan(X_centered), 0.0, X_centered)
-
-        # Economy SVD
-        n_comp = min(n_components, X_centered.shape[1], X_centered.shape[0])
-        try:
-            U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
-            return U[:, :n_comp] * S[:n_comp]
-        except np.linalg.LinAlgError:
-            logger.warning("SVD failed during PCA reduction; using raw data.")
-            return X_centered[:, :n_comp]
diff --git a/src/factorminer/factorminer/evaluation/combination.py b/src/factorminer/factorminer/evaluation/combination.py
deleted file mode 100644
index 62983c2..0000000
--- a/src/factorminer/factorminer/evaluation/combination.py
+++ /dev/null
@@ -1,195 +0,0 @@
-"""Factor combination strategies for building composite signals.
-
-Implements Equal-Weight, IC-Weighted, and Orthogonal combination methods
-for merging multiple alpha factors into a single composite signal, following
-the methodology described in the FactorMiner paper.
-"""
-
-from __future__ import annotations
-
-from typing import Dict, Optional
-
-import numpy as np
-
-
-class FactorCombiner:
-    """Combine multiple factor signals into a single composite signal.
-
-    Each factor signal is a 2-D array of shape (T, N) where T is the number
-    of time steps and N is the number of assets.  Factor IDs are arbitrary
-    integers used as dictionary keys.
-    """
-
-    # ------------------------------------------------------------------
-    # Public combination methods
-    # ------------------------------------------------------------------
-
-    def equal_weight(self, factor_signals: Dict[int, np.ndarray]) -> np.ndarray:
-        """Equal-Weight (EW): simple average of cross-sectionally standardized factors.
-
-        Paper results: IC Mean=0.1451, ICIR=1.2053, IC Win Rate=85.0%.
-
-        Parameters
-        ----------
-        factor_signals : dict[int, ndarray]
-            Mapping from factor ID to (T, N) signal array.
-
-        Returns
-        -------
-        ndarray of shape (T, N)
-            Composite signal (average of z-scored factors).
-        """
-        if not factor_signals:
-            raise ValueError("factor_signals must not be empty")
-
-        standardized = [
-            self._cross_sectional_standardize(sig)
-            for sig in factor_signals.values()
-        ]
-        stacked = np.stack(standardized, axis=0)  # (K, T, N)
-        # Average over factors, ignoring NaNs
-        return np.nanmean(stacked, axis=0)
-
-    def ic_weighted(
-        self,
-        factor_signals: Dict[int, np.ndarray],
-        ic_values: Dict[int, float],
-    ) -> np.ndarray:
-        """IC-Weighted (ICW): weight factors proportionally by their historical IC.
-
-        Paper results: IC Mean=0.1496, ICIR=1.2430, Cumulative Return=26.67
-        (12.4% over EW).
-
-        Parameters
-        ----------
-        factor_signals : dict[int, ndarray]
-            Mapping from factor ID to (T, N) signal array.
-        ic_values : dict[int, float]
-            Mapping from factor ID to its historical Information Coefficient.
-            Factors with non-positive IC are excluded.
-
-        Returns
-        -------
-        ndarray of shape (T, N)
-            Composite signal.
-        """
-        if not factor_signals:
-            raise ValueError("factor_signals must not be empty")
-
-        ids = list(factor_signals.keys())
-        weights: Dict[int, float] = {}
-        for fid in ids:
-            ic = ic_values.get(fid, 0.0)
-            if np.isfinite(ic) and ic > 0.0:
-                weights[fid] = ic
-
-        if not weights:
-            # Fall back to equal weight if all ICs are non-positive
-            return self.equal_weight(factor_signals)
-
-        total_weight = sum(weights.values())
-        ref_shape = next(iter(factor_signals.values())).shape
-        composite = np.zeros(ref_shape, dtype=np.float64)
-
-        for fid, w in weights.items():
-            z = self._cross_sectional_standardize(factor_signals[fid])
-            composite += (w / total_weight) * np.where(np.isnan(z), 0.0, z)
-
-        return composite
-
-    def orthogonal(self, factor_signals: Dict[int, np.ndarray]) -> np.ndarray:
-        """Orthogonal: Gram-Schmidt orthogonalization before averaging.
-
-        Removes cross-factor collinearity by projecting each factor onto the
-        subspace orthogonal to all previously processed factors, then averages
-        the orthogonalized residuals.
-
-        Paper results: IC Mean=0.1400, ICIR=1.1933.
-
-        Parameters
-        ----------
-        factor_signals : dict[int, ndarray]
-            Mapping from factor ID to (T, N) signal array.
-
-        Returns
-        -------
-        ndarray of shape (T, N)
-            Composite signal (average of orthogonalized z-scored factors).
-        """
-        if not factor_signals:
-            raise ValueError("factor_signals must not be empty")
-
-        standardized = [
-            self._cross_sectional_standardize(sig)
-            for sig in factor_signals.values()
-        ]
-
-        orthogonalized = self._gram_schmidt(standardized)
-        stacked = np.stack(orthogonalized, axis=0)  # (K, T, N)
-        return np.nanmean(stacked, axis=0)
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    def _cross_sectional_standardize(self, signals: np.ndarray) -> np.ndarray:
-        """Standardize signals cross-sectionally (across assets) at each time step.
-
-        z_score = (x - mean) / std  per cross-section (row).
-
-        Parameters
-        ----------
-        signals : ndarray of shape (T, N)
-
-        Returns
-        -------
-        ndarray of shape (T, N)
-            Cross-sectionally standardized values.  Rows where std == 0
-            are set to 0.
-        """
-        signals = np.asarray(signals, dtype=np.float64)
-        cs_mean = np.nanmean(signals, axis=1, keepdims=True)
-        cs_std = np.nanstd(signals, axis=1, keepdims=True)
-        # Avoid division by zero
-        cs_std = np.where(cs_std == 0.0, 1.0, cs_std)
-        return (signals - cs_mean) / cs_std
-
-    @staticmethod
-    def _gram_schmidt(factors: list[np.ndarray]) -> list[np.ndarray]:
-        """Modified Gram-Schmidt orthogonalization on flattened factor vectors.
-
-        Each factor is a (T, N) array.  We flatten to 1-D, orthogonalize,
-        then reshape back.  NaN values are treated as zero during projection
-        and restored afterward.
-
-        Parameters
-        ----------
-        factors : list of ndarray, each (T, N)
-
-        Returns
-        -------
-        list of ndarray, each (T, N) -- orthogonalized factors.
-        """
-        if len(factors) <= 1:
-            return list(factors)
-
-        shape = factors[0].shape
-        # Replace NaN with 0 for linear algebra, track NaN mask
-        nan_masks = [np.isnan(f) for f in factors]
-        vecs = [np.where(m, 0.0, f).ravel() for f, m in zip(factors, nan_masks)]
-
-        ortho: list[np.ndarray] = []
-        for i, v in enumerate(vecs):
-            u = v.copy()
-            for prev in ortho:
-                denom = np.dot(prev, prev)
-                if denom > 1e-12:
-                    u -= (np.dot(u, prev) / denom) * prev
-            ortho.append(u)
-
-        result = []
-        for u, mask in zip(ortho, nan_masks):
-            arr = u.reshape(shape)
-            arr[mask] = np.nan
-            result.append(arr)
-        return result
diff --git a/src/factorminer/factorminer/evaluation/correlation.py b/src/factorminer/factorminer/evaluation/correlation.py
deleted file mode 100644
index cef2fc7..0000000
--- a/src/factorminer/factorminer/evaluation/correlation.py
+++ /dev/null
@@ -1,374 +0,0 @@
-"""Efficient correlation computation for factor evaluation.
-
-Provides batch Spearman rank correlation, vectorized cross-sectional
-correlation, and incremental correlation matrix updates for the
-factor library.  Supports both numpy and optional torch backends.
-"""
-
-from __future__ import annotations
-
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-from scipy.stats import rankdata
-
-
-# ---------------------------------------------------------------------------
-# Batch cross-sectional Spearman rank correlation
-# ---------------------------------------------------------------------------
-
-def _rank_columns(x: np.ndarray) -> np.ndarray:
-    """Rank each column of x independently, leaving NaN as NaN.
-
-    Parameters
-    ----------
-    x : np.ndarray, shape (M, T)
-
-    Returns
-    -------
-    np.ndarray, shape (M, T)
-        Ranks per column, NaN where input was NaN.
-    """
-    M, T = x.shape
-    ranked = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(T):
-        col = x[:, t]
-        valid = ~np.isnan(col)
-        if valid.sum() < 2:
-            continue
-        ranked[valid, t] = rankdata(col[valid])
-    return ranked
-
-
-def batch_spearman_correlation(
-    candidate_signals: np.ndarray,
-    library_signals: np.ndarray,
-) -> np.ndarray:
-    """Compute Spearman correlation between one candidate and multiple library factors.
-
-    For each library factor g, computes:
-        rho = (1/|T_valid|) * sum_t Corr_rank(candidate_t, g_t)
-
-    Parameters
-    ----------
-    candidate_signals : np.ndarray, shape (M, T)
-        Signal array for the candidate factor.
-    library_signals : np.ndarray, shape (N, M, T)
-        Signal arrays for N library factors.
-
-    Returns
-    -------
-    np.ndarray, shape (N,)
-        Average cross-sectional Spearman correlation with each library factor.
-    """
-    N = library_signals.shape[0]
-    if N == 0:
-        return np.array([], dtype=np.float64)
-
-    M, T = candidate_signals.shape
-    correlations = np.zeros(N, dtype=np.float64)
-
-    # Rank candidate columns once
-    cand_ranked = _rank_columns(candidate_signals)
-
-    for i in range(N):
-        lib_ranked = _rank_columns(library_signals[i])
-        corr_sum = 0.0
-        count = 0
-        for t in range(T):
-            cr = cand_ranked[:, t]
-            lr = lib_ranked[:, t]
-            valid = ~(np.isnan(cr) | np.isnan(lr))
-            n = valid.sum()
-            if n < 5:
-                continue
-            cr_v = cr[valid]
-            lr_v = lr[valid]
-            cr_m = cr_v - cr_v.mean()
-            lr_m = lr_v - lr_v.mean()
-            denom = np.sqrt((cr_m ** 2).sum() * (lr_m ** 2).sum())
-            if denom > 1e-12:
-                corr_sum += (cr_m * lr_m).sum() / denom
-            count += 1
-        if count > 0:
-            correlations[i] = corr_sum / count
-
-    return correlations
-
-
-def batch_spearman_pairwise(
-    signals_list: List[np.ndarray],
-) -> np.ndarray:
-    """Compute pairwise Spearman correlation matrix for a list of signal arrays.
-
-    Parameters
-    ----------
-    signals_list : list of np.ndarray, each shape (M, T)
-        Signal arrays for K candidate factors.
-
-    Returns
-    -------
-    np.ndarray, shape (K, K)
-        Symmetric correlation matrix. Diagonal is 1.0.
-    """
-    K = len(signals_list)
-    if K == 0:
-        return np.array([], dtype=np.float64).reshape(0, 0)
-
-    M, T = signals_list[0].shape
-
-    # Pre-compute ranks for all candidates
-    ranked_list = [_rank_columns(s) for s in signals_list]
-
-    corr_matrix = np.eye(K, dtype=np.float64)
-
-    for i in range(K):
-        for j in range(i + 1, K):
-            corr_sum = 0.0
-            count = 0
-            for t in range(T):
-                ri = ranked_list[i][:, t]
-                rj = ranked_list[j][:, t]
-                valid = ~(np.isnan(ri) | np.isnan(rj))
-                n = valid.sum()
-                if n < 5:
-                    continue
-                ri_v = ri[valid]
-                rj_v = rj[valid]
-                ri_m = ri_v - ri_v.mean()
-                rj_m = rj_v - rj_v.mean()
-                denom = np.sqrt((ri_m ** 2).sum() * (rj_m ** 2).sum())
-                if denom > 1e-12:
-                    corr_sum += (ri_m * rj_m).sum() / denom
-                count += 1
-            if count > 0:
-                corr_matrix[i, j] = corr_sum / count
-                corr_matrix[j, i] = corr_matrix[i, j]
-
-    return corr_matrix
-
-
-# ---------------------------------------------------------------------------
-# Incremental correlation matrix update
-# ---------------------------------------------------------------------------
-
-class IncrementalCorrelationMatrix:
-    """Maintains a correlation matrix that can be incrementally updated.
-
-    Supports adding new factors and removing existing ones without
-    recomputing the entire matrix from scratch.
-    """
-
-    def __init__(self) -> None:
-        self._signals: Dict[str, np.ndarray] = {}
-        self._ranked: Dict[str, np.ndarray] = {}
-        self._corr_cache: Dict[Tuple[str, str], float] = {}
-        self._factor_ids: List[str] = []
-
-    @property
-    def size(self) -> int:
-        return len(self._factor_ids)
-
-    @property
-    def factor_ids(self) -> List[str]:
-        return list(self._factor_ids)
-
-    def _compute_pair_corr(self, id_a: str, id_b: str) -> float:
-        """Compute average cross-sectional Spearman between two factors."""
-        ra = self._ranked[id_a]
-        rb = self._ranked[id_b]
-        M, T = ra.shape
-        corr_sum = 0.0
-        count = 0
-        for t in range(T):
-            a_col = ra[:, t]
-            b_col = rb[:, t]
-            valid = ~(np.isnan(a_col) | np.isnan(b_col))
-            n = valid.sum()
-            if n < 5:
-                continue
-            a_v = a_col[valid]
-            b_v = b_col[valid]
-            a_m = a_v - a_v.mean()
-            b_m = b_v - b_v.mean()
-            denom = np.sqrt((a_m ** 2).sum() * (b_m ** 2).sum())
-            if denom > 1e-12:
-                corr_sum += (a_m * b_m).sum() / denom
-            count += 1
-        return corr_sum / count if count > 0 else 0.0
-
-    def add_factor(self, factor_id: str, signals: np.ndarray) -> Dict[str, float]:
-        """Add a factor and compute its correlation with all existing factors.
-
-        Parameters
-        ----------
-        factor_id : str
-        signals : np.ndarray, shape (M, T)
-
-        Returns
-        -------
-        dict
-            Mapping from existing factor_id to correlation with the new factor.
-        """
-        self._signals[factor_id] = signals
-        self._ranked[factor_id] = _rank_columns(signals)
-
-        correlations: Dict[str, float] = {}
-        for existing_id in self._factor_ids:
-            corr = self._compute_pair_corr(factor_id, existing_id)
-            key = (min(factor_id, existing_id), max(factor_id, existing_id))
-            self._corr_cache[key] = corr
-            correlations[existing_id] = corr
-
-        self._factor_ids.append(factor_id)
-        return correlations
-
-    def remove_factor(self, factor_id: str) -> None:
-        """Remove a factor from the matrix."""
-        if factor_id not in self._signals:
-            return
-        self._signals.pop(factor_id, None)
-        self._ranked.pop(factor_id, None)
-        self._factor_ids = [fid for fid in self._factor_ids if fid != factor_id]
-        # Remove cached correlations involving this factor
-        keys_to_remove = [
-            k for k in self._corr_cache if factor_id in k
-        ]
-        for k in keys_to_remove:
-            del self._corr_cache[k]
-
-    def get_correlation(self, id_a: str, id_b: str) -> float:
-        """Get cached correlation between two factors."""
-        key = (min(id_a, id_b), max(id_a, id_b))
-        if key in self._corr_cache:
-            return self._corr_cache[key]
-        if id_a == id_b:
-            return 1.0
-        return 0.0
-
-    def get_max_correlation(self, factor_id: str) -> Tuple[float, Optional[str]]:
-        """Get the maximum absolute correlation of a factor with all others.
-
-        Returns
-        -------
-        tuple of (max_abs_corr, most_correlated_factor_id)
-        """
-        max_corr = 0.0
-        max_id: Optional[str] = None
-        for other_id in self._factor_ids:
-            if other_id == factor_id:
-                continue
-            corr = abs(self.get_correlation(factor_id, other_id))
-            if corr > max_corr:
-                max_corr = corr
-                max_id = other_id
-        return max_corr, max_id
-
-    def to_matrix(self) -> np.ndarray:
-        """Return the full correlation matrix as a numpy array.
-
-        Returns
-        -------
-        np.ndarray, shape (N, N)
-        """
-        N = len(self._factor_ids)
-        mat = np.eye(N, dtype=np.float64)
-        for i in range(N):
-            for j in range(i + 1, N):
-                corr = self.get_correlation(self._factor_ids[i], self._factor_ids[j])
-                mat[i, j] = corr
-                mat[j, i] = corr
-        return mat
-
-
-# ---------------------------------------------------------------------------
-# Torch backend (optional)
-# ---------------------------------------------------------------------------
-
-def _try_torch_rank_correlation(
-    candidate: np.ndarray,
-    library: np.ndarray,
-) -> Optional[np.ndarray]:
-    """Attempt to compute rank correlations using PyTorch for GPU acceleration.
-
-    Falls back to None if torch is not available.
-
-    Parameters
-    ----------
-    candidate : np.ndarray, shape (M, T)
-    library : np.ndarray, shape (N, M, T)
-
-    Returns
-    -------
-    np.ndarray, shape (N,) or None if torch unavailable.
-    """
-    try:
-        import torch
-    except ImportError:
-        return None
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    N, M, T = library.shape
-    cand_t = torch.from_numpy(candidate).to(device, dtype=torch.float64)
-    lib_t = torch.from_numpy(library).to(device, dtype=torch.float64)
-
-    correlations = torch.zeros(N, dtype=torch.float64, device=device)
-
-    for t in range(T):
-        c_col = cand_t[:, t]
-        l_cols = lib_t[:, :, t]  # (N, M)
-
-        # Skip if too many NaN
-        c_valid = ~torch.isnan(c_col)
-        if c_valid.sum() < 5:
-            continue
-
-        # Rank the candidate column
-        c_sorted_idx = c_col[c_valid].argsort().argsort().float() + 1.0
-
-        for i in range(N):
-            l_col = l_cols[i]
-            valid = c_valid & ~torch.isnan(l_col)
-            n = valid.sum()
-            if n < 5:
-                continue
-            # Rank both
-            c_v = c_col[valid]
-            l_v = l_col[valid]
-            c_rank = c_v.argsort().argsort().float() + 1.0
-            l_rank = l_v.argsort().argsort().float() + 1.0
-            c_m = c_rank - c_rank.mean()
-            l_m = l_rank - l_rank.mean()
-            denom = torch.sqrt((c_m ** 2).sum() * (l_m ** 2).sum())
-            if denom > 1e-12:
-                correlations[i] += (c_m * l_m).sum() / denom
-
-    correlations /= max(T, 1)
-    return correlations.cpu().numpy()
-
-
-def compute_correlation_batch(
-    candidate: np.ndarray,
-    library: np.ndarray,
-    backend: str = "numpy",
-) -> np.ndarray:
-    """Compute correlations between candidate and library, with backend selection.
-
-    Parameters
-    ----------
-    candidate : np.ndarray, shape (M, T)
-    library : np.ndarray, shape (N, M, T)
-    backend : str
-        "numpy" or "gpu"
-
-    Returns
-    -------
-    np.ndarray, shape (N,)
-    """
-    if backend == "gpu":
-        result = _try_torch_rank_correlation(candidate, library)
-        if result is not None:
-            return result
-
-    return batch_spearman_correlation(candidate, library)
diff --git a/src/factorminer/factorminer/evaluation/metrics.py b/src/factorminer/factorminer/evaluation/metrics.py
deleted file mode 100644
index 006c75e..0000000
--- a/src/factorminer/factorminer/evaluation/metrics.py
+++ /dev/null
@@ -1,377 +0,0 @@
-"""Core evaluation metrics for alpha factors.
-
-Provides vectorized, production-quality implementations of Information
-Coefficient (IC), ICIR, quintile analysis, turnover, and comprehensive
-factor statistics used by the validation pipeline.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-from scipy.stats import rankdata
-
-
-# ---------------------------------------------------------------------------
-# Information Coefficient
-# ---------------------------------------------------------------------------
-
-def compute_ic(signals: np.ndarray, returns: np.ndarray) -> np.ndarray:
-    """Compute IC_t = Corr_rank(s_t, r_{t+1}) for each time period.
-
-    Uses Spearman rank correlation computed cross-sectionally at each t.
-
-    Parameters
-    ----------
-    signals : np.ndarray, shape (M, T)
-        Factor signals for M assets over T periods.
-    returns : np.ndarray, shape (M, T)
-        Forward returns for M assets over T periods.
-
-    Returns
-    -------
-    np.ndarray, shape (T,)
-        Spearman rank correlation per period.  NaN where fewer than 5
-        valid (non-NaN) asset pairs exist.
-    """
-    M, T = signals.shape
-    ic_series = np.full(T, np.nan, dtype=np.float64)
-
-    for t in range(T):
-        s = signals[:, t]
-        r = returns[:, t]
-        valid = ~(np.isnan(s) | np.isnan(r))
-        n = valid.sum()
-        if n < 5:
-            continue
-        rs = rankdata(s[valid])
-        rr = rankdata(r[valid])
-        # Pearson correlation on ranks = Spearman
-        rs_m = rs - rs.mean()
-        rr_m = rr - rr.mean()
-        denom = np.sqrt((rs_m ** 2).sum() * (rr_m ** 2).sum())
-        if denom < 1e-12:
-            ic_series[t] = 0.0
-        else:
-            ic_series[t] = (rs_m * rr_m).sum() / denom
-
-    return ic_series
-
-
-def compute_ic_vectorized(signals: np.ndarray, returns: np.ndarray) -> np.ndarray:
-    """Fully vectorized IC computation (faster for large M, T).
-
-    Ranks are computed per-column, then Pearson correlation on ranks
-    is computed without Python-level loops over T.
-
-    Parameters
-    ----------
-    signals : np.ndarray, shape (M, T)
-    returns : np.ndarray, shape (M, T)
-
-    Returns
-    -------
-    np.ndarray, shape (T,)
-    """
-    M, T = signals.shape
-    ic_series = np.full(T, np.nan, dtype=np.float64)
-
-    # Mask invalid entries
-    invalid = np.isnan(signals) | np.isnan(returns)
-
-    # Rank each column independently (replace NaN with very large value to push to end)
-    big = 1e18
-    sig_filled = np.where(invalid, big, signals)
-    ret_filled = np.where(invalid, big, returns)
-
-    for t in range(T):
-        valid = ~invalid[:, t]
-        n = valid.sum()
-        if n < 5:
-            continue
-        rs = rankdata(sig_filled[valid, t])
-        rr = rankdata(ret_filled[valid, t])
-        rs_m = rs - rs.mean()
-        rr_m = rr - rr.mean()
-        denom = np.sqrt((rs_m ** 2).sum() * (rr_m ** 2).sum())
-        ic_series[t] = (rs_m * rr_m).sum() / denom if denom > 1e-12 else 0.0
-
-    return ic_series
-
-
-# ---------------------------------------------------------------------------
-# IC-derived statistics
-# ---------------------------------------------------------------------------
-
-def compute_icir(ic_series: np.ndarray) -> float:
-    """Compute ICIR = mean(IC) / std(IC).
-
-    Parameters
-    ----------
-    ic_series : np.ndarray
-        IC time series (may contain NaN).
-
-    Returns
-    -------
-    float
-        ICIR value.  Returns 0.0 if std is near zero or too few valid points.
-    """
-    valid = ic_series[~np.isnan(ic_series)]
-    if len(valid) < 3:
-        return 0.0
-    std = float(np.std(valid, ddof=1))
-    if std < 1e-12:
-        return 0.0
-    return float(np.mean(valid)) / std
-
-
-def compute_ic_mean(ic_series: np.ndarray) -> float:
-    """Compute mean absolute IC.
-
-    Parameters
-    ----------
-    ic_series : np.ndarray
-
-    Returns
-    -------
-    float
-    """
-    valid = ic_series[~np.isnan(ic_series)]
-    if len(valid) == 0:
-        return 0.0
-    return float(np.mean(np.abs(valid)))
-
-
-def compute_ic_win_rate(ic_series: np.ndarray) -> float:
-    """Fraction of periods with positive IC.
-
-    Parameters
-    ----------
-    ic_series : np.ndarray
-
-    Returns
-    -------
-    float
-        Win rate in [0, 1].
-    """
-    valid = ic_series[~np.isnan(ic_series)]
-    if len(valid) == 0:
-        return 0.0
-    return float(np.mean(valid > 0))
-
-
-# ---------------------------------------------------------------------------
-# Cross-factor correlation
-# ---------------------------------------------------------------------------
-
-def compute_pairwise_correlation(
-    signals_a: np.ndarray,
-    signals_b: np.ndarray,
-) -> float:
-    """Time-averaged cross-sectional Spearman correlation between two factors.
-
-    rho(a, b) = (1/|T|) * sum_t Corr_rank(s_t^a, s_t^b)
-
-    Parameters
-    ----------
-    signals_a : np.ndarray, shape (M, T)
-    signals_b : np.ndarray, shape (M, T)
-
-    Returns
-    -------
-    float
-        Average cross-sectional Spearman correlation.
-    """
-    M, T = signals_a.shape
-    corrs = []
-
-    for t in range(T):
-        a = signals_a[:, t]
-        b = signals_b[:, t]
-        valid = ~(np.isnan(a) | np.isnan(b))
-        n = valid.sum()
-        if n < 5:
-            continue
-        ra = rankdata(a[valid])
-        rb = rankdata(b[valid])
-        ra_m = ra - ra.mean()
-        rb_m = rb - rb.mean()
-        denom = np.sqrt((ra_m ** 2).sum() * (rb_m ** 2).sum())
-        if denom < 1e-12:
-            corrs.append(0.0)
-        else:
-            corrs.append(float((ra_m * rb_m).sum() / denom))
-
-    if not corrs:
-        return 0.0
-    return float(np.mean(corrs))
-
-
-# ---------------------------------------------------------------------------
-# Quintile analysis
-# ---------------------------------------------------------------------------
-
-def compute_quintile_returns(
-    signals: np.ndarray,
-    returns: np.ndarray,
-    n_quantiles: int = 5,
-) -> dict:
-    """Sort assets into quintiles by factor signal, compute average returns.
-
-    Parameters
-    ----------
-    signals : np.ndarray, shape (M, T)
-    returns : np.ndarray, shape (M, T)
-    n_quantiles : int
-        Number of quantile buckets (default 5 for quintiles).
-
-    Returns
-    -------
-    dict
-        Keys: Q1..Q{n}, long_short, monotonicity.
-        Q1 is lowest signal quintile, Q{n} is highest.
-    """
-    M, T = signals.shape
-    # Accumulate per-quintile return sums
-    quintile_returns = {q: [] for q in range(1, n_quantiles + 1)}
-
-    for t in range(T):
-        s = signals[:, t]
-        r = returns[:, t]
-        valid = ~(np.isnan(s) | np.isnan(r))
-        n = valid.sum()
-        if n < n_quantiles:
-            continue
-        s_valid = s[valid]
-        r_valid = r[valid]
-        # Assign quintile labels via rank
-        ranks = rankdata(s_valid)
-        # Map to quintile: ceil(rank / n * n_quantiles), clamped
-        q_labels = np.clip(
-            np.ceil(ranks / n * n_quantiles).astype(int),
-            1,
-            n_quantiles,
-        )
-        for q in range(1, n_quantiles + 1):
-            mask = q_labels == q
-            if mask.any():
-                quintile_returns[q].append(float(np.mean(r_valid[mask])))
-
-    result = {}
-    means = {}
-    for q in range(1, n_quantiles + 1):
-        key = f"Q{q}"
-        if quintile_returns[q]:
-            means[q] = float(np.mean(quintile_returns[q]))
-        else:
-            means[q] = 0.0
-        result[key] = means[q]
-
-    # Long-short: top quintile minus bottom quintile
-    result["long_short"] = means[n_quantiles] - means[1]
-
-    # Monotonicity: Spearman corr between quintile index and mean return
-    q_indices = np.arange(1, n_quantiles + 1, dtype=np.float64)
-    q_returns = np.array([means[q] for q in range(1, n_quantiles + 1)])
-    if np.std(q_returns) < 1e-12:
-        result["monotonicity"] = 0.0
-    else:
-        rq = rankdata(q_indices)
-        rr = rankdata(q_returns)
-        rq_m = rq - rq.mean()
-        rr_m = rr - rr.mean()
-        denom = np.sqrt((rq_m ** 2).sum() * (rr_m ** 2).sum())
-        result["monotonicity"] = float((rq_m * rr_m).sum() / denom) if denom > 1e-12 else 0.0
-
-    return result
-
-
-# ---------------------------------------------------------------------------
-# Turnover
-# ---------------------------------------------------------------------------
-
-def compute_turnover(signals: np.ndarray, top_fraction: float = 0.2) -> float:
-    """Compute average portfolio turnover rate.
-
-    Turnover measures the fraction of top-ranked assets that change
-    between consecutive periods.
-
-    Parameters
-    ----------
-    signals : np.ndarray, shape (M, T)
-    top_fraction : float
-        Fraction of assets in the "top" bucket (default 0.2 = top quintile).
-
-    Returns
-    -------
-    float
-        Average turnover rate in [0, 1].
-    """
-    M, T = signals.shape
-    k = max(int(M * top_fraction), 1)
-    turnovers = []
-
-    prev_top = None
-    for t in range(T):
-        col = signals[:, t]
-        valid = ~np.isnan(col)
-        if valid.sum() < k:
-            prev_top = None
-            continue
-        # Get indices of top-k assets
-        # Use argpartition for efficiency
-        col_filled = np.where(valid, col, -np.inf)
-        top_idx = set(np.argpartition(col_filled, -k)[-k:])
-
-        if prev_top is not None:
-            overlap = len(top_idx & prev_top)
-            turnover = 1.0 - overlap / k
-            turnovers.append(turnover)
-        prev_top = top_idx
-
-    if not turnovers:
-        return 0.0
-    return float(np.mean(turnovers))
-
-
-# ---------------------------------------------------------------------------
-# Comprehensive factor statistics
-# ---------------------------------------------------------------------------
-
-def compute_factor_stats(
-    signals: np.ndarray,
-    returns: np.ndarray,
-) -> dict:
-    """Compute comprehensive factor statistics.
-
-    Parameters
-    ----------
-    signals : np.ndarray, shape (M, T)
-    returns : np.ndarray, shape (M, T)
-
-    Returns
-    -------
-    dict
-        Keys: ic_mean, ic_abs_mean, icir, ic_win_rate,
-              Q1..Q5, long_short, monotonicity, turnover
-    """
-    ic_series = compute_ic(signals, returns)
-    valid_ic = ic_series[~np.isnan(ic_series)]
-
-    stats: dict = {
-        "ic_series": ic_series,
-        "ic_mean": float(np.mean(valid_ic)) if len(valid_ic) > 0 else 0.0,
-        "ic_abs_mean": compute_ic_mean(ic_series),
-        "icir": compute_icir(ic_series),
-        "ic_win_rate": compute_ic_win_rate(ic_series),
-        "ic_std": float(np.std(valid_ic, ddof=1)) if len(valid_ic) > 2 else 0.0,
-        "n_periods": int((~np.isnan(ic_series)).sum()),
-    }
-
-    # Quintile analysis
-    quintile = compute_quintile_returns(signals, returns)
-    stats.update(quintile)
-
-    # Turnover
-    stats["turnover"] = compute_turnover(signals)
-
-    return stats
diff --git a/src/factorminer/factorminer/evaluation/pipeline.py b/src/factorminer/factorminer/evaluation/pipeline.py
deleted file mode 100644
index 2392b0e..0000000
--- a/src/factorminer/factorminer/evaluation/pipeline.py
+++ /dev/null
@@ -1,736 +0,0 @@
-"""Multi-stage factor evaluation and validation pipeline.
-
-Implements Algorithm 1 Step 3: the four-stage evaluation cascade that
-screens, deduplicates, and validates candidate alpha factors before
-admitting them to the factor library.
-
-Stages:
-    1. Fast IC screening on a subset of assets
-    2. Correlation check against the existing library
-    2.5. Replacement check for rejected-but-strong candidates
-    3. Intra-batch deduplication
-    4. Full validation on the complete asset universe
-
-Supports parallel evaluation via a configurable multiprocessing worker pool.
-"""
-
-from __future__ import annotations
-
-import logging
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-import numpy as np
-
-from src.factorminer.factorminer.evaluation.admission import (
-    AdmissionDecision,
-    check_admission,
-    check_replacement,
-)
-from src.factorminer.factorminer.evaluation.correlation import (
-    batch_spearman_correlation,
-    batch_spearman_pairwise,
-    compute_correlation_batch,
-)
-from src.factorminer.factorminer.evaluation.metrics import (
-    compute_factor_stats,
-    compute_ic,
-    compute_ic_mean,
-    compute_icir,
-)
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Data types
-# ---------------------------------------------------------------------------
-
-@dataclass
-class CandidateFactor:
-    """A candidate factor to be evaluated."""
-
-    name: str
-    formula: str
-    signals: Optional[np.ndarray] = None  # (M, T) computed signals
-    metadata: Dict[str, Any] = field(default_factory=dict)
-
-
-@dataclass
-class EvaluationResult:
-    """Result of evaluating a single candidate through the pipeline."""
-
-    factor_name: str
-    formula: str
-    ic_series: Optional[np.ndarray] = None
-    ic_mean: float = 0.0
-    icir: float = 0.0
-    max_correlation: float = 0.0
-    correlated_with: Optional[str] = None
-    stage_passed: int = 0  # Highest stage passed (1-4), 0 if failed stage 1
-    rejection_reason: Optional[str] = None
-    admitted: bool = False
-    replaced: Optional[str] = None  # ID of replaced factor if replacement occurred
-    full_stats: Optional[dict] = None  # Full stats from stage 4
-
-    def to_trajectory_dict(self) -> dict:
-        """Convert to a dict compatible with the memory formation trajectory format."""
-        return {
-            "factor_id": self.factor_name,
-            "formula": self.formula,
-            "ic": self.ic_mean,
-            "icir": self.icir,
-            "max_correlation": self.max_correlation,
-            "correlated_with": self.correlated_with or "",
-            "admitted": self.admitted,
-            "rejection_reason": self.rejection_reason or "",
-            "replaced": self.replaced,
-            "stage_passed": self.stage_passed,
-        }
-
-
-@dataclass
-class FactorLibraryView:
-    """Read-only view of the factor library for the pipeline.
-
-    Provides the data needed for correlation checks and replacement
-    decisions without exposing the full library internals.
-    """
-
-    factor_ids: List[str]
-    signals: Dict[str, np.ndarray]  # factor_id -> (M, T)
-    ic_map: Dict[str, float]  # factor_id -> absolute IC
-
-    @property
-    def size(self) -> int:
-        return len(self.factor_ids)
-
-    def get_signals_tensor(self) -> np.ndarray:
-        """Return library signals as a (N, M, T) tensor.
-
-        Returns
-        -------
-        np.ndarray, shape (N, M, T)
-        """
-        if not self.factor_ids:
-            return np.array([]).reshape(0, 0, 0)
-        return np.stack([self.signals[fid] for fid in self.factor_ids], axis=0)
-
-
-@dataclass
-class PipelineConfig:
-    """Configuration for the validation pipeline."""
-
-    # Stage 1: Fast IC screening
-    ic_threshold: float = 0.04
-    fast_screen_assets: int = 100
-
-    # Stage 2: Correlation threshold
-    correlation_threshold: float = 0.5
-
-    # Stage 2.5: Replacement
-    replacement_ic_min: float = 0.10
-    replacement_ic_ratio: float = 1.3
-
-    # Stage 4: Full validation
-    icir_threshold: float = 0.5
-
-    # Parallelism
-    num_workers: int = 4
-    backend: str = "numpy"  # "numpy" or "gpu"
-
-    @classmethod
-    def from_config(cls, mining_cfg, eval_cfg) -> PipelineConfig:
-        """Build from MiningConfig and EvaluationConfig objects."""
-        return cls(
-            ic_threshold=mining_cfg.ic_threshold,
-            correlation_threshold=mining_cfg.correlation_threshold,
-            replacement_ic_min=mining_cfg.replacement_ic_min,
-            replacement_ic_ratio=mining_cfg.replacement_ic_ratio,
-            fast_screen_assets=eval_cfg.fast_screen_assets,
-            num_workers=eval_cfg.num_workers,
-            backend=eval_cfg.backend,
-        )
-
-
-# ---------------------------------------------------------------------------
-# Worker function for multiprocessing
-# ---------------------------------------------------------------------------
-
-def _evaluate_single_candidate_ic(
-    signals: np.ndarray,
-    returns: np.ndarray,
-) -> Tuple[np.ndarray, float, float]:
-    """Compute IC series, IC mean, and ICIR for a single candidate.
-
-    Designed to be called in a worker process.
-    """
-    ic_series = compute_ic(signals, returns)
-    valid_ic = ic_series[~np.isnan(ic_series)]
-    ic_mean_val = float(np.mean(np.abs(valid_ic))) if len(valid_ic) > 0 else 0.0
-    icir_val = compute_icir(ic_series)
-    return ic_series, ic_mean_val, icir_val
-
-
-# ---------------------------------------------------------------------------
-# Validation Pipeline
-# ---------------------------------------------------------------------------
-
-class ValidationPipeline:
-    """Multi-stage factor evaluation pipeline.
-
-    Implements the cascade: Fast IC -> Correlation -> Replacement ->
-    Dedup -> Full Validation.
-
-    Parameters
-    ----------
-    returns : np.ndarray, shape (M, T)
-        Forward returns for all assets.
-    library : FactorLibraryView
-        Current state of the factor library.
-    config : PipelineConfig
-        Pipeline configuration.
-    compute_signals_fn : callable, optional
-        Function(CandidateFactor, data) -> np.ndarray to compute signals
-        if not pre-computed.
-    data : dict, optional
-        Market data dict for signal computation.
-    """
-
-    def __init__(
-        self,
-        returns: np.ndarray,
-        library: FactorLibraryView,
-        config: PipelineConfig,
-        compute_signals_fn: Optional[Callable] = None,
-        data: Optional[Dict[str, np.ndarray]] = None,
-    ) -> None:
-        self.returns = returns
-        self.library = library
-        self.config = config
-        self.compute_signals_fn = compute_signals_fn
-        self.data = data
-
-        M, T = returns.shape
-        # Pre-select a random subset of assets for fast screening
-        if config.fast_screen_assets < M:
-            rng = np.random.default_rng(42)
-            self._fast_idx = rng.choice(M, size=config.fast_screen_assets, replace=False)
-        else:
-            self._fast_idx = np.arange(M)
-
-        self._fast_returns = returns[self._fast_idx, :]
-
-    def evaluate_batch(
-        self,
-        candidates: List[CandidateFactor],
-    ) -> List[EvaluationResult]:
-        """Run the full multi-stage evaluation on a batch of candidates.
-
-        Parameters
-        ----------
-        candidates : list of CandidateFactor
-            Each candidate should have signals pre-computed or provide
-            a compute_signals_fn.
-
-        Returns
-        -------
-        list of EvaluationResult
-            One result per candidate, including rejected ones.
-        """
-        if not candidates:
-            return []
-
-        # Ensure signals are computed
-        self._ensure_signals(candidates)
-
-        results: Dict[str, EvaluationResult] = {}
-
-        logger.info(
-            "Starting pipeline evaluation for %d candidates", len(candidates)
-        )
-
-        # Stage 1: Fast IC screening
-        passed_s1, failed_s1 = self._stage1_ic_screen(candidates)
-        for c, result in failed_s1:
-            results[c.name] = result
-        logger.info(
-            "Stage 1 (IC screen): %d passed, %d failed",
-            len(passed_s1), len(failed_s1),
-        )
-
-        if not passed_s1:
-            return list(results.values())
-
-        # Stage 2: Correlation check against library
-        passed_s2, failed_s2, replacement_candidates = self._stage2_correlation_check(
-            passed_s1
-        )
-        for c, result in failed_s2:
-            results[c.name] = result
-        logger.info(
-            "Stage 2 (correlation): %d passed, %d failed, %d for replacement",
-            len(passed_s2), len(failed_s2), len(replacement_candidates),
-        )
-
-        # Stage 2.5: Replacement check
-        replaced = self._stage25_replacement_check(replacement_candidates)
-        for c, result in replaced:
-            results[c.name] = result
-        logger.info("Stage 2.5 (replacement): %d replacements", len(replaced))
-
-        if not passed_s2 and not replaced:
-            return list(results.values())
-
-        # Combine stage 2 passes and successful replacements
-        to_dedup = list(passed_s2)
-        for c, result in replaced:
-            if result.admitted:
-                to_dedup.append(c)
-
-        # Stage 3: Intra-batch deduplication
-        passed_s3, failed_s3 = self._stage3_batch_dedup(to_dedup)
-        for c, result in failed_s3:
-            results[c.name] = result
-        logger.info(
-            "Stage 3 (dedup): %d passed, %d failed",
-            len(passed_s3), len(failed_s3),
-        )
-
-        # Stage 4: Full validation
-        validated = self._stage4_full_validation(passed_s3)
-        for c, result in validated:
-            results[c.name] = result
-        logger.info(
-            "Stage 4 (full validation): %d admitted",
-            sum(1 for _, r in validated if r.admitted),
-        )
-
-        return list(results.values())
-
-    def _ensure_signals(self, candidates: List[CandidateFactor]) -> None:
-        """Compute signals for candidates that don't have them yet."""
-        if self.compute_signals_fn is None:
-            return
-        for c in candidates:
-            if c.signals is None and self.data is not None:
-                c.signals = self.compute_signals_fn(c, self.data)
-
-    # ----- Stage 1: Fast IC Screening -----
-
-    def _stage1_ic_screen(
-        self,
-        candidates: List[CandidateFactor],
-    ) -> Tuple[
-        List[CandidateFactor],
-        List[Tuple[CandidateFactor, EvaluationResult]],
-    ]:
-        """Stage 1: Fast IC screening on asset subset.
-
-        C1 = {a in C : |IC(a)| >= tau_IC}
-
-        Returns (passed, failed) where failed includes EvaluationResults.
-        """
-        passed = []
-        failed = []
-        threshold = self.config.ic_threshold
-
-        for c in candidates:
-            if c.signals is None:
-                failed.append((c, EvaluationResult(
-                    factor_name=c.name,
-                    formula=c.formula,
-                    stage_passed=0,
-                    rejection_reason="No signals computed",
-                )))
-                continue
-
-            # Use fast subset
-            fast_signals = c.signals[self._fast_idx, :]
-            ic_series = compute_ic(fast_signals, self._fast_returns)
-            valid_ic = ic_series[~np.isnan(ic_series)]
-
-            if len(valid_ic) == 0:
-                failed.append((c, EvaluationResult(
-                    factor_name=c.name,
-                    formula=c.formula,
-                    stage_passed=0,
-                    rejection_reason="No valid IC values",
-                )))
-                continue
-
-            ic_abs_mean = float(np.mean(np.abs(valid_ic)))
-
-            if ic_abs_mean < threshold:
-                failed.append((c, EvaluationResult(
-                    factor_name=c.name,
-                    formula=c.formula,
-                    ic_series=ic_series,
-                    ic_mean=ic_abs_mean,
-                    stage_passed=0,
-                    rejection_reason=f"Stage 1: |IC|={ic_abs_mean:.4f} < {threshold}",
-                )))
-            else:
-                # Store fast IC for later use
-                c.metadata["fast_ic_series"] = ic_series
-                c.metadata["fast_ic_mean"] = ic_abs_mean
-                passed.append(c)
-
-        return passed, failed
-
-    # ----- Stage 2: Correlation Check -----
-
-    def _stage2_correlation_check(
-        self,
-        candidates: List[CandidateFactor],
-    ) -> Tuple[
-        List[CandidateFactor],
-        List[Tuple[CandidateFactor, EvaluationResult]],
-        List[Tuple[CandidateFactor, Dict[str, float]]],
-    ]:
-        """Stage 2: Correlation check against the library.
-
-        C2 = {a in C1 : max_{g in L} |rho(a,g)| < theta}
-
-        Returns (passed, failed, replacement_candidates).
-        replacement_candidates contains candidates that failed correlation
-        but might qualify for replacement.
-        """
-        passed = []
-        failed = []
-        replacement_candidates = []
-
-        if self.library.size == 0:
-            # Empty library: all pass
-            return candidates, failed, replacement_candidates
-
-        theta = self.config.correlation_threshold
-        lib_tensor = self.library.get_signals_tensor()
-
-        for c in candidates:
-            # Compute correlation with all library factors
-            corrs = compute_correlation_batch(
-                c.signals,
-                lib_tensor,
-                backend=self.config.backend,
-            )
-            abs_corrs = np.abs(corrs)
-            max_idx = int(np.argmax(abs_corrs))
-            max_corr = float(abs_corrs[max_idx])
-            correlated_with = self.library.factor_ids[max_idx]
-
-            if max_corr < theta:
-                c.metadata["max_correlation"] = max_corr
-                c.metadata["correlated_with"] = correlated_with
-                passed.append(c)
-            else:
-                ic_abs = c.metadata.get("fast_ic_mean", 0.0)
-
-                # Check if candidate qualifies for replacement
-                if ic_abs >= self.config.replacement_ic_min:
-                    # Store full correlation map for replacement check
-                    corr_map = {
-                        fid: float(corrs[i])
-                        for i, fid in enumerate(self.library.factor_ids)
-                    }
-                    c.metadata["max_correlation"] = max_corr
-                    c.metadata["correlated_with"] = correlated_with
-                    c.metadata["correlation_map"] = corr_map
-                    replacement_candidates.append((c, corr_map))
-                else:
-                    failed.append((c, EvaluationResult(
-                        factor_name=c.name,
-                        formula=c.formula,
-                        ic_series=c.metadata.get("fast_ic_series"),
-                        ic_mean=ic_abs,
-                        max_correlation=max_corr,
-                        correlated_with=correlated_with,
-                        stage_passed=1,
-                        rejection_reason=(
-                            f"Stage 2: max|rho|={max_corr:.4f} >= {theta} "
-                            f"(with {correlated_with})"
-                        ),
-                    )))
-
-        return passed, failed, replacement_candidates
-
-    # ----- Stage 2.5: Replacement Check -----
-
-    def _stage25_replacement_check(
-        self,
-        replacement_candidates: List[Tuple[CandidateFactor, Dict[str, float]]],
-    ) -> List[Tuple[CandidateFactor, EvaluationResult]]:
-        """Stage 2.5: Check if rejected candidates can replace library members.
-
-        For a in C1 \\ C2, check replacement rule (Eq. 11):
-            |IC(a)| >= 0.10
-            |IC(a)| >= 1.3 * |IC(g)|
-            |{g : |rho(a,g)| >= theta}| == 1
-        """
-        results = []
-
-        for c, corr_map in replacement_candidates:
-            ic_abs = c.metadata.get("fast_ic_mean", 0.0)
-            max_corr = c.metadata.get("max_correlation", 0.0)
-            correlated_with = c.metadata.get("correlated_with")
-
-            decision = check_replacement(
-                candidate_ic_abs=ic_abs,
-                max_corr=max_corr,
-                correlated_with=correlated_with,
-                library_ic_map=self.library.ic_map,
-                correlation_map=corr_map,
-                replacement_ic_min=self.config.replacement_ic_min,
-                replacement_ic_ratio=self.config.replacement_ic_ratio,
-                correlation_threshold=self.config.correlation_threshold,
-            )
-
-            result = EvaluationResult(
-                factor_name=c.name,
-                formula=c.formula,
-                ic_series=c.metadata.get("fast_ic_series"),
-                ic_mean=ic_abs,
-                max_correlation=max_corr,
-                correlated_with=correlated_with,
-                admitted=decision.admitted,
-                replaced=decision.replaced_factor_id,
-                stage_passed=2 if decision.admitted else 1,
-                rejection_reason=decision.rejection_reason,
-            )
-            results.append((c, result))
-
-        return results
-
-    # ----- Stage 3: Batch Deduplication -----
-
-    def _stage3_batch_dedup(
-        self,
-        candidates: List[CandidateFactor],
-    ) -> Tuple[
-        List[CandidateFactor],
-        List[Tuple[CandidateFactor, EvaluationResult]],
-    ]:
-        """Stage 3: Intra-batch deduplication.
-
-        Remove candidates that are too correlated with each other
-        within the same batch, keeping the one with higher IC.
-        """
-        if len(candidates) <= 1:
-            return candidates, []
-
-        theta = self.config.correlation_threshold
-        signals_list = [c.signals for c in candidates]
-        corr_matrix = batch_spearman_pairwise(signals_list)
-
-        # Greedy dedup: sort by IC descending, keep each if not correlated
-        # with any already-kept candidate
-        ic_vals = [c.metadata.get("fast_ic_mean", 0.0) for c in candidates]
-        order = sorted(range(len(candidates)), key=lambda i: -ic_vals[i])
-
-        kept_indices = set()
-        removed = []
-
-        for idx in order:
-            is_correlated = False
-            for kept_idx in kept_indices:
-                if abs(corr_matrix[idx, kept_idx]) >= theta:
-                    is_correlated = True
-                    removed.append((candidates[idx], EvaluationResult(
-                        factor_name=candidates[idx].name,
-                        formula=candidates[idx].formula,
-                        ic_mean=ic_vals[idx],
-                        max_correlation=float(abs(corr_matrix[idx, kept_idx])),
-                        correlated_with=candidates[kept_idx].name,
-                        stage_passed=2,
-                        rejection_reason=(
-                            f"Stage 3: intra-batch dup with {candidates[kept_idx].name} "
-                            f"(rho={corr_matrix[idx, kept_idx]:.4f})"
-                        ),
-                    )))
-                    break
-            if not is_correlated:
-                kept_indices.add(idx)
-
-        passed = [candidates[i] for i in sorted(kept_indices)]
-        return passed, removed
-
-    # ----- Stage 4: Full Validation -----
-
-    def _stage4_full_validation(
-        self,
-        candidates: List[CandidateFactor],
-    ) -> List[Tuple[CandidateFactor, EvaluationResult]]:
-        """Stage 4: Full validation on complete asset universe.
-
-        Compute comprehensive statistics using all assets and apply
-        final quality checks.
-        """
-        results = []
-        threshold = self.config.ic_threshold
-
-        use_parallel = self.config.num_workers > 1 and len(candidates) > 1
-
-        if use_parallel:
-            results = self._stage4_parallel(candidates)
-        else:
-            for c in candidates:
-                result = self._validate_single(c)
-                results.append((c, result))
-
-        return results
-
-    def _validate_single(self, c: CandidateFactor) -> EvaluationResult:
-        """Run full validation for a single candidate."""
-        stats = compute_factor_stats(c.signals, self.returns)
-        ic_series = stats["ic_series"]
-        ic_abs_mean = stats["ic_abs_mean"]
-        icir = stats["icir"]
-
-        max_corr = c.metadata.get("max_correlation", 0.0)
-        correlated_with = c.metadata.get("correlated_with")
-        replaced = c.metadata.get("replaced") if "replaced" in c.metadata else None
-
-        # Check if previously marked as replacement
-        if hasattr(c, "_replacement_target"):
-            replaced = c._replacement_target
-
-        # Apply final threshold
-        if ic_abs_mean < self.config.ic_threshold:
-            return EvaluationResult(
-                factor_name=c.name,
-                formula=c.formula,
-                ic_series=ic_series,
-                ic_mean=ic_abs_mean,
-                icir=icir,
-                max_correlation=max_corr,
-                correlated_with=correlated_with,
-                stage_passed=3,
-                rejection_reason=(
-                    f"Stage 4: full |IC|={ic_abs_mean:.4f} < {self.config.ic_threshold}"
-                ),
-                admitted=False,
-                full_stats=stats,
-            )
-
-        return EvaluationResult(
-            factor_name=c.name,
-            formula=c.formula,
-            ic_series=ic_series,
-            ic_mean=ic_abs_mean,
-            icir=icir,
-            max_correlation=max_corr,
-            correlated_with=correlated_with,
-            stage_passed=4,
-            admitted=True,
-            replaced=replaced,
-            full_stats=stats,
-        )
-
-    def _stage4_parallel(
-        self,
-        candidates: List[CandidateFactor],
-    ) -> List[Tuple[CandidateFactor, EvaluationResult]]:
-        """Run stage 4 in parallel using ProcessPoolExecutor.
-
-        Each worker evaluates one candidate independently. Since signals
-        and returns are numpy arrays, they can be pickled for IPC.
-        """
-        results = []
-        futures_map = {}
-
-        with ProcessPoolExecutor(max_workers=self.config.num_workers) as executor:
-            for c in candidates:
-                future = executor.submit(
-                    _evaluate_single_candidate_ic,
-                    c.signals,
-                    self.returns,
-                )
-                futures_map[future] = c
-
-            for future in as_completed(futures_map):
-                c = futures_map[future]
-                try:
-                    ic_series, ic_abs_mean, icir = future.result()
-
-                    max_corr = c.metadata.get("max_correlation", 0.0)
-                    correlated_with = c.metadata.get("correlated_with")
-
-                    if ic_abs_mean < self.config.ic_threshold:
-                        result = EvaluationResult(
-                            factor_name=c.name,
-                            formula=c.formula,
-                            ic_series=ic_series,
-                            ic_mean=ic_abs_mean,
-                            icir=icir,
-                            max_correlation=max_corr,
-                            correlated_with=correlated_with,
-                            stage_passed=3,
-                            rejection_reason=(
-                                f"Stage 4: full |IC|={ic_abs_mean:.4f} "
-                                f"< {self.config.ic_threshold}"
-                            ),
-                            admitted=False,
-                        )
-                    else:
-                        result = EvaluationResult(
-                            factor_name=c.name,
-                            formula=c.formula,
-                            ic_series=ic_series,
-                            ic_mean=ic_abs_mean,
-                            icir=icir,
-                            max_correlation=max_corr,
-                            correlated_with=correlated_with,
-                            stage_passed=4,
-                            admitted=True,
-                        )
-                    results.append((c, result))
-
-                except Exception as e:
-                    logger.error("Worker failed for %s: %s", c.name, e)
-                    results.append((c, EvaluationResult(
-                        factor_name=c.name,
-                        formula=c.formula,
-                        stage_passed=3,
-                        rejection_reason=f"Stage 4 error: {e}",
-                        admitted=False,
-                    )))
-
-        return results
-
-
-# ---------------------------------------------------------------------------
-# Convenience: Run the full pipeline
-# ---------------------------------------------------------------------------
-
-def run_evaluation_pipeline(
-    candidates: List[CandidateFactor],
-    returns: np.ndarray,
-    library: FactorLibraryView,
-    config: PipelineConfig,
-    compute_signals_fn: Optional[Callable] = None,
-    data: Optional[Dict[str, np.ndarray]] = None,
-) -> List[EvaluationResult]:
-    """One-shot convenience function to run the full evaluation pipeline.
-
-    Parameters
-    ----------
-    candidates : list of CandidateFactor
-    returns : np.ndarray, shape (M, T)
-    library : FactorLibraryView
-    config : PipelineConfig
-    compute_signals_fn : callable, optional
-    data : dict, optional
-
-    Returns
-    -------
-    list of EvaluationResult
-    """
-    pipeline = ValidationPipeline(
-        returns=returns,
-        library=library,
-        config=config,
-        compute_signals_fn=compute_signals_fn,
-        data=data,
-    )
-    return pipeline.evaluate_batch(candidates)
diff --git a/src/factorminer/factorminer/evaluation/portfolio.py b/src/factorminer/factorminer/evaluation/portfolio.py
deleted file mode 100644
index 757ce45..0000000
--- a/src/factorminer/factorminer/evaluation/portfolio.py
+++ /dev/null
@@ -1,266 +0,0 @@
-"""Portfolio construction and quintile backtesting.
-
-Implements quintile-sorted long-short portfolio backtesting with
-transaction cost pressure testing, following the FactorMiner paper methodology.
-"""
-
-from __future__ import annotations
-
-from typing import Dict, List, Optional
-
-import numpy as np
-from scipy.stats import spearmanr
-
-
-class PortfolioBacktester:
-    """Backtest factor signals using quintile portfolios."""
-
-    # ------------------------------------------------------------------
-    # Main backtest
-    # ------------------------------------------------------------------
-
-    def quintile_backtest(
-        self,
-        combined_signal: np.ndarray,
-        returns: np.ndarray,
-        transaction_cost_bps: float = 0,
-    ) -> dict:
-        """Run quintile portfolio backtest.
-
-        At each time step t, sort assets into 5 quintiles by signal strength.
-        Q5 = highest signal (long), Q1 = lowest signal (short).
-
-        Parameters
-        ----------
-        combined_signal : ndarray of shape (T, N)
-            Composite factor signal.
-        returns : ndarray of shape (T, N)
-            Forward returns aligned with the signal.
-        transaction_cost_bps : float
-            One-way transaction cost in basis points (1 bp = 0.01%).
-
-        Returns
-        -------
-        dict with keys:
-            q1_return .. q5_return : float
-                Average annualized return per quintile.
-            ls_return : float
-                Average long-short return (Q5 - Q1).
-            ls_cumulative : ndarray
-                Cumulative long-short return series.
-            ic_mean : float
-            icir : float
-            ic_win_rate : float
-                Fraction of periods with IC > 0.
-            monotonicity : float
-                1.0 if perfect Q1 < Q2 < ... < Q5 ordering of mean returns.
-            avg_turnover : float
-                Mean daily turnover of the long quintile.
-        """
-        combined_signal = np.asarray(combined_signal, dtype=np.float64)
-        returns = np.asarray(returns, dtype=np.float64)
-        T, N = combined_signal.shape
-        cost_frac = transaction_cost_bps / 10000.0
-
-        # Per-period quintile returns
-        quintile_returns = np.full((T, 5), np.nan)
-        for t in range(T):
-            sig_t = combined_signal[t]
-            ret_t = returns[t]
-            valid = np.isfinite(sig_t) & np.isfinite(ret_t)
-            n_valid = valid.sum()
-            if n_valid < 5:
-                continue
-            ranks = _rank_array(sig_t[valid])
-            boundaries = np.linspace(0, 1, 6)
-            for q in range(5):
-                mask = (ranks >= boundaries[q]) & (ranks < boundaries[q + 1])
-                if q == 4:
-                    mask = (ranks >= boundaries[q]) & (ranks <= boundaries[q + 1])
-                if mask.sum() > 0:
-                    quintile_returns[t, q] = np.mean(ret_t[valid][mask])
-
-        # Turnover for cost adjustment
-        turnover = self.compute_turnover(combined_signal, top_fraction=0.2)
-        avg_turnover = float(np.nanmean(turnover))
-
-        # Long-short return (Q5 - Q1) with transaction costs
-        ls_raw = quintile_returns[:, 4] - quintile_returns[:, 0]
-        ls_cost = 2.0 * cost_frac * turnover  # both legs
-        ls_net = np.where(
-            np.isfinite(ls_raw),
-            ls_raw - ls_cost,
-            np.nan,
-        )
-        ls_cumulative = np.nancumsum(np.where(np.isfinite(ls_net), ls_net, 0.0))
-
-        # IC series (cross-sectional Spearman rank correlation)
-        ic_series = np.full(T, np.nan)
-        for t in range(T):
-            sig_t = combined_signal[t]
-            ret_t = returns[t]
-            valid = np.isfinite(sig_t) & np.isfinite(ret_t)
-            if valid.sum() < 5:
-                continue
-            corr, _ = spearmanr(sig_t[valid], ret_t[valid])
-            if np.isfinite(corr):
-                ic_series[t] = corr
-
-        finite_ic = ic_series[np.isfinite(ic_series)]
-        if len(finite_ic) > 1:
-            ic_mean = float(np.mean(finite_ic))
-            ic_std = float(np.std(finite_ic, ddof=1))
-            icir = ic_mean / ic_std if ic_std > 1e-12 else 0.0
-            ic_win_rate = float(np.mean(finite_ic > 0))
-        else:
-            ic_mean = 0.0
-            icir = 0.0
-            ic_win_rate = 0.0
-
-        # Mean quintile returns
-        q_means = [float(np.nanmean(quintile_returns[:, q])) for q in range(5)]
-
-        # Monotonicity: fraction of adjacent quintile pairs in correct order
-        correct_pairs = sum(
-            1 for i in range(4) if q_means[i] < q_means[i + 1]
-        )
-        monotonicity = correct_pairs / 4.0
-
-        return {
-            "q1_return": q_means[0],
-            "q2_return": q_means[1],
-            "q3_return": q_means[2],
-            "q4_return": q_means[3],
-            "q5_return": q_means[4],
-            "ls_return": float(np.nanmean(ls_net)),
-            "ls_gross_return": float(np.nanmean(ls_raw)),
-            "ls_cumulative": ls_cumulative,
-            "ls_gross_series": ls_raw,
-            "ls_net_series": ls_net,
-            "quintile_period_returns": quintile_returns,
-            "turnover_series": turnover,
-            "ic_series": ic_series,
-            "ic_mean": ic_mean,
-            "icir": icir,
-            "ic_win_rate": ic_win_rate,
-            "monotonicity": monotonicity,
-            "avg_turnover": avg_turnover,
-        }
-
-    # ------------------------------------------------------------------
-    # Cost pressure testing
-    # ------------------------------------------------------------------
-
-    def cost_pressure_test(
-        self,
-        combined_signal: np.ndarray,
-        returns: np.ndarray,
-        cost_settings: Optional[List[float]] = None,
-    ) -> Dict[float, dict]:
-        """Run backtest under multiple transaction cost settings (in bps).
-
-        Paper Figure 9: Test at 1, 4, 7, 10, 11 bps.
-
-        Parameters
-        ----------
-        combined_signal : ndarray of shape (T, N)
-        returns : ndarray of shape (T, N)
-        cost_settings : list of float or None
-            Transaction cost levels in basis points.
-            Defaults to [1, 4, 7, 10, 11].
-
-        Returns
-        -------
-        dict mapping cost_bps -> backtest result dict.
-        """
-        if cost_settings is None:
-            cost_settings = [1.0, 4.0, 7.0, 10.0, 11.0]
-
-        results: Dict[float, dict] = {}
-        for cost_bps in cost_settings:
-            results[cost_bps] = self.quintile_backtest(
-                combined_signal, returns, transaction_cost_bps=cost_bps,
-            )
-        return results
-
-    # ------------------------------------------------------------------
-    # Turnover computation
-    # ------------------------------------------------------------------
-
-    def compute_turnover(
-        self,
-        signal: np.ndarray,
-        top_fraction: float = 0.2,
-    ) -> np.ndarray:
-        """Compute daily turnover of the top/bottom quintile portfolios.
-
-        Turnover is defined as the fraction of assets that change between
-        consecutive rebalance periods in the top-quintile portfolio.
-
-        Parameters
-        ----------
-        signal : ndarray of shape (T, N)
-        top_fraction : float
-            Fraction of assets in each quintile (default 0.2 = top 20%).
-
-        Returns
-        -------
-        ndarray of shape (T,)
-            Per-period turnover ratios.  First period is 0.
-        """
-        signal = np.asarray(signal, dtype=np.float64)
-        T, N = signal.shape
-        turnover = np.zeros(T)
-        prev_top: Optional[np.ndarray] = None
-
-        for t in range(T):
-            sig_t = signal[t]
-            valid = np.isfinite(sig_t)
-            n_valid = valid.sum()
-            if n_valid < 5:
-                prev_top = None
-                continue
-
-            k = max(1, int(n_valid * top_fraction))
-            valid_idx = np.where(valid)[0]
-            valid_vals = sig_t[valid_idx]
-            # Indices of top-k assets
-            top_idx = valid_idx[np.argpartition(valid_vals, -k)[-k:]]
-            top_set = np.zeros(N, dtype=bool)
-            top_set[top_idx] = True
-
-            if prev_top is not None:
-                changed = np.sum(top_set != prev_top)
-                turnover[t] = changed / (2.0 * k)  # normalize by portfolio size
-            prev_top = top_set
-
-        return turnover
-
-
-# ------------------------------------------------------------------
-# Module-level helpers
-# ------------------------------------------------------------------
-
-def _rank_array(x: np.ndarray) -> np.ndarray:
-    """Compute percentile ranks in [0, 1] for a 1-D array.
-
-    Ties receive the average rank.
-    """
-    n = len(x)
-    if n == 0:
-        return x.copy()
-    order = x.argsort()
-    ranks = np.empty(n, dtype=np.float64)
-    ranks[order] = np.arange(n, dtype=np.float64)
-    # Handle ties by averaging
-    sorted_x = x[order]
-    i = 0
-    while i < n:
-        j = i
-        while j < n and sorted_x[j] == sorted_x[i]:
-            j += 1
-        avg_rank = (i + j - 1) / 2.0
-        for k in range(i, j):
-            ranks[order[k]] = avg_rank
-        i = j
-    return ranks / max(n - 1, 1)
diff --git a/src/factorminer/factorminer/evaluation/regime.py b/src/factorminer/factorminer/evaluation/regime.py
deleted file mode 100644
index 8c11995..0000000
--- a/src/factorminer/factorminer/evaluation/regime.py
+++ /dev/null
@@ -1,623 +0,0 @@
-"""Regime-aware factor validation.
-
-Classifies market periods into BULL / BEAR / SIDEWAYS regimes using
-rolling return and volatility statistics, then evaluates factor IC
-within each regime to ensure robustness across market conditions.
-"""
-
-from __future__ import annotations
-
-import enum
-from dataclasses import dataclass
-from typing import Dict
-
-import numpy as np
-from scipy.stats import rankdata
-
-
-# ---------------------------------------------------------------------------
-# Regime enum
-# ---------------------------------------------------------------------------
-
-class MarketRegime(enum.Enum):
-    """Market regime labels."""
-
-    BULL = 0
-    BEAR = 1
-    SIDEWAYS = 2
-
-
-# ---------------------------------------------------------------------------
-# Configuration
-# ---------------------------------------------------------------------------
-
-@dataclass
-class RegimeConfig:
-    """Parameters controlling regime detection and per-regime IC validation.
-
-    Attributes
-    ----------
-    enabled : bool
-        Whether regime-aware evaluation is active.
-    lookback_window : int
-        Rolling window length (periods) for mean-return and volatility
-        estimation.
-    bull_return_threshold : float
-        Minimum rolling-mean return to qualify as BULL (when volatility is
-        also below the threshold).
-    bear_return_threshold : float
-        Maximum rolling-mean return to qualify as BEAR.
-    volatility_percentile : float
-        Percentile (0-1) of rolling volatility used to compute the
-        vol_threshold separating low-vol (BULL) from high-vol environments.
-    min_regime_ic : float
-        Minimum mean |IC| required within a single regime for it to "pass".
-    min_regimes_passing : int
-        How many regimes must pass for the factor to be accepted.
-    """
-
-    enabled: bool = True
-    lookback_window: int = 60
-    bull_return_threshold: float = 0.0
-    bear_return_threshold: float = 0.0
-    volatility_percentile: float = 0.7
-    min_regime_ic: float = 0.03
-    min_regimes_passing: int = 2
-
-
-# ---------------------------------------------------------------------------
-# Classification result
-# ---------------------------------------------------------------------------
-
-@dataclass
-class RegimeClassification:
-    """Output of :class:`RegimeDetector.classify`.
-
-    Attributes
-    ----------
-    labels : np.ndarray, shape (T,)
-        Integer regime codes per period (0=BULL, 1=BEAR, 2=SIDEWAYS).
-    periods : Dict[MarketRegime, np.ndarray]
-        Boolean masks of shape (T,) indicating which periods belong to
-        each regime.
-    stats : Dict[MarketRegime, Dict[str, float]]
-        Descriptive statistics per regime: ``mean_return``, ``volatility``,
-        ``n_periods``.
-    """
-
-    labels: np.ndarray
-    periods: Dict[MarketRegime, np.ndarray]
-    stats: Dict[MarketRegime, Dict[str, float]]
-
-
-# ---------------------------------------------------------------------------
-# Regime detector
-# ---------------------------------------------------------------------------
-
-class RegimeDetector:
-    """Classify time periods into market regimes.
-
-    Parameters
-    ----------
-    config : RegimeConfig
-        Regime detection parameters.
-    """
-
-    def __init__(self, config: RegimeConfig | None = None) -> None:
-        self.config = config or RegimeConfig()
-
-    # ----- public API -----
-
-    def classify(self, returns: np.ndarray) -> RegimeClassification:
-        """Classify each period into a market regime.
-
-        Parameters
-        ----------
-        returns : np.ndarray, shape (M, T)
-            Forward returns for *M* assets over *T* periods.
-
-        Returns
-        -------
-        RegimeClassification
-        """
-        cfg = self.config
-        M, T = returns.shape
-
-        # Cross-sectional average return per period (handles NaN)
-        market_return = np.nanmean(returns, axis=0)  # (T,)
-
-        # Rolling statistics
-        rolling_mean = self._rolling_nanmean(market_return, cfg.lookback_window)
-        rolling_vol = self._rolling_nanstd(market_return, cfg.lookback_window)
-
-        # Volatility threshold from valid (non-NaN) rolling vol values
-        valid_vol = rolling_vol[~np.isnan(rolling_vol)]
-        if len(valid_vol) > 0:
-            vol_threshold = float(
-                np.percentile(valid_vol, cfg.volatility_percentile * 100)
-            )
-        else:
-            vol_threshold = np.inf  # fallback: nothing qualifies as low-vol
-
-        # Assign labels
-        labels = np.full(T, MarketRegime.SIDEWAYS.value, dtype=np.int64)
-
-        # BEAR: rolling_return < bear_threshold  (checked first)
-        bear_mask = rolling_mean < cfg.bear_return_threshold
-        labels[bear_mask] = MarketRegime.BEAR.value
-
-        # BULL: rolling_return > bull_threshold AND rolling_vol < vol_threshold
-        bull_mask = (rolling_mean > cfg.bull_return_threshold) & (
-            rolling_vol < vol_threshold
-        )
-        labels[bull_mask] = MarketRegime.BULL.value
-
-        # First lookback_window periods default to SIDEWAYS
-        labels[: cfg.lookback_window] = MarketRegime.SIDEWAYS.value
-
-        # Build boolean masks & stats
-        periods: Dict[MarketRegime, np.ndarray] = {}
-        stats: Dict[MarketRegime, Dict[str, float]] = {}
-
-        for regime in MarketRegime:
-            mask = labels == regime.value
-            periods[regime] = mask
-            regime_returns = market_return[mask]
-            valid = regime_returns[~np.isnan(regime_returns)]
-            stats[regime] = {
-                "mean_return": float(np.mean(valid)) if len(valid) > 0 else 0.0,
-                "volatility": float(np.std(valid, ddof=1)) if len(valid) > 1 else 0.0,
-                "n_periods": int(mask.sum()),
-            }
-
-        return RegimeClassification(labels=labels, periods=periods, stats=stats)
-
-    # ----- helpers -----
-
-    @staticmethod
-    def _rolling_nanmean(arr: np.ndarray, window: int) -> np.ndarray:
-        """Rolling mean that ignores NaN, returning NaN for the first *window-1* entries."""
-        T = len(arr)
-        out = np.full(T, np.nan, dtype=np.float64)
-        for t in range(window - 1, T):
-            chunk = arr[t - window + 1 : t + 1]
-            valid = chunk[~np.isnan(chunk)]
-            if len(valid) > 0:
-                out[t] = float(np.mean(valid))
-        return out
-
-    @staticmethod
-    def _rolling_nanstd(arr: np.ndarray, window: int) -> np.ndarray:
-        """Rolling std (ddof=1) that ignores NaN."""
-        T = len(arr)
-        out = np.full(T, np.nan, dtype=np.float64)
-        for t in range(window - 1, T):
-            chunk = arr[t - window + 1 : t + 1]
-            valid = chunk[~np.isnan(chunk)]
-            if len(valid) > 1:
-                out[t] = float(np.std(valid, ddof=1))
-        return out
-
-
-# ---------------------------------------------------------------------------
-# Per-regime IC result
-# ---------------------------------------------------------------------------
-
-@dataclass
-class RegimeICResult:
-    """Evaluation result for a single factor across market regimes.
-
-    Attributes
-    ----------
-    factor_name : str
-        Human-readable factor identifier.
-    regime_ic : Dict[MarketRegime, float]
-        Mean |IC| per regime.
-    regime_icir : Dict[MarketRegime, float]
-        ICIR per regime.
-    regime_n_periods : Dict[MarketRegime, int]
-        Number of valid IC periods per regime.
-    n_regimes_passing : int
-        How many regimes met the ``min_regime_ic`` threshold.
-    passes : bool
-        Whether ``n_regimes_passing >= config.min_regimes_passing``.
-    overall_regime_score : float
-        Weighted average |IC| across regimes (weighted by n_periods).
-    """
-
-    factor_name: str
-    regime_ic: Dict[MarketRegime, float]
-    regime_icir: Dict[MarketRegime, float]
-    regime_n_periods: Dict[MarketRegime, int]
-    n_regimes_passing: int
-    passes: bool
-    overall_regime_score: float
-
-
-# ---------------------------------------------------------------------------
-# Regime-aware evaluator
-# ---------------------------------------------------------------------------
-
-class RegimeAwareEvaluator:
-    """Evaluate factor IC within each market regime.
-
-    Parameters
-    ----------
-    returns : np.ndarray, shape (M, T)
-        Forward returns.
-    regime : RegimeClassification
-        Pre-computed regime classification.
-    config : RegimeConfig
-        Thresholds and evaluation parameters.
-    """
-
-    def __init__(
-        self,
-        returns: np.ndarray,
-        regime: RegimeClassification,
-        config: RegimeConfig | None = None,
-    ) -> None:
-        self.returns = returns
-        self.regime = regime
-        self.config = config or RegimeConfig()
-
-    # ----- public API -----
-
-    def evaluate(self, factor_name: str, signals: np.ndarray) -> RegimeICResult:
-        """Evaluate a single factor across regimes.
-
-        Parameters
-        ----------
-        factor_name : str
-            Identifier for reporting.
-        signals : np.ndarray, shape (M, T)
-            Factor signal matrix.
-
-        Returns
-        -------
-        RegimeICResult
-        """
-        cfg = self.config
-        min_periods = cfg.lookback_window * 2
-
-        regime_ic: Dict[MarketRegime, float] = {}
-        regime_icir: Dict[MarketRegime, float] = {}
-        regime_n_periods: Dict[MarketRegime, int] = {}
-
-        for regime in MarketRegime:
-            mask = self.regime.periods[regime]
-            n_regime = int(mask.sum())
-
-            if n_regime < min_periods:
-                regime_ic[regime] = 0.0
-                regime_icir[regime] = 0.0
-                regime_n_periods[regime] = n_regime
-                continue
-
-            # Extract time-sliced sub-arrays
-            indices = np.where(mask)[0]
-            sig_sub = signals[:, indices]
-            ret_sub = self.returns[:, indices]
-
-            ic_series = self._compute_ic(sig_sub, ret_sub)
-            valid_ic = ic_series[~np.isnan(ic_series)]
-
-            mean_abs_ic = float(np.mean(np.abs(valid_ic))) if len(valid_ic) > 0 else 0.0
-            icir = self._compute_icir(valid_ic)
-
-            regime_ic[regime] = mean_abs_ic
-            regime_icir[regime] = icir
-            regime_n_periods[regime] = int(len(valid_ic))
-
-        # Count passing regimes
-        n_passing = sum(
-            1
-            for r in MarketRegime
-            if regime_n_periods[r] >= min_periods and regime_ic[r] >= cfg.min_regime_ic
-        )
-
-        # Weighted average IC
-        total_weight = sum(regime_n_periods[r] for r in MarketRegime)
-        if total_weight > 0:
-            overall_score = sum(
-                regime_ic[r] * regime_n_periods[r] for r in MarketRegime
-            ) / total_weight
-        else:
-            overall_score = 0.0
-
-        return RegimeICResult(
-            factor_name=factor_name,
-            regime_ic=regime_ic,
-            regime_icir=regime_icir,
-            regime_n_periods=regime_n_periods,
-            n_regimes_passing=n_passing,
-            passes=n_passing >= cfg.min_regimes_passing,
-            overall_regime_score=overall_score,
-        )
-
-    def evaluate_batch(
-        self,
-        candidates: Dict[str, np.ndarray],
-    ) -> Dict[str, RegimeICResult]:
-        """Evaluate multiple factors.
-
-        Parameters
-        ----------
-        candidates : Dict[str, np.ndarray]
-            Mapping of factor name to signal matrix (M, T).
-
-        Returns
-        -------
-        Dict[str, RegimeICResult]
-        """
-        return {name: self.evaluate(name, sig) for name, sig in candidates.items()}
-
-    # ----- IC helpers (mirrors metrics.py conventions) -----
-
-    @staticmethod
-    def _compute_ic(signals: np.ndarray, returns: np.ndarray) -> np.ndarray:
-        """Cross-sectional Spearman IC per period.
-
-        Replicates the logic in ``metrics.compute_ic`` to keep this module
-        self-contained while matching the project convention.
-        """
-        M, T = signals.shape
-        ic_series = np.full(T, np.nan, dtype=np.float64)
-
-        for t in range(T):
-            s = signals[:, t]
-            r = returns[:, t]
-            valid = ~(np.isnan(s) | np.isnan(r))
-            n = valid.sum()
-            if n < 5:
-                continue
-            rs = rankdata(s[valid])
-            rr = rankdata(r[valid])
-            rs_m = rs - rs.mean()
-            rr_m = rr - rr.mean()
-            denom = np.sqrt((rs_m ** 2).sum() * (rr_m ** 2).sum())
-            if denom < 1e-12:
-                ic_series[t] = 0.0
-            else:
-                ic_series[t] = (rs_m * rr_m).sum() / denom
-
-        return ic_series
-
-    @staticmethod
-    def _compute_icir(valid_ic: np.ndarray) -> float:
-        """ICIR = mean(IC) / std(IC)."""
-        if len(valid_ic) < 3:
-            return 0.0
-        std = float(np.std(valid_ic, ddof=1))
-        if std < 1e-12:
-            return 0.0
-        return float(np.mean(valid_ic)) / std
-
-
-# ---------------------------------------------------------------------------
-# Phase 2: Streaming regime detection (added for HelixFactor)
-# ---------------------------------------------------------------------------
-
-class TrendRegime(enum.Enum):
-    BULL = "bull"
-    BEAR = "bear"
-    NEUTRAL = "neutral"
-
-
-class VolRegime(enum.Enum):
-    HIGH_VOL = "high_vol"
-    LOW_VOL = "low_vol"
-    NORMAL_VOL = "normal_vol"
-
-
-class MeanRevRegime(enum.Enum):
-    TRENDING = "trending"
-    MEAN_REVERTING = "mean_reverting"
-    RANDOM_WALK = "random_walk"
-
-
-@dataclass
-class RegimeState:
-    """Composite regime state: trend + vol + mean-reversion classification."""
-    trend: TrendRegime = TrendRegime.NEUTRAL
-    vol: VolRegime = VolRegime.NORMAL_VOL
-    mean_rev: MeanRevRegime = MeanRevRegime.RANDOM_WALK
-
-    def to_dict(self) -> dict:
-        return {
-            "trend": self.trend.value,
-            "vol": self.vol.value,
-            "mean_rev": self.mean_rev.value,
-        }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "RegimeState":
-        return cls(
-            trend=TrendRegime(d.get("trend", "neutral")),
-            vol=VolRegime(d.get("vol", "normal_vol")),
-            mean_rev=MeanRevRegime(d.get("mean_rev", "random_walk")),
-        )
-
-    def __str__(self) -> str:
-        return f"{self.trend.value}/{self.vol.value}/{self.mean_rev.value}"
-
-    def label(self) -> str:
-        return str(self)
-
-
-@dataclass
-class StreamingRegimeConfig:
-    """Configuration for StreamingRegimeDetector."""
-    fast_alpha: float = 0.1          # EW decay for fast stats
-    slow_alpha: float = 0.02         # EW decay for slow (baseline) stats
-    trend_sigma_threshold: float = 1.0  # sigmas above/below zero for BULL/BEAR
-    vol_high_quantile: float = 0.75  # quantile threshold for HIGH_VOL
-    vol_low_quantile: float = 0.25   # quantile threshold for LOW_VOL
-    hurst_lags: tuple = (2, 4, 8, 16)  # lags for variance-ratio Hurst estimate
-    hmm_smoothing: float = 0.3       # sticky-state weight (0 = no smoothing)
-    history_maxlen: int = 500        # max regime history records
-
-
-class StreamingRegimeDetector:
-    """Bar-by-bar O(1) regime classifier using exponentially-weighted stats.
-
-    Detects three independent regime axes:
-    - Trend: BULL / BEAR / NEUTRAL  (EW mean vs threshold)
-    - Volatility: HIGH / LOW / NORMAL  (EW std vs quantile buffer)
-    - Mean-reversion: TRENDING / MEAN_REVERTING / RANDOM_WALK (Hurst via variance ratio)
-    """
-
-    def __init__(self, config: StreamingRegimeConfig | None = None) -> None:
-        self.config = config or StreamingRegimeConfig()
-        # Exponentially-weighted moments
-        self._ew_mean: float = 0.0
-        self._ew_var: float = 0.0          # fast (for current vol)
-        self._ew_var_slow: float = 0.0     # slow (baseline)
-        self._n: int = 0
-        # Rolling buffers for variance-ratio Hurst
-        self._return_buffer: list = []
-        self._vol_buffer: list = []        # rolling realized vol samples
-        # Regime history
-        from collections import deque
-        self._history: deque = deque(maxlen=self.config.history_maxlen)
-        self._transition_counts: dict = {}
-        self._current: RegimeState = RegimeState()
-        import threading
-        self._lock = threading.RLock()
-
-    # ------------------------------------------------------------------
-    # Public API
-    # ------------------------------------------------------------------
-
-    def update(
-        self,
-        returns: np.ndarray,        # (M,) cross-sectional returns at this bar
-        volumes: np.ndarray | None = None,  # (M,) optional — unused currently
-    ) -> RegimeState:
-        """Process one bar and return updated RegimeState."""
-        with self._lock:
-            r = float(np.nanmean(returns))
-            vol = float(np.nanstd(returns)) if len(returns) > 1 else 0.0
-            self._update_moments(r, vol)
-            new_state = self._classify()
-            self._apply_smoothing(new_state)
-            self._record_transition(self._current, new_state)
-            self._current = new_state
-            self._history.append(new_state)
-            return new_state
-
-    def get_current_regime(self) -> RegimeState:
-        with self._lock:
-            return self._current
-
-    def get_regime_history(self, lookback: int = 20) -> list:
-        with self._lock:
-            hist = list(self._history)
-            return hist[-lookback:] if lookback else hist
-
-    def regime_transition_probability(self) -> dict:
-        """Return dict of 'from/to' → empirical probability."""
-        with self._lock:
-            total = sum(self._transition_counts.values())
-            if total == 0:
-                return {}
-            return {k: v / total for k, v in self._transition_counts.items()}
-
-    def reset(self) -> None:
-        with self._lock:
-            self.__init__(config=self.config)
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    def _update_moments(self, r: float, vol: float) -> None:
-        fa, sa = self.config.fast_alpha, self.config.slow_alpha
-        if self._n == 0:
-            self._ew_mean = r
-            self._ew_var = vol ** 2
-            self._ew_var_slow = vol ** 2
-        else:
-            self._ew_mean = fa * r + (1 - fa) * self._ew_mean
-            self._ew_var = fa * (r - self._ew_mean) ** 2 + (1 - fa) * self._ew_var
-            self._ew_var_slow = sa * vol ** 2 + (1 - sa) * self._ew_var_slow
-        self._n += 1
-        self._return_buffer.append(r)
-        self._vol_buffer.append(vol)
-        if len(self._return_buffer) > 200:
-            self._return_buffer.pop(0)
-            self._vol_buffer.pop(0)
-
-    def _classify(self) -> RegimeState:
-        return RegimeState(
-            trend=self._classify_trend(),
-            vol=self._classify_vol(),
-            mean_rev=self._classify_mean_rev(),
-        )
-
-    def _classify_trend(self) -> TrendRegime:
-        sigma = float(np.sqrt(max(self._ew_var, 1e-16)))
-        n = max(self._n, 1)
-        se = sigma / (n ** 0.5)
-        thresh = self.config.trend_sigma_threshold * se
-        if self._ew_mean > thresh:
-            return TrendRegime.BULL
-        elif self._ew_mean < -thresh:
-            return TrendRegime.BEAR
-        return TrendRegime.NEUTRAL
-
-    def _classify_vol(self) -> VolRegime:
-        if len(self._vol_buffer) < 10:
-            return VolRegime.NORMAL_VOL
-        arr = np.array(self._vol_buffer)
-        cur = float(np.sqrt(max(self._ew_var, 0.0)))
-        hi = float(np.quantile(arr, self.config.vol_high_quantile))
-        lo = float(np.quantile(arr, self.config.vol_low_quantile))
-        if cur > hi:
-            return VolRegime.HIGH_VOL
-        elif cur < lo:
-            return VolRegime.LOW_VOL
-        return VolRegime.NORMAL_VOL
-
-    def _classify_mean_rev(self) -> MeanRevRegime:
-        buf = self._return_buffer
-        if len(buf) < 32:
-            return MeanRevRegime.RANDOM_WALK
-        arr = np.array(buf)
-        lags = [l for l in self.config.hurst_lags if l < len(arr)]
-        if not lags:
-            return MeanRevRegime.RANDOM_WALK
-        ratios = []
-        for lag in lags:
-            var_lag = float(np.var(arr[lag:] - arr[:-lag]))
-            var_1 = float(np.var(np.diff(arr))) if len(arr) > 1 else 1e-16
-            if var_1 > 1e-16:
-                ratios.append(var_lag / (lag * var_1))
-        if not ratios:
-            return MeanRevRegime.RANDOM_WALK
-        hurst_proxy = float(np.mean(ratios))
-        if hurst_proxy > 1.1:
-            return MeanRevRegime.TRENDING
-        elif hurst_proxy < 0.9:
-            return MeanRevRegime.MEAN_REVERTING
-        return MeanRevRegime.RANDOM_WALK
-
-    def _apply_smoothing(self, new_state: RegimeState) -> None:
-        """HMM-inspired: resist single-bar flips via smoothing weight."""
-        w = self.config.hmm_smoothing
-        if w <= 0 or self._current is None:
-            return
-        # If smoothing weight is high and current state differs, revert in-place
-        # (simple sticky-state: only update if change is "strong enough")
-        # We achieve this by probabilistic rejection — deterministic version:
-        # keep current if random draw < smoothing weight (approximate)
-        import random
-        if (new_state.trend != self._current.trend or
-                new_state.vol != self._current.vol):
-            if random.random() < w:
-                new_state.trend = self._current.trend
-                new_state.vol = self._current.vol
-
-    def _record_transition(self, old: RegimeState, new: RegimeState) -> None:
-        key = f"{old.label()}->{new.label()}"
-        self._transition_counts[key] = self._transition_counts.get(key, 0) + 1
diff --git a/src/factorminer/factorminer/evaluation/research.py b/src/factorminer/factorminer/evaluation/research.py
deleted file mode 100644
index e4152bc..0000000
--- a/src/factorminer/factorminer/evaluation/research.py
+++ /dev/null
@@ -1,518 +0,0 @@
-"""Research-first multi-horizon scoring and model evaluation."""
-
-from __future__ import annotations
-
-from dataclasses import asdict, dataclass, field
-from typing import Dict, Iterable, Sequence
-
-import numpy as np
-
-from src.factorminer.factorminer.evaluation.backtest import rolling_splits
-from src.factorminer.factorminer.evaluation.metrics import compute_factor_stats, compute_pairwise_correlation
-from src.factorminer.factorminer.evaluation.portfolio import PortfolioBacktester
-from src.factorminer.factorminer.evaluation.regime import RegimeAwareEvaluator, RegimeConfig, RegimeDetector
-from src.factorminer.factorminer.evaluation.selection import FactorSelector
-from src.factorminer.factorminer.evaluation.significance import BootstrapICTester, SignificanceConfig
-
-
-@dataclass
-class FactorGeometryDiagnostics:
-    """How much new information a factor adds beyond the current library."""
-
-    max_abs_correlation: float = 0.0
-    mean_abs_correlation: float = 0.0
-    projection_loss: float = 0.0
-    marginal_span_gain: float = 1.0
-    effective_rank_gain: float = 1.0
-    residual_ic: float = 0.0
-
-
-@dataclass
-class FactorScoreVector:
-    """Multi-horizon quality summary used in research mode."""
-
-    primary_objective: str
-    primary_score: float
-    lower_confidence_bound: float
-    weighted_score: float
-    decay_slope: float
-    cross_horizon_consistency: float
-    average_turnover: float
-    geometry: FactorGeometryDiagnostics
-    per_horizon_ic_mean: Dict[str, float] = field(default_factory=dict)
-    per_horizon_icir: Dict[str, float] = field(default_factory=dict)
-    per_horizon_shrunk_ic: Dict[str, float] = field(default_factory=dict)
-    per_horizon_se: Dict[str, float] = field(default_factory=dict)
-    per_horizon_lcb: Dict[str, float] = field(default_factory=dict)
-    per_horizon_turnover: Dict[str, float] = field(default_factory=dict)
-    pareto_dominant: bool = True
-
-    def to_dict(self) -> dict:
-        payload = asdict(self)
-        payload["geometry"] = asdict(self.geometry)
-        return payload
-
-
-def compute_factor_geometry(
-    candidate_signals: np.ndarray,
-    returns: np.ndarray,
-    library_signals: Sequence[np.ndarray] | None = None,
-) -> FactorGeometryDiagnostics:
-    """Compute soft library geometry metrics for a candidate."""
-    library_signals = list(library_signals or [])
-    if not library_signals:
-        return FactorGeometryDiagnostics(
-            max_abs_correlation=0.0,
-            mean_abs_correlation=0.0,
-            projection_loss=0.0,
-            marginal_span_gain=1.0,
-            effective_rank_gain=1.0,
-            residual_ic=float(compute_factor_stats(candidate_signals, returns)["ic_abs_mean"]),
-        )
-
-    corrs = [
-        abs(compute_pairwise_correlation(candidate_signals, lib_signal))
-        for lib_signal in library_signals
-    ]
-    flattened_candidate, valid_mask = _flatten_panel(candidate_signals)
-    library_vectors = []
-    for signal in library_signals:
-        flattened_signal, _ = _flatten_panel(signal, valid_mask=valid_mask)
-        library_vectors.append(flattened_signal)
-
-    if not library_vectors:
-        return FactorGeometryDiagnostics(
-            max_abs_correlation=max(corrs, default=0.0),
-            mean_abs_correlation=float(np.mean(corrs)) if corrs else 0.0,
-            residual_ic=float(compute_factor_stats(candidate_signals, returns)["ic_abs_mean"]),
-        )
-
-    design = np.column_stack(library_vectors)
-    response = flattened_candidate
-    if design.size == 0 or response.size == 0 or np.nanstd(response) < 1e-12:
-        projection_loss = 0.0
-        marginal_span_gain = 1.0
-        residual_matrix = candidate_signals
-    else:
-        beta, *_ = np.linalg.lstsq(design, response, rcond=None)
-        fitted = design @ beta
-        residual = response - fitted
-        response_var = float(np.var(response))
-        residual_var = float(np.var(residual))
-        marginal_span_gain = residual_var / response_var if response_var > 1e-12 else 0.0
-        projection_loss = 1.0 - marginal_span_gain
-        residual_matrix = _unflatten_panel(residual, valid_mask, candidate_signals.shape)
-
-    before_rank = _effective_rank(design)
-    after_rank = _effective_rank(np.column_stack([design, response]))
-    residual_ic = float(compute_factor_stats(residual_matrix, returns)["ic_abs_mean"])
-
-    return FactorGeometryDiagnostics(
-        max_abs_correlation=max(corrs, default=0.0),
-        mean_abs_correlation=float(np.mean(corrs)) if corrs else 0.0,
-        projection_loss=float(projection_loss),
-        marginal_span_gain=float(max(marginal_span_gain, 0.0)),
-        effective_rank_gain=float(after_rank - before_rank),
-        residual_ic=residual_ic,
-    )
-
-
-def build_score_vector(
-    target_stats: Dict[str, dict],
-    target_horizons: Dict[str, int],
-    research_cfg,
-    geometry: FactorGeometryDiagnostics,
-) -> FactorScoreVector:
-    """Aggregate per-target metrics into one research-mode score vector."""
-    weights = _normalized_weights(
-        target_stats.keys(),
-        explicit_weights=getattr(research_cfg, "horizon_weights", {}),
-    )
-    uncertainty_cfg = research_cfg.uncertainty
-    admission_cfg = research_cfg.admission
-
-    per_horizon_ic_mean: Dict[str, float] = {}
-    per_horizon_icir: Dict[str, float] = {}
-    per_horizon_shrunk_ic: Dict[str, float] = {}
-    per_horizon_se: Dict[str, float] = {}
-    per_horizon_lcb: Dict[str, float] = {}
-    per_horizon_turnover: Dict[str, float] = {}
-
-    for target_name, stats in target_stats.items():
-        ic_series = np.asarray(stats.get("ic_series", np.array([])), dtype=np.float64)
-        se = _bootstrap_standard_error(ic_series, uncertainty_cfg)
-        ic_abs_mean = float(stats.get("ic_abs_mean", 0.0))
-        shrunk_ic = max(ic_abs_mean - uncertainty_cfg.shrinkage_strength * se, 0.0)
-        lcb = ic_abs_mean - uncertainty_cfg.lcb_zscore * se
-
-        per_horizon_ic_mean[target_name] = float(stats.get("ic_mean", 0.0))
-        per_horizon_icir[target_name] = float(stats.get("icir", 0.0))
-        per_horizon_shrunk_ic[target_name] = float(shrunk_ic)
-        per_horizon_se[target_name] = float(se)
-        per_horizon_lcb[target_name] = float(lcb)
-        per_horizon_turnover[target_name] = float(stats.get("turnover", 0.0))
-
-    weighted_quality = float(
-        sum(weights[name] * per_horizon_shrunk_ic.get(name, 0.0) for name in weights)
-    )
-    average_turnover = float(
-        np.mean(list(per_horizon_turnover.values())) if per_horizon_turnover else 0.0
-    )
-    lower_confidence_bound = float(
-        min(per_horizon_lcb.values()) if per_horizon_lcb else 0.0
-    )
-    redundancy_penalty = admission_cfg.redundancy_penalty * geometry.max_abs_correlation
-    turnover_penalty = admission_cfg.turnover_penalty * average_turnover
-    geometry_bonus = 0.0
-    if admission_cfg.use_residual_ic:
-        geometry_bonus += 0.5 * geometry.residual_ic
-    if admission_cfg.use_effective_rank_gain:
-        geometry_bonus += 0.05 * max(geometry.effective_rank_gain, 0.0)
-
-    weighted_score = weighted_quality - redundancy_penalty - turnover_penalty + geometry_bonus
-    decay_slope = _decay_slope(target_horizons, per_horizon_shrunk_ic)
-    consistency = _cross_horizon_consistency(per_horizon_ic_mean)
-
-    return FactorScoreVector(
-        primary_objective=research_cfg.primary_objective,
-        primary_score=weighted_score,
-        lower_confidence_bound=lower_confidence_bound,
-        weighted_score=weighted_score,
-        decay_slope=decay_slope,
-        cross_horizon_consistency=consistency,
-        average_turnover=average_turnover,
-        geometry=geometry,
-        per_horizon_ic_mean=per_horizon_ic_mean,
-        per_horizon_icir=per_horizon_icir,
-        per_horizon_shrunk_ic=per_horizon_shrunk_ic,
-        per_horizon_se=per_horizon_se,
-        per_horizon_lcb=per_horizon_lcb,
-        per_horizon_turnover=per_horizon_turnover,
-    )
-
-
-def passes_research_admission(
-    score_vector: FactorScoreVector,
-    research_cfg,
-    correlation_threshold: float,
-) -> tuple[bool, str]:
-    """Apply research-mode admission rules on top of paper-style correlation."""
-    admission_cfg = research_cfg.admission
-    if score_vector.primary_score < admission_cfg.min_score:
-        return False, (
-            f"Research score {score_vector.primary_score:.4f} "
-            f"< {admission_cfg.min_score:.4f}"
-        )
-    if score_vector.lower_confidence_bound < admission_cfg.min_lcb:
-        return False, (
-            f"Research LCB {score_vector.lower_confidence_bound:.4f} "
-            f"< {admission_cfg.min_lcb:.4f}"
-        )
-    if score_vector.geometry.max_abs_correlation < correlation_threshold:
-        return True, "Research score passes direct admission"
-    if (
-        admission_cfg.use_residual_ic
-        and score_vector.geometry.residual_ic >= admission_cfg.min_score
-        and score_vector.geometry.marginal_span_gain >= admission_cfg.min_span_gain
-        and (
-            (not admission_cfg.use_effective_rank_gain)
-            or score_vector.geometry.effective_rank_gain >= admission_cfg.min_effective_rank_gain
-        )
-    ):
-        return True, "Research geometry passes residual-span admission"
-    return False, (
-        "Too redundant under research geometry: "
-        f"max|rho|={score_vector.geometry.max_abs_correlation:.4f}, "
-        f"residual_ic={score_vector.geometry.residual_ic:.4f}, "
-        f"span_gain={score_vector.geometry.marginal_span_gain:.4f}"
-    )
-
-
-def run_research_model_suite(
-    factor_signals: Dict[int, np.ndarray],
-    returns: np.ndarray,
-    research_cfg,
-) -> Dict[str, dict]:
-    """Fit research-mode models on rolling windows and report net IR/stability."""
-    if not factor_signals:
-        return {}
-
-    selector = FactorSelector()
-    backtester = PortfolioBacktester()
-    splits = rolling_splits(
-        returns.shape[0],
-        train_window=research_cfg.selection.rolling_train_window,
-        test_window=research_cfg.selection.rolling_test_window,
-        step=research_cfg.selection.rolling_step,
-    )
-    if not splits:
-        return {}
-
-    reports: Dict[str, dict] = {}
-    for model_name in research_cfg.selection.models:
-        fold_reports = []
-        selected_sets = []
-        for split in splits:
-            train_returns = returns[split.train_start:split.train_end]
-            test_returns = returns[split.test_start:split.test_end]
-            train_signals = {
-                fid: signal[split.train_start:split.train_end]
-                for fid, signal in factor_signals.items()
-            }
-            test_signals = {
-                fid: signal[split.test_start:split.test_end]
-                for fid, signal in factor_signals.items()
-            }
-            try:
-                selected, weights = _fit_research_model(
-                    selector,
-                    model_name,
-                    train_signals,
-                    train_returns,
-                )
-            except ImportError as exc:
-                reports[model_name] = {"available": False, "error": str(exc)}
-                fold_reports = []
-                break
-            if not selected:
-                continue
-            selected_sets.append(set(selected))
-            composite = _weighted_composite(test_signals, weights)
-            stats = backtester.quintile_backtest(
-                composite,
-                test_returns,
-                transaction_cost_bps=research_cfg.execution.cost_bps,
-            )
-            regime_report = None
-            if research_cfg.regimes.enabled:
-                regime_report = _composite_regime_report(composite, test_returns)
-            fold_reports.append(
-                {
-                    "selected_ids": selected,
-                    "weights": weights,
-                    "test_ic_mean": float(stats["ic_mean"]),
-                    "test_icir": float(stats["icir"]),
-                    "test_net_ir": _series_ir(stats["ls_net_series"]),
-                    "avg_turnover": float(stats["avg_turnover"]),
-                    "regimes": regime_report,
-                }
-            )
-
-        if not fold_reports:
-            reports.setdefault(model_name, {"available": True, "folds": []})
-            continue
-
-        reports[model_name] = {
-            "available": True,
-            "folds": fold_reports,
-            "mean_test_ic_mean": float(np.mean([fold["test_ic_mean"] for fold in fold_reports])),
-            "mean_test_icir": float(np.mean([fold["test_icir"] for fold in fold_reports])),
-            "mean_test_net_ir": float(np.mean([fold["test_net_ir"] for fold in fold_reports])),
-            "mean_turnover": float(np.mean([fold["avg_turnover"] for fold in fold_reports])),
-            "selection_stability": _selection_stability(selected_sets),
-        }
-
-    return reports
-
-
-def _fit_research_model(
-    selector: FactorSelector,
-    model_name: str,
-    factor_signals: Dict[int, np.ndarray],
-    returns: np.ndarray,
-) -> tuple[list[int], Dict[int, float]]:
-    if model_name == "ridge":
-        from sklearn.linear_model import RidgeCV
-
-        ids, X, y = selector._prepare_panel(factor_signals, returns)  # noqa: SLF001
-        if len(ids) == 0:
-            return [], {}
-        model = RidgeCV(alphas=np.logspace(-4, 2, 12))
-        model.fit(X, y)
-        weights = {ids[idx]: float(coef) for idx, coef in enumerate(model.coef_)}
-        selected = [factor_id for factor_id, weight in weights.items() if abs(weight) > 1e-10]
-        return selected, {factor_id: weights[factor_id] for factor_id in selected}
-
-    if model_name == "elastic_net":
-        from sklearn.linear_model import ElasticNetCV
-
-        ids, X, y = selector._prepare_panel(factor_signals, returns)  # noqa: SLF001
-        if len(ids) == 0:
-            return [], {}
-        model = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], cv=3, max_iter=10000)
-        model.fit(X, y)
-        weights = {ids[idx]: float(coef) for idx, coef in enumerate(model.coef_)}
-        selected = [factor_id for factor_id, weight in weights.items() if abs(weight) > 1e-10]
-        return selected, {factor_id: weights[factor_id] for factor_id in selected}
-
-    if model_name == "lasso":
-        results = selector.lasso_selection(factor_signals, returns)
-        selected = [factor_id for factor_id, _ in results]
-        return selected, {factor_id: score for factor_id, score in results}
-
-    if model_name == "stepwise":
-        results = selector.forward_stepwise(factor_signals, returns)
-        selected = [factor_id for factor_id, _ in results]
-        return selected, {factor_id: 1.0 for factor_id in selected}
-
-    if model_name == "xgboost":
-        results = selector.xgboost_selection(factor_signals, returns)
-        selected = [factor_id for factor_id, _ in results[: max(1, min(10, len(results)))]]
-        return selected, {factor_id: score for factor_id, score in results if factor_id in selected}
-
-    raise ValueError(f"Unknown research model: {model_name}")
-
-
-def _weighted_composite(
-    factor_signals: Dict[int, np.ndarray],
-    weights: Dict[int, float],
-) -> np.ndarray:
-    selected_signals = {fid: factor_signals[fid] for fid in weights if fid in factor_signals}
-    if not selected_signals:
-        raise ValueError("No selected signals available for composite")
-    raw_weights = np.array([abs(weights[fid]) for fid in selected_signals], dtype=np.float64)
-    if raw_weights.sum() < 1e-12:
-        raw_weights = np.ones_like(raw_weights)
-    normalized_weights = raw_weights / raw_weights.sum()
-
-    composite = np.zeros_like(next(iter(selected_signals.values())), dtype=np.float64)
-    for idx, fid in enumerate(selected_signals):
-        signal = selected_signals[fid].astype(np.float64)
-        cs_mean = np.nanmean(signal, axis=1, keepdims=True)
-        cs_std = np.nanstd(signal, axis=1, keepdims=True)
-        cs_std = np.where(cs_std == 0.0, 1.0, cs_std)
-        standardized = (signal - cs_mean) / cs_std
-        composite += normalized_weights[idx] * np.where(np.isnan(standardized), 0.0, standardized)
-    return composite
-
-
-def _bootstrap_standard_error(ic_series: np.ndarray, uncertainty_cfg) -> float:
-    valid = ic_series[np.isfinite(ic_series)]
-    if len(valid) < 3:
-        return 0.0
-    tester = BootstrapICTester(
-        SignificanceConfig(
-            bootstrap_n_samples=uncertainty_cfg.bootstrap_samples,
-            bootstrap_block_size=uncertainty_cfg.block_size,
-            seed=42,
-        )
-    )
-    result = tester.compute_ci("research", valid)
-    return float(result.ic_std_boot)
-
-
-def _normalized_weights(
-    target_names: Iterable[str],
-    explicit_weights: Dict[str, float],
-) -> Dict[str, float]:
-    target_names = list(target_names)
-    if not target_names:
-        return {}
-    if explicit_weights:
-        weights = np.array([max(float(explicit_weights.get(name, 0.0)), 0.0) for name in target_names])
-        if weights.sum() > 1e-12:
-            normalized = weights / weights.sum()
-            return {name: float(normalized[idx]) for idx, name in enumerate(target_names)}
-    equal = 1.0 / len(target_names)
-    return {name: equal for name in target_names}
-
-
-def _decay_slope(target_horizons: Dict[str, int], shrunk_ic: Dict[str, float]) -> float:
-    aligned = [
-        (target_horizons[name], value)
-        for name, value in shrunk_ic.items()
-        if name in target_horizons
-    ]
-    if len(aligned) < 2:
-        return 0.0
-    horizons = np.array([item[0] for item in aligned], dtype=np.float64)
-    scores = np.array([item[1] for item in aligned], dtype=np.float64)
-    if np.std(horizons) < 1e-12:
-        return 0.0
-    slope, _ = np.polyfit(horizons, scores, 1)
-    return float(slope)
-
-
-def _cross_horizon_consistency(per_horizon_ic_mean: Dict[str, float]) -> float:
-    values = [value for value in per_horizon_ic_mean.values() if abs(value) > 1e-12]
-    if not values:
-        return 0.0
-    signs = np.sign(values)
-    majority = np.sign(np.sum(signs))
-    if majority == 0:
-        return 0.0
-    return float(np.mean(signs == majority))
-
-
-def _flatten_panel(panel: np.ndarray, valid_mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]:
-    matrix = np.asarray(panel, dtype=np.float64)
-    if valid_mask is None:
-        valid_mask = np.isfinite(matrix)
-    centered = np.where(valid_mask, matrix, np.nan)
-    cs_mean = np.nanmean(centered, axis=0, keepdims=True)
-    cs_std = np.nanstd(centered, axis=0, keepdims=True)
-    cs_std = np.where(cs_std < 1e-12, 1.0, cs_std)
-    standardized = (centered - cs_mean) / cs_std
-    filled = np.where(np.isfinite(standardized), standardized, 0.0)
-    return filled.reshape(-1), valid_mask
-
-
-def _unflatten_panel(flat: np.ndarray, valid_mask: np.ndarray, shape: tuple[int, int]) -> np.ndarray:
-    matrix = np.full(shape, np.nan, dtype=np.float64)
-    matrix[valid_mask] = flat.reshape(shape)[valid_mask]
-    return matrix
-
-
-def _effective_rank(matrix: np.ndarray) -> float:
-    if matrix.ndim != 2 or min(matrix.shape) == 0:
-        return 0.0
-    cov = matrix.T @ matrix
-    singular_values = np.linalg.svd(cov, compute_uv=False)
-    singular_values = singular_values[singular_values > 1e-12]
-    if len(singular_values) == 0:
-        return 0.0
-    probs = singular_values / singular_values.sum()
-    entropy = -np.sum(probs * np.log(probs))
-    return float(np.exp(entropy))
-
-
-def _selection_stability(selected_sets: Sequence[set[int]]) -> float:
-    if len(selected_sets) < 2:
-        return 1.0 if selected_sets else 0.0
-    overlaps = []
-    for idx in range(len(selected_sets) - 1):
-        left = selected_sets[idx]
-        right = selected_sets[idx + 1]
-        union = left | right
-        overlaps.append(len(left & right) / len(union) if union else 1.0)
-    return float(np.mean(overlaps))
-
-
-def _series_ir(series: np.ndarray) -> float:
-    valid = np.asarray(series, dtype=np.float64)
-    valid = valid[np.isfinite(valid)]
-    if len(valid) < 2:
-        return 0.0
-    std = float(np.std(valid, ddof=1))
-    if std < 1e-12:
-        return 0.0
-    return float(np.mean(valid) / std)
-
-
-def _composite_regime_report(composite: np.ndarray, returns: np.ndarray) -> dict:
-    detector = RegimeDetector(RegimeConfig())
-    classification = detector.classify(returns.T)
-    evaluator = RegimeAwareEvaluator(returns.T, classification, RegimeConfig())
-    regime_result = evaluator.evaluate("composite", composite.T)
-    regime_net_ir = {}
-    backtester = PortfolioBacktester()
-    stats = backtester.quintile_backtest(composite, returns)
-    for regime, mask in classification.periods.items():
-        regime_net_ir[regime.name] = _series_ir(stats["ls_net_series"][mask])
-    return {
-        "regime_score": regime_result.overall_regime_score,
-        "n_regimes_passing": regime_result.n_regimes_passing,
-        "regime_ic": {regime.name: value for regime, value in regime_result.regime_ic.items()},
-        "regime_icir": {regime.name: value for regime, value in regime_result.regime_icir.items()},
-        "regime_net_ir": regime_net_ir,
-    }
diff --git a/src/factorminer/factorminer/evaluation/runtime.py b/src/factorminer/factorminer/evaluation/runtime.py
deleted file mode 100644
index 9024238..0000000
--- a/src/factorminer/factorminer/evaluation/runtime.py
+++ /dev/null
@@ -1,480 +0,0 @@
-"""Shared runtime evaluation helpers for strict factor recomputation."""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Sequence
-
-import numpy as np
-import pandas as pd
-
-from src.factorminer.factorminer.core.factor_library import Factor
-from src.factorminer.factorminer.core.parser import try_parse
-from src.factorminer.factorminer.data.tensor_builder import TargetSpec, compute_targets
-from src.factorminer.factorminer.evaluation.metrics import (
-    compute_factor_stats,
-    compute_pairwise_correlation,
-)
-
-logger = logging.getLogger(__name__)
-
-FEATURE_TO_COLUMN = {
-    "$open": "open",
-    "$high": "high",
-    "$low": "low",
-    "$close": "close",
-    "$volume": "volume",
-    "$amt": "amount",
-    "$vwap": "vwap",
-    "$returns": "returns",
-}
-
-COLUMN_TO_FEATURE = {value: key for key, value in FEATURE_TO_COLUMN.items()}
-
-
-class SignalComputationError(RuntimeError):
-    """Raised when a factor cannot be recomputed under strict policies."""
-
-
-@dataclass
-class DatasetSplit:
-    """One temporal view into the evaluation dataset."""
-
-    name: str
-    indices: np.ndarray
-    timestamps: np.ndarray
-    returns: np.ndarray
-    target_returns: Dict[str, np.ndarray] = field(default_factory=dict)
-    default_target: str = "target"
-
-    @property
-    def size(self) -> int:
-        return int(len(self.indices))
-
-    def get_target(self, name: str | None = None) -> np.ndarray:
-        target_name = name or self.default_target
-        if target_name in self.target_returns:
-            return self.target_returns[target_name]
-        return self.returns
-
-
-@dataclass
-class EvaluationDataset:
-    """Canonical dataset used for analysis commands."""
-
-    data_dict: Dict[str, np.ndarray]
-    data_tensor: np.ndarray
-    returns: np.ndarray
-    timestamps: np.ndarray
-    asset_ids: np.ndarray
-    splits: Dict[str, DatasetSplit]
-    processed_df: pd.DataFrame = field(repr=False)
-    target_panels: Dict[str, np.ndarray] = field(default_factory=dict)
-    target_specs: Dict[str, TargetSpec] = field(default_factory=dict)
-    default_target: str = "target"
-
-    def get_split(self, name: str) -> DatasetSplit:
-        if name not in self.splits:
-            raise KeyError(f"Unknown split: {name}")
-        return self.splits[name]
-
-    def get_target(self, name: str | None = None) -> np.ndarray:
-        target_name = name or self.default_target
-        if target_name in self.target_panels:
-            return self.target_panels[target_name]
-        return self.returns
-
-
-@dataclass
-class FactorEvaluationArtifact:
-    """Recomputed signals and metrics for one factor."""
-
-    factor_id: int
-    name: str
-    formula: str
-    category: str
-    parse_ok: bool
-    signals_full: Optional[np.ndarray] = None
-    split_signals: Dict[str, np.ndarray] = field(default_factory=dict)
-    split_stats: Dict[str, dict] = field(default_factory=dict)
-    target_stats: Dict[str, Dict[str, dict]] = field(default_factory=dict)
-    score_vector: Optional[dict] = None
-    research_metrics: Dict[str, float] = field(default_factory=dict)
-    error: str = ""
-
-    @property
-    def succeeded(self) -> bool:
-        return self.parse_ok and self.signals_full is not None and not self.error
-
-
-def load_runtime_dataset(
-    raw_df: pd.DataFrame,
-    cfg,
-) -> EvaluationDataset:
-    """Load raw market data into a canonical evaluation dataset."""
-    from factorminer.data.preprocessor import preprocess
-    from factorminer.data.tensor_builder import TensorConfig, build_tensor
-
-    raw_df = raw_df.copy()
-    raw_df["datetime"] = pd.to_datetime(raw_df["datetime"])
-
-    target_specs = _resolve_target_specs(cfg)
-    target_df = compute_targets(raw_df, target_specs)
-    target_columns = [spec.column_name for spec in target_specs]
-    merge_columns = ["datetime", "asset_id", *target_columns]
-    processed_df = preprocess(raw_df)
-    processed_df = processed_df.merge(
-        target_df[merge_columns],
-        on=["datetime", "asset_id"],
-        how="left",
-    )
-    processed_df = processed_df.sort_values(["datetime", "asset_id"]).reset_index(drop=True)
-
-    feature_columns = _resolve_feature_columns(getattr(cfg.data, "features", []))
-    tensor_cfg = TensorConfig(
-        features=feature_columns,
-        backend="numpy",
-        dtype="float64",
-        target_columns=target_columns,
-        default_target=_target_column_for_name(cfg.data.default_target, target_specs),
-    )
-    dataset = build_tensor(processed_df, tensor_cfg)
-
-    data_tensor = np.asarray(dataset.data, dtype=np.float64)
-    returns = np.asarray(dataset.target, dtype=np.float64)
-    target_panels = {
-        spec.name: np.asarray(dataset.targets[spec.column_name], dtype=np.float64)
-        for spec in target_specs
-        if spec.column_name in dataset.targets
-    }
-    timestamps = pd.to_datetime(dataset.timestamps).to_numpy()
-    asset_ids = np.asarray(dataset.asset_ids)
-
-    if returns.ndim != 2:
-        raise ValueError("Runtime dataset target must be a 2-D (M, T) array")
-
-    data_dict = {
-        COLUMN_TO_FEATURE[column]: data_tensor[:, :, idx]
-        for idx, column in enumerate(dataset.feature_names)
-        if column in COLUMN_TO_FEATURE
-    }
-
-    splits = {
-        "train": _build_named_split(
-            "train",
-            timestamps,
-            returns,
-            target_panels,
-            cfg.data.default_target,
-            start=cfg.data.train_period[0],
-            end=cfg.data.train_period[1],
-        ),
-        "test": _build_named_split(
-            "test",
-            timestamps,
-            returns,
-            target_panels,
-            cfg.data.default_target,
-            start=cfg.data.test_period[0],
-            end=cfg.data.test_period[1],
-        ),
-        "full": DatasetSplit(
-            name="full",
-            indices=np.arange(len(timestamps)),
-            timestamps=timestamps,
-            returns=returns,
-            target_returns=target_panels,
-            default_target=cfg.data.default_target,
-        ),
-    }
-
-    for split_name in ("train", "test"):
-        if splits[split_name].size == 0:
-            raise ValueError(
-                f"{split_name} split is empty for configured period "
-                f"{getattr(cfg.data, f'{split_name}_period')}"
-            )
-
-    return EvaluationDataset(
-        data_dict=data_dict,
-        data_tensor=data_tensor,
-        returns=returns,
-        timestamps=timestamps,
-        asset_ids=asset_ids,
-        splits=splits,
-        processed_df=processed_df,
-        target_panels=target_panels,
-        target_specs={spec.name: spec for spec in target_specs},
-        default_target=cfg.data.default_target,
-    )
-
-
-def evaluate_factors(
-    factors: Sequence[Factor],
-    dataset: EvaluationDataset,
-    signal_failure_policy: str = "reject",
-    target_name: str | None = None,
-) -> List[FactorEvaluationArtifact]:
-    """Recompute factor signals and metrics across all dataset splits."""
-    artifacts: List[FactorEvaluationArtifact] = []
-    active_target_name = target_name or dataset.default_target
-    active_returns = dataset.get_target(active_target_name)
-
-    for factor in factors:
-        artifact = FactorEvaluationArtifact(
-            factor_id=factor.id,
-            name=factor.name,
-            formula=factor.formula,
-            category=factor.category,
-            parse_ok=False,
-        )
-
-        tree = try_parse(factor.formula)
-        if tree is None:
-            artifact.error = "Parse failure"
-            artifacts.append(artifact)
-            continue
-
-        artifact.parse_ok = True
-
-        try:
-            signals = compute_tree_signals(
-                tree,
-                dataset.data_dict,
-                active_returns.shape,
-                signal_failure_policy=signal_failure_policy,
-            )
-        except Exception as exc:
-            artifact.error = str(exc)
-            artifacts.append(artifact)
-            continue
-
-        if signals is None or np.all(np.isnan(signals)):
-            artifact.error = "Signal computation produced only NaN values"
-            artifacts.append(artifact)
-            continue
-
-        artifact.signals_full = np.asarray(signals, dtype=np.float64)
-
-        for split_name, split in dataset.splits.items():
-            split_signals = artifact.signals_full[:, split.indices]
-            artifact.split_signals[split_name] = split_signals
-            active_split_target = split.get_target(active_target_name)
-            active_stats = compute_factor_stats(split_signals, active_split_target)
-            artifact.split_stats[split_name] = active_stats
-            artifact.target_stats[split_name] = {}
-            for available_target_name, split_target in split.target_returns.items():
-                artifact.target_stats[split_name][available_target_name] = (
-                    active_stats
-                    if available_target_name == active_target_name
-                    else compute_factor_stats(split_signals, split_target)
-                )
-
-        artifacts.append(artifact)
-
-    return artifacts
-
-
-def compute_tree_signals(
-    tree,
-    data_dict: Dict[str, np.ndarray],
-    returns_shape: tuple[int, int],
-    signal_failure_policy: str = "reject",
-) -> np.ndarray:
-    """Evaluate an expression tree under an explicit failure policy."""
-    formula_str = tree.to_string()
-
-    try:
-        signals = tree.evaluate(data_dict)
-    except Exception as exc:
-        return _handle_signal_failure(
-            formula_str=formula_str,
-            returns_shape=returns_shape,
-            signal_failure_policy=signal_failure_policy,
-            cause=exc,
-        )
-
-    if signals is None or np.all(np.isnan(signals)):
-        return _handle_signal_failure(
-            formula_str=formula_str,
-            returns_shape=returns_shape,
-            signal_failure_policy=signal_failure_policy,
-            cause=SignalComputationError("Signal computation produced only NaN values"),
-        )
-
-    return np.asarray(signals, dtype=np.float64)
-
-
-def compute_correlation_matrix(
-    artifacts: Sequence[FactorEvaluationArtifact],
-    split_name: str,
-) -> np.ndarray:
-    """Compute a true pairwise factor correlation matrix on one split."""
-    selected = [a for a in artifacts if a.succeeded]
-    n = len(selected)
-    matrix = np.zeros((n, n), dtype=np.float64)
-
-    for i in range(n):
-        for j in range(i + 1, n):
-            corr = compute_pairwise_correlation(
-                selected[i].split_signals[split_name],
-                selected[j].split_signals[split_name],
-            )
-            matrix[i, j] = corr
-            matrix[j, i] = corr
-
-    return matrix
-
-
-def select_top_k(
-    artifacts: Sequence[FactorEvaluationArtifact],
-    split_name: str,
-    top_k: Optional[int] = None,
-) -> List[FactorEvaluationArtifact]:
-    """Sort succeeded artifacts by split abs-IC and return the top-k subset."""
-    succeeded = [a for a in artifacts if a.succeeded]
-    succeeded.sort(
-        key=lambda artifact: abs(
-            artifact.split_stats[split_name].get("ic_abs_mean", 0.0)
-        ),
-        reverse=True,
-    )
-    if top_k is None or top_k >= len(succeeded):
-        return succeeded
-    return succeeded[:top_k]
-
-
-def summarize_failures(
-    artifacts: Sequence[FactorEvaluationArtifact],
-) -> List[str]:
-    """Return human-readable failure summaries."""
-    return [
-        f"{artifact.name or artifact.factor_id}: {artifact.error}"
-        for artifact in artifacts
-        if not artifact.succeeded
-    ]
-
-
-def resolve_split_for_fit_eval(period: str) -> str:
-    """Map fit/eval CLI period values to runtime split names."""
-    return "full" if period == "both" else period
-
-
-def analysis_split_names(period: str) -> List[str]:
-    """Map analysis CLI period values to one or two runtime split names."""
-    if period == "both":
-        return ["train", "test"]
-    return [period]
-
-
-def _resolve_feature_columns(config_features: Sequence[str]) -> List[str]:
-    if not config_features:
-        return list(COLUMN_TO_FEATURE.keys())
-
-    resolved: List[str] = []
-    for feature in config_features:
-        if feature in FEATURE_TO_COLUMN:
-            resolved.append(FEATURE_TO_COLUMN[feature])
-            continue
-        stripped = feature.lstrip("$")
-        if stripped == "amt":
-            stripped = "amount"
-        resolved.append(stripped)
-    return resolved
-
-
-def _build_named_split(
-    name: str,
-    timestamps: np.ndarray,
-    returns: np.ndarray,
-    target_panels: Dict[str, np.ndarray],
-    default_target: str,
-    start: str,
-    end: str,
-) -> DatasetSplit:
-    ts = pd.to_datetime(timestamps)
-    mask = (ts >= pd.Timestamp(start)) & (ts <= pd.Timestamp(end))
-    indices = np.where(mask)[0]
-    return DatasetSplit(
-        name=name,
-        indices=indices,
-        timestamps=timestamps[indices],
-        returns=returns[:, indices],
-        target_returns={
-            target_name: panel[:, indices]
-            for target_name, panel in target_panels.items()
-        },
-        default_target=default_target,
-    )
-
-
-def _resolve_target_specs(cfg) -> List[TargetSpec]:
-    raw_targets = getattr(cfg.data, "targets", None) or [
-        {
-            "name": "paper",
-            "entry_delay_bars": 1,
-            "holding_bars": 1,
-            "price_pair": "open_to_close",
-            "return_transform": "simple",
-        }
-    ]
-    return [
-        TargetSpec(
-            name=str(target["name"]),
-            entry_delay_bars=int(target.get("entry_delay_bars", 0)),
-            holding_bars=int(target.get("holding_bars", 1)),
-            price_pair=str(target.get("price_pair", "open_to_close")),
-            return_transform=str(target.get("return_transform", "simple")),
-        )
-        for target in raw_targets
-    ]
-
-
-def _target_column_for_name(target_name: str, specs: Sequence[TargetSpec]) -> str:
-    for spec in specs:
-        if spec.name == target_name:
-            return spec.column_name
-    return "target"
-
-
-def _handle_signal_failure(
-    formula_str: str,
-    returns_shape: tuple[int, int],
-    signal_failure_policy: str,
-    cause: Exception,
-) -> np.ndarray:
-    if signal_failure_policy == "raise":
-        raise cause
-
-    if signal_failure_policy == "reject":
-        raise SignalComputationError(
-            f"Expression evaluation failed for '{formula_str}': {cause}"
-        ) from cause
-
-    if signal_failure_policy != "synthetic":
-        raise ValueError(
-            "signal_failure_policy must be one of: reject, synthetic, raise"
-        )
-
-    logger.warning(
-        "Expression evaluation failed for '%s': %s — falling back to synthetic signals",
-        formula_str,
-        cause,
-    )
-    return generate_synthetic_signals(formula_str, returns_shape)
-
-
-def generate_synthetic_signals(
-    formula_str: str,
-    returns_shape: tuple[int, int],
-) -> np.ndarray:
-    """Deterministic pseudo-signals for demo/mock workflows."""
-    m, t = returns_shape
-    seed = hash(formula_str) % (2**31)
-    rng = np.random.RandomState(seed)
-    signals = rng.randn(m, t).astype(np.float64)
-    nan_mask = rng.random((m, t)) < 0.02
-    signals[nan_mask] = np.nan
-    return signals
diff --git a/src/factorminer/factorminer/evaluation/selection.py b/src/factorminer/factorminer/evaluation/selection.py
deleted file mode 100644
index c5af6cd..0000000
--- a/src/factorminer/factorminer/evaluation/selection.py
+++ /dev/null
@@ -1,280 +0,0 @@
-"""Factor selection methods for identifying sparse, high-value subsets.
-
-Implements Lasso (L1), Forward Stepwise, and XGBoost-based selection
-strategies for choosing an optimal subset of factors from the mined library.
-"""
-
-from __future__ import annotations
-
-import logging
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-from scipy.stats import spearmanr
-
-logger = logging.getLogger(__name__)
-
-
-class FactorSelector:
-    """Select optimal factor subsets from the factor library.
-
-    All methods accept factor signals as (T, N) arrays and forward returns
-    as a (T, N) array, then return ordered lists of (factor_id, score) tuples.
-    """
-
-    # ------------------------------------------------------------------
-    # Lasso (L1-regularized) selection
-    # ------------------------------------------------------------------
-
-    def lasso_selection(
-        self,
-        factor_signals: Dict[int, np.ndarray],
-        returns: np.ndarray,
-        alpha: Optional[float] = None,
-    ) -> List[Tuple[int, float]]:
-        """Lasso: L1-regularized linear regression for factor selection.
-
-        Paper: Only 8 factors capture 95% of IC improvement.
-
-        Parameters
-        ----------
-        factor_signals : dict[int, ndarray]
-            Mapping from factor ID to (T, N) signal array.
-        returns : ndarray of shape (T, N)
-            Forward returns aligned with factor signals.
-        alpha : float or None
-            L1 regularization strength.  If None, selected via cross-validation
-            using LassoCV with 5 folds.
-
-        Returns
-        -------
-        list of (factor_id, coefficient)
-            Non-zero factors sorted by absolute coefficient (descending).
-        """
-        from sklearn.linear_model import Lasso, LassoCV
-
-        ids, X, y = self._prepare_panel(factor_signals, returns)
-        if len(ids) == 0:
-            return []
-
-        if alpha is None:
-            model = LassoCV(cv=5, max_iter=10000, n_jobs=-1)
-            model.fit(X, y)
-            alpha = model.alpha_
-            logger.info("LassoCV selected alpha=%.6f", alpha)
-
-        lasso = Lasso(alpha=alpha, max_iter=10000)
-        lasso.fit(X, y)
-
-        results: List[Tuple[int, float]] = []
-        for idx, coef in enumerate(lasso.coef_):
-            if abs(coef) > 1e-10:
-                results.append((ids[idx], float(coef)))
-
-        results.sort(key=lambda x: abs(x[1]), reverse=True)
-        return results
-
-    # ------------------------------------------------------------------
-    # Forward stepwise selection
-    # ------------------------------------------------------------------
-
-    def forward_stepwise(
-        self,
-        factor_signals: Dict[int, np.ndarray],
-        returns: np.ndarray,
-        max_factors: int = 20,
-    ) -> List[Tuple[int, float]]:
-        """Forward Stepwise: greedy selection maximizing combined ICIR.
-
-        Paper: 18 factors, ICIR=1.38.
-
-        At each step, add the factor that yields the largest improvement in
-        ICIR of the equal-weight composite of selected factors.
-
-        Parameters
-        ----------
-        factor_signals : dict[int, ndarray]
-            Mapping from factor ID to (T, N) signal array.
-        returns : ndarray of shape (T, N)
-            Forward returns.
-        max_factors : int
-            Maximum number of factors to select.
-
-        Returns
-        -------
-        list of (factor_id, delta_ICIR)
-            Factors in selection order with the ICIR improvement each contributed.
-        """
-        if not factor_signals:
-            return []
-
-        remaining = set(factor_signals.keys())
-        selected: List[int] = []
-        result: List[Tuple[int, float]] = []
-        current_icir = 0.0
-
-        for _ in range(min(max_factors, len(factor_signals))):
-            best_fid: Optional[int] = None
-            best_icir = current_icir
-            best_delta = 0.0
-
-            for fid in remaining:
-                candidate = selected + [fid]
-                icir = self._composite_icir(factor_signals, candidate, returns)
-                delta = icir - current_icir
-                if icir > best_icir:
-                    best_fid = fid
-                    best_icir = icir
-                    best_delta = delta
-
-            if best_fid is None:
-                break
-
-            selected.append(best_fid)
-            remaining.discard(best_fid)
-            result.append((best_fid, float(best_delta)))
-            current_icir = best_icir
-            logger.info(
-                "Step %d: added factor %d, ICIR=%.4f (+%.4f)",
-                len(selected), best_fid, current_icir, best_delta,
-            )
-
-        return result
-
-    # ------------------------------------------------------------------
-    # XGBoost importance-based selection
-    # ------------------------------------------------------------------
-
-    def xgboost_selection(
-        self,
-        factor_signals: Dict[int, np.ndarray],
-        returns: np.ndarray,
-    ) -> List[Tuple[int, float]]:
-        """XGBoost: gradient boosting for nonlinear factor interactions.
-
-        Paper: Best performance with ICIR=1.49, 92.6% win rate.
-
-        Parameters
-        ----------
-        factor_signals : dict[int, ndarray]
-            Mapping from factor ID to (T, N) signal array.
-        returns : ndarray of shape (T, N)
-            Forward returns.
-
-        Returns
-        -------
-        list of (factor_id, importance)
-            All factors sorted by gain importance (descending).
-        """
-        import xgboost as xgb
-
-        ids, X, y = self._prepare_panel(factor_signals, returns)
-        if len(ids) == 0:
-            return []
-
-        model = xgb.XGBRegressor(
-            n_estimators=200,
-            max_depth=5,
-            learning_rate=0.05,
-            subsample=0.8,
-            colsample_bytree=0.8,
-            reg_alpha=0.1,
-            reg_lambda=1.0,
-            n_jobs=-1,
-            verbosity=0,
-        )
-        model.fit(X, y)
-
-        importance = model.feature_importances_  # gain-based by default
-        results: List[Tuple[int, float]] = [
-            (ids[i], float(importance[i])) for i in range(len(ids))
-        ]
-        results.sort(key=lambda x: x[1], reverse=True)
-        return results
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _prepare_panel(
-        factor_signals: Dict[int, np.ndarray],
-        returns: np.ndarray,
-    ) -> Tuple[List[int], np.ndarray, np.ndarray]:
-        """Flatten panel data to (samples, features) for sklearn-style models.
-
-        Stacks all (T, N) arrays into (T*N, K) feature matrix and (T*N,)
-        target vector, dropping rows with any NaN.
-
-        Returns
-        -------
-        ids : list of int
-            Factor IDs in column order.
-        X : ndarray (n_samples, n_factors)
-        y : ndarray (n_samples,)
-        """
-        if not factor_signals:
-            return [], np.empty((0, 0)), np.empty(0)
-
-        ids = sorted(factor_signals.keys())
-        T, N = next(iter(factor_signals.values())).shape
-        K = len(ids)
-
-        # Build (T*N, K) matrix
-        X = np.column_stack([
-            factor_signals[fid].ravel() for fid in ids
-        ])  # (T*N, K)
-        y = returns.ravel()  # (T*N,)
-
-        # Drop NaN rows
-        valid = np.all(np.isfinite(X), axis=1) & np.isfinite(y)
-        return ids, X[valid], y[valid]
-
-    @staticmethod
-    def _composite_icir(
-        factor_signals: Dict[int, np.ndarray],
-        selected_ids: List[int],
-        returns: np.ndarray,
-    ) -> float:
-        """Compute ICIR of the equal-weight composite of selected factors.
-
-        IC is the cross-sectional Spearman rank correlation between the
-        composite signal and forward returns at each time step.  ICIR is
-        mean(IC) / std(IC).
-
-        Returns 0.0 if computation fails or std is zero.
-        """
-        if not selected_ids:
-            return 0.0
-
-        signals = []
-        for fid in selected_ids:
-            sig = factor_signals[fid].astype(np.float64)
-            cs_mean = np.nanmean(sig, axis=1, keepdims=True)
-            cs_std = np.nanstd(sig, axis=1, keepdims=True)
-            cs_std = np.where(cs_std == 0.0, 1.0, cs_std)
-            signals.append((sig - cs_mean) / cs_std)
-
-        composite = np.nanmean(np.stack(signals, axis=0), axis=0)  # (T, N)
-
-        T = composite.shape[0]
-        ics = np.full(T, np.nan)
-        for t in range(T):
-            x = composite[t]
-            y = returns[t]
-            valid = np.isfinite(x) & np.isfinite(y)
-            if valid.sum() < 5:
-                continue
-            corr, _ = spearmanr(x[valid], y[valid])
-            if np.isfinite(corr):
-                ics[t] = corr
-
-        finite_ics = ics[np.isfinite(ics)]
-        if len(finite_ics) < 2:
-            return 0.0
-
-        ic_std = np.std(finite_ics, ddof=1)
-        if ic_std < 1e-12:
-            return 0.0
-
-        return float(np.mean(finite_ics) / ic_std)
diff --git a/src/factorminer/factorminer/evaluation/significance.py b/src/factorminer/factorminer/evaluation/significance.py
deleted file mode 100644
index 70b2a38..0000000
--- a/src/factorminer/factorminer/evaluation/significance.py
+++ /dev/null
@@ -1,495 +0,0 @@
-"""Statistical significance testing for alpha factors.
-
-Provides block bootstrap confidence intervals, Benjamini-Hochberg FDR
-control, and Deflated Sharpe Ratio (Bailey & López de Prado, 2014) to
-guard against data-snooping and multiple-testing bias in factor research.
-"""
-
-from __future__ import annotations
-
-import math
-from dataclasses import dataclass, field
-from typing import Dict, Optional, Tuple
-
-import numpy as np
-from scipy.stats import norm, skew, kurtosis
-
-
-# ---------------------------------------------------------------------------
-# Configuration
-# ---------------------------------------------------------------------------
-
-@dataclass
-class SignificanceConfig:
-    """Configuration for all significance tests."""
-
-    enabled: bool = True
-    bootstrap_n_samples: int = 1000
-    bootstrap_block_size: int = 20
-    bootstrap_confidence: float = 0.95
-    fdr_level: float = 0.05
-    deflated_sharpe_enabled: bool = True
-    min_deflated_sharpe: float = 0.0
-    seed: int = 42
-
-
-# ---------------------------------------------------------------------------
-# Bootstrap CI
-# ---------------------------------------------------------------------------
-
-@dataclass
-class BootstrapCIResult:
-    """Result of a block bootstrap confidence interval for mean |IC|."""
-
-    factor_name: str
-    ic_mean: float
-    ci_lower: float
-    ci_upper: float
-    ic_std_boot: float
-    ci_excludes_zero: bool
-
-
-class BootstrapICTester:
-    """Block bootstrap tester for IC series significance.
-
-    Uses circular block bootstrap to preserve time-series autocorrelation
-    when constructing confidence intervals for mean |IC|.
-
-    Parameters
-    ----------
-    config : SignificanceConfig
-        Bootstrap parameters (n_samples, block_size, confidence, seed).
-    """
-
-    def __init__(self, config: SignificanceConfig) -> None:
-        self._config = config
-        self._rng = np.random.RandomState(config.seed)
-
-    # ----- public API -----
-
-    def compute_ci(
-        self, factor_name: str, ic_series: np.ndarray
-    ) -> BootstrapCIResult:
-        """Compute block-bootstrap CI for mean |IC|.
-
-        Parameters
-        ----------
-        factor_name : str
-            Human-readable factor identifier.
-        ic_series : np.ndarray, shape (T,)
-            IC time series (NaN entries are dropped before resampling).
-
-        Returns
-        -------
-        BootstrapCIResult
-        """
-        valid = ic_series[~np.isnan(ic_series)]
-        T = len(valid)
-        if T == 0:
-            return BootstrapCIResult(
-                factor_name=factor_name,
-                ic_mean=0.0,
-                ci_lower=0.0,
-                ci_upper=0.0,
-                ic_std_boot=0.0,
-                ci_excludes_zero=False,
-            )
-
-        abs_valid = np.abs(valid)
-        ic_mean = float(np.mean(abs_valid))
-
-        boot_means = self._block_bootstrap_means(abs_valid)
-
-        alpha = 1.0 - self._config.bootstrap_confidence
-        ci_lower = float(np.percentile(boot_means, 100 * alpha / 2))
-        ci_upper = float(np.percentile(boot_means, 100 * (1 - alpha / 2)))
-        ic_std_boot = float(np.std(boot_means, ddof=1))
-
-        return BootstrapCIResult(
-            factor_name=factor_name,
-            ic_mean=ic_mean,
-            ci_lower=ci_lower,
-            ci_upper=ci_upper,
-            ic_std_boot=ic_std_boot,
-            ci_excludes_zero=ci_lower > 0,
-        )
-
-    def compute_p_value(self, ic_series: np.ndarray) -> float:
-        """Estimate a two-sided p-value for non-zero mean IC.
-
-        Uses a sign-flip randomization test on the observed IC series.
-        Under the null of no predictive signal, flipping the sign of each
-        period's IC leaves the distribution unchanged while preserving the
-        magnitude structure of the observed sample.
-        """
-        valid = ic_series[~np.isnan(ic_series)]
-        T = len(valid)
-        if T == 0:
-            return 1.0
-
-        observed = float(abs(np.mean(valid)))
-        if observed < 1e-15:
-            return 1.0
-
-        null_means = np.empty(self._config.bootstrap_n_samples, dtype=np.float64)
-        for i in range(self._config.bootstrap_n_samples):
-            signs = self._rng.choice((-1.0, 1.0), size=T)
-            null_means[i] = abs(float(np.mean(valid * signs)))
-
-        exceedances = int(np.sum(null_means >= observed))
-        return float((exceedances + 1) / (len(null_means) + 1))
-
-    # ----- internals -----
-
-    def _effective_block_size(self, T: int) -> int:
-        """Adaptive block size: min(configured, T // 10), at least 1."""
-        bs = self._config.bootstrap_block_size
-        adaptive = max(T // 10, 1)
-        return min(bs, adaptive)
-
-    def _block_bootstrap_means(self, series: np.ndarray) -> np.ndarray:
-        """Generate bootstrap distribution of the sample mean.
-
-        Parameters
-        ----------
-        series : np.ndarray, shape (T,)
-            Already cleaned (no NaN) series values.
-
-        Returns
-        -------
-        np.ndarray, shape (n_samples,)
-            Bootstrap sample means.
-        """
-        T = len(series)
-        block_size = self._effective_block_size(T)
-        n_blocks = int(math.ceil(T / block_size))
-        n_samples = self._config.bootstrap_n_samples
-
-        boot_means = np.empty(n_samples, dtype=np.float64)
-        max_start = T - block_size  # last valid block start
-
-        for i in range(n_samples):
-            # Sample block start indices with replacement
-            starts = self._rng.randint(0, max_start + 1, size=n_blocks)
-            # Concatenate blocks and truncate to length T
-            indices = np.concatenate(
-                [np.arange(s, s + block_size) for s in starts]
-            )[:T]
-            boot_means[i] = series[indices].mean()
-
-        return boot_means
-
-
-# ---------------------------------------------------------------------------
-# FDR Control (Benjamini-Hochberg)
-# ---------------------------------------------------------------------------
-
-@dataclass
-class FDRResult:
-    """Result of Benjamini-Hochberg FDR correction."""
-
-    raw_p_values: Dict[str, float]
-    adjusted_p_values: Dict[str, float]
-    significant: Dict[str, bool]
-    n_discoveries: int
-    fdr_level: float
-
-
-class FDRController:
-    """Benjamini-Hochberg FDR correction for multiple factor testing.
-
-    Parameters
-    ----------
-    config : SignificanceConfig
-    """
-
-    def __init__(self, config: SignificanceConfig) -> None:
-        self._config = config
-
-    def apply_fdr(self, p_values: Dict[str, float]) -> FDRResult:
-        """Apply Benjamini-Hochberg procedure.
-
-        Parameters
-        ----------
-        p_values : Dict[str, float]
-            Mapping of factor_name -> raw p-value.
-
-        Returns
-        -------
-        FDRResult
-        """
-        if not p_values:
-            return FDRResult(
-                raw_p_values={},
-                adjusted_p_values={},
-                significant={},
-                n_discoveries=0,
-                fdr_level=self._config.fdr_level,
-            )
-
-        names = list(p_values.keys())
-        raw = np.array([p_values[n] for n in names], dtype=np.float64)
-        m = len(raw)
-
-        # Sort ascending
-        order = np.argsort(raw)
-        sorted_raw = raw[order]
-
-        # BH adjusted p-values: p_adj[i] = min(p[i] * m / (i+1), 1.0)
-        adjusted = np.empty(m, dtype=np.float64)
-        for idx in range(m):
-            rank = idx + 1  # 1-indexed rank
-            adjusted[idx] = min(sorted_raw[idx] * m / rank, 1.0)
-
-        # Enforce monotonicity from bottom up
-        for idx in range(m - 2, -1, -1):
-            adjusted[idx] = min(adjusted[idx], adjusted[idx + 1])
-
-        # Map back to original order
-        inv_order = np.empty(m, dtype=int)
-        inv_order[order] = np.arange(m)
-        adjusted_orig = adjusted[inv_order]
-
-        adjusted_dict: Dict[str, float] = {}
-        significant_dict: Dict[str, bool] = {}
-        for i, name in enumerate(names):
-            adjusted_dict[name] = float(adjusted_orig[i])
-            significant_dict[name] = adjusted_orig[i] <= self._config.fdr_level
-
-        return FDRResult(
-            raw_p_values=dict(p_values),
-            adjusted_p_values=adjusted_dict,
-            significant=significant_dict,
-            n_discoveries=sum(significant_dict.values()),
-            fdr_level=self._config.fdr_level,
-        )
-
-    def batch_evaluate(
-        self,
-        ic_series_map: Dict[str, np.ndarray],
-        bootstrap_tester: BootstrapICTester,
-    ) -> FDRResult:
-        """Compute bootstrap p-values for all factors, then apply BH.
-
-        Parameters
-        ----------
-        ic_series_map : Dict[str, np.ndarray]
-            Mapping of factor_name -> IC series (T,).
-        bootstrap_tester : BootstrapICTester
-
-        Returns
-        -------
-        FDRResult
-        """
-        p_values: Dict[str, float] = {}
-        for name, ic_series in ic_series_map.items():
-            p_values[name] = bootstrap_tester.compute_p_value(ic_series)
-        return self.apply_fdr(p_values)
-
-
-# ---------------------------------------------------------------------------
-# Deflated Sharpe Ratio
-# ---------------------------------------------------------------------------
-
-@dataclass
-class DeflatedSharpeResult:
-    """Result of Deflated Sharpe Ratio test."""
-
-    factor_name: str
-    raw_sharpe: float
-    deflated_sharpe: float
-    haircut: float
-    p_value: float
-    n_trials: int
-    passes: bool
-
-
-class DeflatedSharpeCalculator:
-    """Deflated Sharpe Ratio (Bailey & López de Prado, 2014).
-
-    Adjusts the observed Sharpe Ratio for multiple testing by estimating
-    the expected maximum Sharpe under the null hypothesis of zero skill
-    across *n_trials* independent strategies.
-
-    Parameters
-    ----------
-    config : SignificanceConfig
-    """
-
-    _EULER_GAMMA = 0.5772156649015329
-
-    def __init__(self, config: SignificanceConfig) -> None:
-        self._config = config
-
-    def compute(
-        self,
-        factor_name: str,
-        ls_returns: np.ndarray,
-        n_trials: int,
-        annualization_factor: float = 252.0,
-    ) -> DeflatedSharpeResult:
-        """Compute the Deflated Sharpe Ratio for a factor's L/S returns.
-
-        Parameters
-        ----------
-        factor_name : str
-        ls_returns : np.ndarray, shape (T,)
-            Long-short portfolio return series (NaN-free expected).
-        n_trials : int
-            Total number of strategy trials (including this one).
-        annualization_factor : float
-            Trading periods per year (default 252).
-
-        Returns
-        -------
-        DeflatedSharpeResult
-        """
-        valid = ls_returns[~np.isnan(ls_returns)]
-        T = len(valid)
-
-        if T < 10 or n_trials < 1:
-            return DeflatedSharpeResult(
-                factor_name=factor_name,
-                raw_sharpe=0.0,
-                deflated_sharpe=0.0,
-                haircut=0.0,
-                p_value=1.0,
-                n_trials=n_trials,
-                passes=False,
-            )
-
-        # Annualised Sharpe
-        mean_r = float(np.mean(valid))
-        std_r = float(np.std(valid, ddof=1))
-        if std_r < 1e-15:
-            return DeflatedSharpeResult(
-                factor_name=factor_name,
-                raw_sharpe=0.0,
-                deflated_sharpe=0.0,
-                haircut=0.0,
-                p_value=1.0,
-                n_trials=n_trials,
-                passes=False,
-            )
-
-        SR = (mean_r / std_r) * math.sqrt(annualization_factor)
-
-        # Expected maximum SR under the null (Bailey & LdP, 2014)
-        e_max_sr = self._expected_max_sr(n_trials)
-
-        # Higher moments of returns
-        gamma3 = float(skew(valid, bias=False))
-        gamma4 = float(kurtosis(valid, fisher=True, bias=False))  # excess kurtosis
-
-        # Variance correction incorporating skewness and kurtosis
-        var_correction = (1.0 - gamma3 * SR + (gamma4 - 1.0) / 4.0 * SR ** 2) / T
-
-        if var_correction <= 0:
-            deflated_sr = 0.0
-        else:
-            deflated_sr = (SR - e_max_sr) / math.sqrt(var_correction)
-
-        p_value = 1.0 - float(norm.cdf(deflated_sr))
-        haircut = SR - deflated_sr
-
-        passes = (
-            deflated_sr > self._config.min_deflated_sharpe and p_value < 0.05
-        )
-
-        return DeflatedSharpeResult(
-            factor_name=factor_name,
-            raw_sharpe=SR,
-            deflated_sharpe=deflated_sr,
-            haircut=haircut,
-            p_value=p_value,
-            n_trials=n_trials,
-            passes=passes,
-        )
-
-    @classmethod
-    def _expected_max_sr(cls, n_trials: int) -> float:
-        """E[max(SR)] approximation from Bailey & López de Prado (2014).
-
-        E[max(SR)] ~ sqrt(2*ln(N)) * (1 - gamma / (2*ln(N))) + gamma / sqrt(2*ln(N))
-        """
-        if n_trials <= 1:
-            return 0.0
-        log_n = math.log(n_trials)
-        sqrt_2log = math.sqrt(2.0 * log_n)
-        g = cls._EULER_GAMMA
-        return sqrt_2log * (1.0 - g / (2.0 * log_n)) + g / sqrt_2log
-
-
-# ---------------------------------------------------------------------------
-# Convenience entry point
-# ---------------------------------------------------------------------------
-
-def check_significance(
-    factor_name: str,
-    ic_series: np.ndarray,
-    ls_returns: np.ndarray,
-    n_total_trials: int,
-    config: Optional[SignificanceConfig] = None,
-) -> Tuple[bool, Optional[str], Dict]:
-    """Run all significance checks on a single factor.
-
-    Executes bootstrap CI, bootstrap p-value, and (optionally) the
-    Deflated Sharpe Ratio test.  Returns an overall pass/fail verdict
-    with a human-readable rejection reason.
-
-    Parameters
-    ----------
-    factor_name : str
-    ic_series : np.ndarray, shape (T,)
-    ls_returns : np.ndarray, shape (T,)
-    n_total_trials : int
-        Total number of factor trials (for DSR correction).
-    config : SignificanceConfig, optional
-        If *None*, defaults are used.
-
-    Returns
-    -------
-    Tuple[bool, Optional[str], Dict]
-        (passes, rejection_reason, details)
-        *passes* is True when all enabled tests succeed.
-        *rejection_reason* is None when *passes* is True.
-        *details* contains per-test result objects.
-    """
-    if config is None:
-        config = SignificanceConfig()
-
-    if not config.enabled:
-        return True, None, {"skipped": True}
-
-    details: Dict = {}
-
-    # -- Bootstrap IC CI / p-value --
-    bt = BootstrapICTester(config)
-    ci_result = bt.compute_ci(factor_name, ic_series)
-    details["bootstrap_ci"] = ci_result
-    p_value = bt.compute_p_value(ic_series)
-    details["bootstrap_p_value"] = p_value
-
-    if p_value > config.fdr_level:
-        return (
-            False,
-            f"Bootstrap p-value {p_value:.4f} exceeds alpha {config.fdr_level:.4f}",
-            details,
-        )
-
-    # -- Deflated Sharpe Ratio --
-    if config.deflated_sharpe_enabled:
-        dsr = DeflatedSharpeCalculator(config)
-        dsr_result = dsr.compute(factor_name, ls_returns, n_total_trials)
-        details["deflated_sharpe"] = dsr_result
-
-        if not dsr_result.passes:
-            return (
-                False,
-                f"Deflated Sharpe test failed: DSR={dsr_result.deflated_sharpe:.3f}, "
-                f"p={dsr_result.p_value:.4f}, haircut={dsr_result.haircut:.3f} "
-                f"(n_trials={n_total_trials})",
-                details,
-            )
-
-    return True, None, details
diff --git a/src/factorminer/factorminer/evaluation/transaction_costs.py b/src/factorminer/factorminer/evaluation/transaction_costs.py
deleted file mode 100644
index b13a6f1..0000000
--- a/src/factorminer/factorminer/evaluation/transaction_costs.py
+++ /dev/null
@@ -1,539 +0,0 @@
-"""Transaction cost models for realistic P&L computation.
-
-Implements the Almgren-Chriss (2001) market impact framework, bid-ask
-slippage, commissions, and A-share specific taxes. All costs are expressed
-in basis points (bps) unless explicitly noted.
-
-References
-----------
-Almgren, R. & Chriss, N. (2001). Optimal execution of portfolio transactions.
-    Journal of Risk, 3(2), 5-39.
-Kissell, R. (2013). The Science of Algorithmic Trading and Portfolio Management.
-    Academic Press.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional
-
-import numpy as np
-
-
-# ---------------------------------------------------------------------------
-# Result containers
-# ---------------------------------------------------------------------------
-
-@dataclass
-class TradingCosts:
-    """Aggregated transaction costs for a single rebalance event.
-
-    All monetary values are in basis points (1 bps = 0.01%) of notional.
-
-    Attributes
-    ----------
-    market_impact_bps : float
-        Almgren-Chriss permanent + temporary impact, portfolio-level weighted
-        average in bps.
-    slippage_bps : float
-        Bid-ask spread crossing cost (half-spread * urgency) in bps.
-    commission_bps : float
-        Broker commission in bps (round-trip, both legs included).
-    stamp_duty_bps : float
-        Stamp duty levied on the sell leg only (A-shares: 1 bps).
-    total_bps : float
-        Sum of all cost components.
-    turnover : float
-        Fraction of portfolio traded this rebalance, in [0, 2].
-        0 = no trading, 1 = full one-way rebalance, 2 = full round-trip.
-    details : dict
-        Per-asset breakdown and intermediate quantities.
-    """
-
-    market_impact_bps: float
-    slippage_bps: float
-    commission_bps: float
-    stamp_duty_bps: float
-    total_bps: float
-    turnover: float
-    details: Dict = field(default_factory=dict)
-
-
-# ---------------------------------------------------------------------------
-# Market impact model (Almgren-Chriss)
-# ---------------------------------------------------------------------------
-
-class MarketImpactModel:
-    """Almgren-Chriss (2001) market impact model.
-
-    The model decomposes total market impact into:
-
-    Permanent impact (price pressure lasting beyond the trade window)::
-
-        g(v) = lambda_perm * sigma * sign(v) * |v/ADV|^alpha
-
-    where alpha = 0.5 (square-root law, empirically supported for equities).
-
-    Temporary impact (within-trade reversion; instantaneous cost)::
-
-        h(v) = eta_temp * sigma * (v / ADV)
-
-    Both components are expressed as fractions of price.  Multiply by 1e4
-    to convert to basis points.
-
-    Parameters
-    ----------
-    lambda_perm : float
-        Permanent impact coefficient.  Default 0.1 (market-standard calibration
-        for liquid A-shares; Almgren et al. 2005, "Direct Estimation of Equity
-        Market Impact").
-    eta_temp : float
-        Temporary impact coefficient.  Default 0.01.
-    alpha : float
-        Power-law exponent for permanent impact.  Default 0.5 (square-root).
-    """
-
-    def __init__(
-        self,
-        lambda_perm: float = 0.1,
-        eta_temp: float = 0.01,
-        alpha: float = 0.5,
-    ) -> None:
-        if lambda_perm < 0:
-            raise ValueError("lambda_perm must be >= 0")
-        if eta_temp < 0:
-            raise ValueError("eta_temp must be >= 0")
-        if not (0.0 < alpha <= 1.0):
-            raise ValueError("alpha must be in (0, 1]")
-
-        self.lambda_perm = float(lambda_perm)
-        self.eta_temp = float(eta_temp)
-        self.alpha = float(alpha)
-
-    # ------------------------------------------------------------------
-    def compute_impact(
-        self,
-        trade_size: np.ndarray,
-        adv: np.ndarray,
-        volatility: np.ndarray,
-        direction: np.ndarray,
-    ) -> np.ndarray:
-        """Compute total Almgren-Chriss impact for a batch of trades.
-
-        Parameters
-        ----------
-        trade_size : ndarray of shape (M,)
-            Absolute trade size for each asset (shares or notional units).
-        adv : ndarray of shape (M,)
-            Average daily volume (same units as trade_size).
-        volatility : ndarray of shape (M,)
-            Per-asset annualized volatility (e.g., 0.25 = 25%).
-        direction : ndarray of shape (M,)
-            +1.0 for buys, -1.0 for sells.  Absolute values used for
-            magnitude; sign determines direction for permanent component.
-
-        Returns
-        -------
-        ndarray of shape (M,)
-            Total market impact in basis points per asset.  Zero for assets
-            with zero trade size or zero ADV.
-        """
-        trade_size = np.asarray(trade_size, dtype=np.float64)
-        adv = np.asarray(adv, dtype=np.float64)
-        volatility = np.asarray(volatility, dtype=np.float64)
-        direction = np.asarray(direction, dtype=np.float64)
-
-        # Participation rate: what fraction of ADV are we trading
-        participation = np.where(adv > 0, trade_size / adv, 0.0)
-
-        # Permanent impact: lambda * sigma * participation^alpha
-        # Ref: Almgren (2001) eq. (2.4) – permanent impact g(v) = lambda * sigma * |x|^alpha
-        permanent = (
-            self.lambda_perm
-            * volatility
-            * np.sign(direction)
-            * np.power(np.abs(participation), self.alpha)
-        )
-
-        # Temporary impact: eta * sigma * participation
-        # Ref: Almgren (2001) eq. (2.5) – temporary impact h(v) = eta * v/ADV
-        temporary = self.eta_temp * volatility * np.abs(participation)
-
-        # Total impact (fractional), convert to bps
-        total_bps = (np.abs(permanent) + temporary) * 1e4
-
-        return total_bps
-
-
-# ---------------------------------------------------------------------------
-# Slippage model (bid-ask spread)
-# ---------------------------------------------------------------------------
-
-class SlippageModel:
-    """Bid-ask spread slippage model.
-
-    Each trade crosses the bid-ask spread.  For a patient order (urgency=0)
-    we assume the order rests on the book and earns half-spread; for an
-    aggressive market order (urgency=1) we pay the full spread.  In practice,
-    intraday algo execution sits around urgency=0.5.
-
-    Per-trade slippage cost::
-
-        slippage = spread_bps * urgency
-
-    For a round-trip (buy + sell) this doubles.  The caller is responsible
-    for applying round-trip scaling.
-
-    Default spreads for 10-min A-share bars:
-        - Liquid large-caps (CSI 300): 2-3 bps
-        - Mid-cap (CSI 500): 3-5 bps
-        - Small-cap: 5-10 bps
-    """
-
-    def __init__(self, default_spread_bps: float = 3.0) -> None:
-        """
-        Parameters
-        ----------
-        default_spread_bps : float
-            Fallback spread used when per-asset spreads are not supplied.
-        """
-        self.default_spread_bps = float(default_spread_bps)
-
-    # ------------------------------------------------------------------
-    def compute_slippage(
-        self,
-        trade_size: np.ndarray,
-        spread_bps: Optional[np.ndarray] = None,
-        urgency: float = 0.5,
-    ) -> np.ndarray:
-        """Compute one-way slippage for a set of trades.
-
-        Parameters
-        ----------
-        trade_size : ndarray of shape (M,)
-            Trade sizes (used to identify which assets are actively traded;
-            zero-size trades incur no slippage).
-        spread_bps : ndarray of shape (M,) or None
-            Per-asset effective bid-ask spread in bps.  Falls back to
-            ``default_spread_bps`` if None.
-        urgency : float
-            Urgency scalar in [0, 1].  0 = fully patient (resting orders),
-            1 = aggressive (market orders).  Default 0.5.
-
-        Returns
-        -------
-        ndarray of shape (M,)
-            One-way slippage cost in bps per asset.
-        """
-        if not (0.0 <= urgency <= 1.0):
-            raise ValueError("urgency must be in [0, 1]")
-
-        trade_size = np.asarray(trade_size, dtype=np.float64)
-        M = len(trade_size)
-
-        if spread_bps is None:
-            sp = np.full(M, self.default_spread_bps)
-        else:
-            sp = np.asarray(spread_bps, dtype=np.float64)
-
-        # Only traded assets incur slippage
-        traded = trade_size > 0
-        cost = np.where(traded, sp * urgency, 0.0)
-
-        return cost
-
-
-# ---------------------------------------------------------------------------
-# Aggregated transaction cost calculator
-# ---------------------------------------------------------------------------
-
-class TransactionCostCalculator:
-    """Aggregate all transaction cost components for a portfolio rebalance.
-
-    Components modelled
-    -------------------
-    1. **Market impact** – Almgren-Chriss permanent + temporary impact.
-    2. **Bid-ask slippage** – half-spread crossing cost.
-    3. **Commission** – fixed per-side brokerage fee.
-    4. **Stamp duty** – sell-side stamp duty (A-shares only).
-    5. **Financing cost** – overnight leverage cost (when applicable).
-
-    Market defaults (A-shares, 10-min bars)
-    ----------------------------------------
-    * Commission: 2 bps per side (4 bps round-trip for institutional).
-    * Stamp duty: 1 bps on the sell side only.
-    * Spread: 3 bps (CSI 500 universe average).
-    * All-in round-trip cost at modest size: ~8 bps (consistent with
-      HelixFactor benchmark config ``benchmark.cost_bps`` sweep).
-
-    Crypto defaults
-    ---------------
-    * Commission: 0.5 bps maker / 1.5 bps taker → 2 bps per side.
-    * No stamp duty.
-    * Spread: 1-2 bps for top-20 pairs.
-
-    Parameters
-    ----------
-    impact_model : MarketImpactModel, optional
-        Custom Almgren-Chriss model.  Defaults to standard parameterisation.
-    slippage_model : SlippageModel, optional
-        Custom slippage model.  Defaults to standard parameterisation.
-    commission_bps : float
-        One-way broker commission in bps.  Default 2 bps.
-    stamp_duty_bps : float
-        Sell-side stamp duty in bps.  Default 1 bps (A-shares).
-    overnight_rate_annual : float
-        Annualised financing rate for leveraged positions.  Default 0.0
-        (no leverage).
-    bars_per_year : float
-        Number of bars per year used to convert overnight rate to per-bar.
-        Default 252 * 24 = 6048 (10-min bars, 4-hour A-share session).
-    """
-
-    def __init__(
-        self,
-        impact_model: Optional[MarketImpactModel] = None,
-        slippage_model: Optional[SlippageModel] = None,
-        commission_bps: float = 2.0,
-        stamp_duty_bps: float = 1.0,
-        overnight_rate_annual: float = 0.0,
-        bars_per_year: float = 252.0 * 24.0,
-    ) -> None:
-        self.impact_model = impact_model or MarketImpactModel()
-        self.slippage_model = slippage_model or SlippageModel()
-        self.commission_bps = float(commission_bps)
-        self.stamp_duty_bps = float(stamp_duty_bps)
-        self.overnight_rate_annual = float(overnight_rate_annual)
-        self.bars_per_year = float(bars_per_year)
-
-    # ------------------------------------------------------------------
-    def compute_total_cost(
-        self,
-        old_weights: np.ndarray,
-        new_weights: np.ndarray,
-        adv: np.ndarray,
-        volatility: np.ndarray,
-        portfolio_value: float,
-        market: str = 'ashare',
-        spread_bps: Optional[np.ndarray] = None,
-        urgency: float = 0.5,
-    ) -> TradingCosts:
-        """Compute all-in transaction costs for a single rebalance event.
-
-        The portfolio transitions from ``old_weights`` to ``new_weights``.
-        Weights are signed: positive = long, negative = short.  Their sum
-        need not be 1 (allows cash + leverage).
-
-        Parameters
-        ----------
-        old_weights : ndarray of shape (M,)
-            Current (pre-trade) portfolio weights per asset.
-        new_weights : ndarray of shape (M,)
-            Target (post-trade) portfolio weights per asset.
-        adv : ndarray of shape (M,)
-            Average daily volume per asset in notional (same currency as
-            ``portfolio_value``).
-        volatility : ndarray of shape (M,)
-            Per-asset annualized volatility (e.g. 0.30 = 30%).
-        portfolio_value : float
-            Total portfolio NAV in notional currency.
-        market : str
-            ``'ashare'`` or ``'crypto'``.  Controls stamp duty defaults.
-        spread_bps : ndarray of shape (M,), optional
-            Per-asset bid-ask spread in bps.  Falls back to model default.
-        urgency : float
-            Execution urgency in [0, 1].
-
-        Returns
-        -------
-        TradingCosts
-            Fully decomposed cost object.
-        """
-        old_weights = np.asarray(old_weights, dtype=np.float64)
-        new_weights = np.asarray(new_weights, dtype=np.float64)
-        adv = np.asarray(adv, dtype=np.float64)
-        volatility = np.asarray(volatility, dtype=np.float64)
-
-        # Weight deltas and trade notional
-        delta_weights = new_weights - old_weights                     # signed
-        trade_notional = np.abs(delta_weights) * portfolio_value      # always >= 0
-        trade_direction = np.sign(delta_weights)                      # +1 buy, -1 sell
-
-        # One-way turnover: sum of absolute weight changes, divided by 2 to
-        # avoid double-counting buys and sells for a fully-funded portfolio.
-        # Convention: turnover in [0, 1] for a single rebalance (0=no trade,
-        # 1=100% of portfolio turned over on one side).
-        turnover = float(np.sum(np.abs(delta_weights)) / 2.0)
-
-        # ----------------------------------------------------------------
-        # 1. Market impact (Almgren-Chriss)
-        # ----------------------------------------------------------------
-        impact_bps_per_asset = self.impact_model.compute_impact(
-            trade_size=trade_notional,
-            adv=adv,
-            volatility=volatility,
-            direction=trade_direction,
-        )
-        # Portfolio-level impact = notional-weighted average across traded assets
-        total_trade_notional = float(np.sum(trade_notional))
-        if total_trade_notional > 1e-12:
-            impact_bps = float(
-                np.sum(impact_bps_per_asset * trade_notional) / total_trade_notional
-            )
-        else:
-            impact_bps = 0.0
-
-        # ----------------------------------------------------------------
-        # 2. Bid-ask slippage
-        # ----------------------------------------------------------------
-        slippage_bps_per_asset = self.slippage_model.compute_slippage(
-            trade_size=trade_notional,
-            spread_bps=spread_bps,
-            urgency=urgency,
-        )
-        if total_trade_notional > 1e-12:
-            slippage_bps = float(
-                np.sum(slippage_bps_per_asset * trade_notional) / total_trade_notional
-            )
-        else:
-            slippage_bps = 0.0
-
-        # ----------------------------------------------------------------
-        # 3. Commission (both buy and sell legs)
-        # ----------------------------------------------------------------
-        # Commission applies to both sides of each trade (enter + exit).
-        # Here we apply it per side (once now + once on exit = round-trip).
-        # For a rebalance we pay commission on the traded notional.
-        commission_bps = self.commission_bps  # per-side, applied to traded notional
-
-        # ----------------------------------------------------------------
-        # 4. Stamp duty (sell side only)
-        # ----------------------------------------------------------------
-        effective_stamp = 0.0
-        if market == 'ashare':
-            # Identify sell trades: delta_weight < 0 (reducing long) or
-            # delta_weight > 0 but old position was short (increasing short sell).
-            # Simplified: stamp duty on any reduction of long exposure.
-            sell_notional = np.sum(trade_notional[delta_weights < 0])
-            if total_trade_notional > 1e-12:
-                sell_fraction = sell_notional / total_trade_notional
-                effective_stamp = self.stamp_duty_bps * sell_fraction
-        # crypto: no stamp duty
-
-        # ----------------------------------------------------------------
-        # 5. Financing cost (one bar's worth of overnight carry)
-        # ----------------------------------------------------------------
-        # Per-bar financing cost on leveraged portion.  For a bar-length h:
-        #   financing_cost = leverage * overnight_rate_annual / bars_per_year
-        if self.overnight_rate_annual > 0:
-            leverage = max(
-                float(np.sum(np.abs(new_weights))) - 1.0, 0.0
-            )  # excess over 1x
-            financing_bps = (
-                leverage
-                * self.overnight_rate_annual
-                / self.bars_per_year
-                * 1e4
-            )
-        else:
-            financing_bps = 0.0
-
-        # ----------------------------------------------------------------
-        # Aggregate
-        # ----------------------------------------------------------------
-        total_bps = (
-            impact_bps
-            + slippage_bps
-            + commission_bps
-            + effective_stamp
-            + financing_bps
-        )
-
-        details = {
-            "impact_bps_per_asset": impact_bps_per_asset,
-            "slippage_bps_per_asset": slippage_bps_per_asset,
-            "delta_weights": delta_weights,
-            "trade_notional": trade_notional,
-            "financing_bps": financing_bps,
-            "sell_stamp_bps": effective_stamp,
-        }
-
-        return TradingCosts(
-            market_impact_bps=impact_bps,
-            slippage_bps=slippage_bps,
-            commission_bps=commission_bps,
-            stamp_duty_bps=effective_stamp,
-            total_bps=total_bps,
-            turnover=turnover,
-            details=details,
-        )
-
-    # ------------------------------------------------------------------
-    @classmethod
-    def for_ashare(
-        cls,
-        lambda_perm: float = 0.1,
-        eta_temp: float = 0.01,
-        commission_bps: float = 2.0,
-        stamp_duty_bps: float = 1.0,
-        default_spread_bps: float = 3.0,
-    ) -> "TransactionCostCalculator":
-        """Convenience constructor with A-share defaults.
-
-        All-in round-trip cost at low turnover ≈ 7-9 bps, consistent with
-        the HelixFactor benchmark sweep range.
-
-        Parameters
-        ----------
-        lambda_perm : float
-            Permanent impact coefficient (default 0.1).
-        eta_temp : float
-            Temporary impact coefficient (default 0.01).
-        commission_bps : float
-            One-way commission (default 2 bps).
-        stamp_duty_bps : float
-            Sell-side stamp duty (default 1 bps; CSRC mandated since 2023).
-        default_spread_bps : float
-            Default spread for assets without explicit spread data.
-        """
-        return cls(
-            impact_model=MarketImpactModel(
-                lambda_perm=lambda_perm,
-                eta_temp=eta_temp,
-            ),
-            slippage_model=SlippageModel(default_spread_bps=default_spread_bps),
-            commission_bps=commission_bps,
-            stamp_duty_bps=stamp_duty_bps,
-        )
-
-    @classmethod
-    def for_crypto(
-        cls,
-        lambda_perm: float = 0.05,
-        eta_temp: float = 0.005,
-        commission_bps: float = 1.0,
-        default_spread_bps: float = 1.5,
-    ) -> "TransactionCostCalculator":
-        """Convenience constructor with crypto exchange defaults.
-
-        Parameters
-        ----------
-        lambda_perm : float
-            Permanent impact coefficient.  Lower than A-shares due to
-            continuous 24/7 liquidity provision.
-        eta_temp : float
-            Temporary impact coefficient.
-        commission_bps : float
-            Maker/taker blended commission (default 1 bps).
-        default_spread_bps : float
-            Default effective spread for top-20 pairs (default 1.5 bps).
-        """
-        return cls(
-            impact_model=MarketImpactModel(
-                lambda_perm=lambda_perm,
-                eta_temp=eta_temp,
-            ),
-            slippage_model=SlippageModel(default_spread_bps=default_spread_bps),
-            commission_bps=commission_bps,
-            stamp_duty_bps=0.0,   # no stamp duty on crypto
-        )
diff --git a/src/factorminer/factorminer/memory/__init__.py b/src/factorminer/factorminer/memory/__init__.py
deleted file mode 100644
index 0dfe551..0000000
--- a/src/factorminer/factorminer/memory/__init__.py
+++ /dev/null
@@ -1,84 +0,0 @@
-"""Experience memory system for mining loop feedback.
-
-Implements the memory M = {S, P_succ, P_fail, I} with operators:
-- F(M, tau): Memory Formation - extract experience from mining trajectory
-- E(M, M_form): Memory Evolution - consolidate and prune memory
-- R(M, L): Memory Retrieval - context-dependent retrieval for LLM prompts
-
-Phase 2 additions:
-- Knowledge Graph: factor lineage and structural analysis
-- Embeddings: semantic formula similarity and deduplication
-- Enhanced Retrieval: KG + embedding augmented retrieval
-"""
-
-from src.factorminer.factorminer.memory.memory_store import (
-    ExperienceMemory,
-    ForbiddenDirection,
-    MiningState,
-    StrategicInsight,
-    SuccessPattern,
-)
-from src.factorminer.factorminer.memory.formation import form_memory
-from src.factorminer.factorminer.memory.evolution import evolve_memory
-from src.factorminer.factorminer.memory.retrieval import retrieve_memory
-from src.factorminer.factorminer.memory.experience_memory import ExperienceMemoryManager
-
-# Phase 2: Optional imports (graceful if dependencies missing)
-try:
-    from factorminer.memory.knowledge_graph import (
-        FactorKnowledgeGraph,
-        FactorNode,
-        EdgeType,
-    )
-except ImportError:
-    pass
-
-try:
-    from factorminer.memory.embeddings import FormulaEmbedder
-except ImportError:
-    pass
-
-try:
-    from factorminer.memory.kg_retrieval import retrieve_memory_enhanced
-except ImportError:
-    pass
-
-try:
-    from factorminer.memory.online_regime_memory import (
-        OnlineRegimeMemory,
-        OnlineMemoryUpdater,
-        RegimeSpecificPatternStore,
-        RegimeTransitionForecaster,
-        MemoryForgetCurve,
-    )
-except ImportError:
-    pass
-
-__all__ = [
-    # Data structures
-    "ExperienceMemory",
-    "MiningState",
-    "SuccessPattern",
-    "ForbiddenDirection",
-    "StrategicInsight",
-    # Operators
-    "form_memory",
-    "evolve_memory",
-    "retrieve_memory",
-    # Manager
-    "ExperienceMemoryManager",
-    # Phase 2: Knowledge Graph
-    "FactorKnowledgeGraph",
-    "FactorNode",
-    "EdgeType",
-    # Phase 2: Embeddings
-    "FormulaEmbedder",
-    # Phase 2: Enhanced Retrieval
-    "retrieve_memory_enhanced",
-    # Phase 2: Online Regime Memory
-    "OnlineRegimeMemory",
-    "OnlineMemoryUpdater",
-    "RegimeSpecificPatternStore",
-    "RegimeTransitionForecaster",
-    "MemoryForgetCurve",
-]
diff --git a/src/factorminer/factorminer/memory/embeddings.py b/src/factorminer/factorminer/memory/embeddings.py
deleted file mode 100644
index d0d6073..0000000
--- a/src/factorminer/factorminer/memory/embeddings.py
+++ /dev/null
@@ -1,392 +0,0 @@
-"""Semantic formula embeddings for factor similarity and deduplication.
-
-Converts DSL formulas into natural language descriptions and encodes
-them as dense vectors. Supports:
-- sentence-transformers for high-quality embeddings (optional)
-- FAISS for fast k-NN search (optional)
-- TF-IDF fallback when sentence-transformers is unavailable
-- Brute-force cosine fallback when FAISS is unavailable
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-
-# Optional dependency flags -- resolved at runtime
-_has_sentence_transformers = False
-_has_faiss = False
-_has_sklearn = False
-
-try:
-    from sentence_transformers import SentenceTransformer  # type: ignore[import-untyped]
-
-    _has_sentence_transformers = True
-except ImportError:
-    SentenceTransformer = None  # type: ignore[assignment,misc]
-
-try:
-    import faiss  # type: ignore[import-untyped]
-
-    _has_faiss = True
-except ImportError:
-    faiss = None  # type: ignore[assignment]
-
-try:
-    from sklearn.feature_extraction.text import TfidfVectorizer  # type: ignore[import-untyped]
-
-    _has_sklearn = True
-except ImportError:
-    TfidfVectorizer = None  # type: ignore[assignment]
-
-
-# ---------------------------------------------------------------------------
-# Operator name -> natural-language expansion table
-# ---------------------------------------------------------------------------
-
-_OPERATOR_EXPANSIONS: Dict[str, str] = {
-    # Arithmetic
-    "Add": "addition",
-    "Sub": "subtraction",
-    "Mul": "multiplication",
-    "Div": "division",
-    "Neg": "negation",
-    "Abs": "absolute value",
-    "Log": "logarithm",
-    "Sqrt": "square root",
-    "Power": "power",
-    "Sign": "sign",
-    "Max": "maximum",
-    "Min": "minimum",
-    # Rolling / time-series
-    "Mean": "rolling mean",
-    "Median": "rolling median",
-    "Std": "rolling standard deviation",
-    "Var": "rolling variance",
-    "Skew": "rolling skewness",
-    "Kurt": "rolling kurtosis",
-    "Sum": "rolling sum",
-    "TsMax": "time-series maximum",
-    "TsMin": "time-series minimum",
-    "TsRank": "time-series rank",
-    "TsArgMax": "time-series argmax",
-    "TsArgMin": "time-series argmin",
-    "Delta": "change over period",
-    "Delay": "lagged value",
-    "Return": "return over period",
-    "Corr": "rolling correlation",
-    "Cov": "rolling covariance",
-    "TsLinRegSlope": "linear regression slope",
-    "TsLinRegResid": "linear regression residual",
-    "TsLinRegIntercept": "linear regression intercept",
-    # Smoothing
-    "EMA": "exponential moving average",
-    "WMA": "weighted moving average",
-    "SMA": "simple moving average",
-    "DEMA": "double exponential moving average",
-    # Cross-sectional
-    "CsRank": "cross-sectional rank",
-    "CsZScore": "cross-sectional z-score",
-    "CsDemean": "cross-sectional demeaning",
-    "CsScale": "cross-sectional scaling",
-    # Logical / conditional
-    "IfElse": "conditional selection",
-    "Greater": "greater-than comparison",
-    "Less": "less-than comparison",
-    "Equal": "equality comparison",
-    "And": "logical and",
-    "Or": "logical or",
-    "Not": "logical not",
-}
-
-# Feature name -> natural-language
-_FEATURE_EXPANSIONS: Dict[str, str] = {
-    "$close": "close price",
-    "$open": "open price",
-    "$high": "high price",
-    "$low": "low price",
-    "$volume": "volume",
-    "$amt": "turnover amount",
-    "$vwap": "volume-weighted average price",
-    "$returns": "returns",
-}
-
-
-class FormulaEmbedder:
-    """Embed DSL formulas as dense vectors for similarity search.
-
-    Parameters
-    ----------
-    model_name : str
-        Name of a sentence-transformers model (used only when the
-        library is installed).
-    use_faiss : bool
-        Whether to use FAISS for approximate nearest-neighbour search.
-        Falls back to brute-force cosine similarity if unavailable.
-    """
-
-    def __init__(
-        self,
-        model_name: str = "all-MiniLM-L6-v2",
-        use_faiss: bool = True,
-    ) -> None:
-        self._model_name = model_name
-        self._use_faiss = use_faiss and _has_faiss
-
-        # Lazy-loaded model / vectoriser
-        self._model: Optional[SentenceTransformer] = None  # type: ignore[type-arg]
-        self._tfidf: Optional[TfidfVectorizer] = None  # type: ignore[type-arg]
-        self._tfidf_dirty: bool = False  # whether TF-IDF needs refit
-
-        # Cache: factor_id -> (embedding, text)
-        self._cache: Dict[str, Tuple[np.ndarray, str]] = {}
-        # Ordered list mirroring cache for FAISS index alignment
-        self._ids: List[str] = []
-
-        # FAISS index (rebuilt lazily)
-        self._index: Optional[object] = None
-        self._index_dirty: bool = True
-
-    # ------------------------------------------------------------------
-    # Public API
-    # ------------------------------------------------------------------
-
-    def embed(self, factor_id: str, formula: str) -> np.ndarray:
-        """Compute (or retrieve cached) embedding for a formula.
-
-        Parameters
-        ----------
-        factor_id : str
-            Unique identifier used for caching.
-        formula : str
-            DSL formula to embed.
-
-        Returns
-        -------
-        ndarray
-            Embedding vector (float32).
-        """
-        if factor_id in self._cache:
-            return self._cache[factor_id][0]
-
-        text = self._formula_to_text(formula)
-        vec = self._encode(text)
-        self._cache[factor_id] = (vec, text)
-        self._ids.append(factor_id)
-        self._index_dirty = True
-        self._tfidf_dirty = True
-        return vec
-
-    def remove(self, factor_id: str) -> bool:
-        """Remove a cached embedding by factor id."""
-        if factor_id not in self._cache:
-            return False
-
-        self._cache.pop(factor_id, None)
-        self._ids = [fid for fid in self._ids if fid != factor_id]
-        self._index = None
-        self._index_dirty = True
-        self._tfidf_dirty = True
-        return True
-
-    def clear(self) -> None:
-        """Clear all cached embeddings and search state."""
-        self._cache.clear()
-        self._ids.clear()
-        self._index = None
-        self._index_dirty = True
-        self._tfidf = None
-        self._tfidf_dirty = False
-
-    @property
-    def cache_size(self) -> int:
-        """Return the number of cached factor embeddings."""
-        return len(self._cache)
-
-    def find_nearest(
-        self,
-        formula: str,
-        k: int = 5,
-    ) -> List[Tuple[str, float]]:
-        """Find the *k* most similar cached formulas.
-
-        Parameters
-        ----------
-        formula : str
-            Query formula (does not need to be cached).
-        k : int
-            Number of neighbours to return.
-
-        Returns
-        -------
-        list of (factor_id, similarity)
-            Sorted by descending similarity.
-        """
-        if not self._cache:
-            return []
-
-        query_vec = self._encode(self._formula_to_text(formula))
-        k = min(k, len(self._cache))
-
-        if self._use_faiss and _has_faiss:
-            return self._faiss_search(query_vec, k)
-        return self._brute_force_search(query_vec, k)
-
-    def is_semantic_duplicate(
-        self,
-        formula: str,
-        threshold: float = 0.92,
-    ) -> Optional[str]:
-        """Check if *formula* is a near-duplicate of a cached factor.
-
-        Returns the factor_id of the most similar cached factor if the
-        cosine similarity exceeds *threshold*, or ``None``.
-        """
-        results = self.find_nearest(formula, k=1)
-        if results and results[0][1] >= threshold:
-            return results[0][0]
-        return None
-
-    # ------------------------------------------------------------------
-    # Formula -> text conversion
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _formula_to_text(formula: str) -> str:
-        """Convert a DSL formula into a natural-language description.
-
-        Expands operator and feature names for better semantic matching.
-        """
-        text = formula
-
-        # Expand operators (longest-first to avoid partial matches)
-        for op in sorted(_OPERATOR_EXPANSIONS, key=len, reverse=True):
-            text = text.replace(op, _OPERATOR_EXPANSIONS[op])
-
-        # Expand features
-        for feat in _FEATURE_EXPANSIONS:
-            text = text.replace(feat, _FEATURE_EXPANSIONS[feat])
-
-        # Clean up punctuation into spaces
-        text = re.sub(r"[(),]+", " ", text)
-        text = re.sub(r"\s+", " ", text).strip()
-        return text.lower()
-
-    # ------------------------------------------------------------------
-    # Encoding backends
-    # ------------------------------------------------------------------
-
-    def _encode(self, text: str) -> np.ndarray:
-        """Encode a single text string into a unit-norm vector."""
-        if _has_sentence_transformers:
-            return self._encode_transformer(text)
-        if _has_sklearn:
-            return self._encode_tfidf(text)
-        # Absolute fallback: hash-based bag of words
-        return self._encode_hash(text)
-
-    def _encode_transformer(self, text: str) -> np.ndarray:
-        if self._model is None:
-            self._model = SentenceTransformer(self._model_name)
-        vec = self._model.encode(text, convert_to_numpy=True)
-        vec = np.asarray(vec, dtype=np.float32).flatten()
-        norm = np.linalg.norm(vec)
-        if norm > 0:
-            vec /= norm
-        return vec
-
-    def _encode_tfidf(self, text: str) -> np.ndarray:
-        """Encode using TF-IDF over all cached texts + query.
-
-        Because TF-IDF vocabulary can change when new documents are
-        added, we refit when dirty. This is cheap for the expected
-        document counts (hundreds to low thousands).
-        """
-        if self._tfidf is None:
-            self._tfidf = TfidfVectorizer(max_features=512)
-            self._tfidf_dirty = True
-
-        # Collect all known texts + this one
-        corpus = [t for _, t in self._cache.values()]
-        query_idx = len(corpus)
-        corpus.append(text)
-
-        # Always refit because vocab may have grown
-        matrix = self._tfidf.fit_transform(corpus)
-        vec = np.asarray(matrix[query_idx].toarray(), dtype=np.float32).flatten()
-
-        # Re-encode cached entries with updated vocab
-        for i, fid in enumerate(self._ids):
-            updated = np.asarray(matrix[i].toarray(), dtype=np.float32).flatten()
-            norm = np.linalg.norm(updated)
-            if norm > 0:
-                updated /= norm
-            self._cache[fid] = (updated, self._cache[fid][1])
-
-        norm = np.linalg.norm(vec)
-        if norm > 0:
-            vec /= norm
-        self._tfidf_dirty = False
-        self._index_dirty = True
-        return vec
-
-    @staticmethod
-    def _encode_hash(text: str, dim: int = 128) -> np.ndarray:
-        """Ultra-simple hash-based embedding fallback."""
-        vec = np.zeros(dim, dtype=np.float32)
-        for token in text.split():
-            idx = hash(token) % dim
-            vec[idx] += 1.0
-        norm = np.linalg.norm(vec)
-        if norm > 0:
-            vec /= norm
-        return vec
-
-    # ------------------------------------------------------------------
-    # Search backends
-    # ------------------------------------------------------------------
-
-    def _rebuild_index(self) -> None:
-        """Rebuild the FAISS ``IndexFlatIP`` from cached embeddings."""
-        if not self._cache or not _has_faiss:
-            return
-        vecs = np.stack([self._cache[fid][0] for fid in self._ids])
-        dim = vecs.shape[1]
-        self._index = faiss.IndexFlatIP(dim)
-        self._index.add(vecs)  # type: ignore[union-attr]
-        self._index_dirty = False
-
-    def _faiss_search(
-        self,
-        query: np.ndarray,
-        k: int,
-    ) -> List[Tuple[str, float]]:
-        if self._index_dirty:
-            self._rebuild_index()
-        if self._index is None:
-            return self._brute_force_search(query, k)
-
-        distances, indices = self._index.search(  # type: ignore[union-attr]
-            query.reshape(1, -1), k
-        )
-        results: List[Tuple[str, float]] = []
-        for dist, idx in zip(distances[0], indices[0]):
-            if idx < 0 or idx >= len(self._ids):
-                continue
-            results.append((self._ids[idx], float(dist)))
-        return results
-
-    def _brute_force_search(
-        self,
-        query: np.ndarray,
-        k: int,
-    ) -> List[Tuple[str, float]]:
-        sims: List[Tuple[str, float]] = []
-        for fid in self._ids:
-            vec = self._cache[fid][0]
-            sim = float(np.dot(query, vec))
-            sims.append((fid, sim))
-        sims.sort(key=lambda x: x[1], reverse=True)
-        return sims[:k]
diff --git a/src/factorminer/factorminer/memory/evolution.py b/src/factorminer/factorminer/memory/evolution.py
deleted file mode 100644
index 18ba05a..0000000
--- a/src/factorminer/factorminer/memory/evolution.py
+++ /dev/null
@@ -1,482 +0,0 @@
-"""Memory Evolution operator E(M, M_form).
-
-Consolidates newly formed experience into the existing memory:
-- Merges redundant success/failure patterns
-- Discards low-utility entries
-- Reclassifies patterns that have changed behavior
-- Caps memory size according to configuration limits
-"""
-
-from __future__ import annotations
-
-from typing import Dict, List, Optional
-
-from src.factorminer.factorminer.memory.memory_store import (
-    ExperienceMemory,
-    ForbiddenDirection,
-    MiningState,
-    StrategicInsight,
-    SuccessPattern,
-)
-
-
-def _merge_success_patterns(
-    existing: List[SuccessPattern],
-    new: List[SuccessPattern],
-) -> List[SuccessPattern]:
-    """Merge new success patterns into existing ones.
-
-    Patterns with the same name are consolidated by combining examples
-    and updating occurrence counts. Novel patterns are appended.
-    """
-    merged: Dict[str, SuccessPattern] = {}
-
-    for pat in existing:
-        merged[pat.name] = SuccessPattern(
-            name=pat.name,
-            description=pat.description,
-            template=pat.template,
-            success_rate=pat.success_rate,
-            example_factors=list(pat.example_factors),
-            occurrence_count=pat.occurrence_count,
-        )
-
-    for pat in new:
-        if pat.name in merged:
-            existing_pat = merged[pat.name]
-            existing_pat.occurrence_count += pat.occurrence_count
-            # Merge example factors, dedup
-            seen = set(existing_pat.example_factors)
-            for ex in pat.example_factors:
-                if ex not in seen:
-                    existing_pat.example_factors.append(ex)
-                    seen.add(ex)
-            # Cap examples
-            if len(existing_pat.example_factors) > 10:
-                existing_pat.example_factors = existing_pat.example_factors[-10:]
-            # Update description if new one is more informative
-            if len(pat.description) > len(existing_pat.description):
-                existing_pat.description = pat.description
-            # Promote success rate based on accumulated evidence
-            if existing_pat.occurrence_count >= 10:
-                existing_pat.success_rate = "High"
-            elif existing_pat.occurrence_count >= 5:
-                existing_pat.success_rate = "Medium"
-        else:
-            merged[pat.name] = SuccessPattern(
-                name=pat.name,
-                description=pat.description,
-                template=pat.template,
-                success_rate=pat.success_rate,
-                example_factors=list(pat.example_factors),
-                occurrence_count=pat.occurrence_count,
-            )
-
-    return list(merged.values())
-
-
-def _merge_forbidden_directions(
-    existing: List[ForbiddenDirection],
-    new: List[ForbiddenDirection],
-) -> List[ForbiddenDirection]:
-    """Merge new forbidden directions into existing ones."""
-    merged: Dict[str, ForbiddenDirection] = {}
-
-    for fd in existing:
-        merged[fd.name] = ForbiddenDirection(
-            name=fd.name,
-            description=fd.description,
-            correlated_factors=list(fd.correlated_factors),
-            typical_correlation=fd.typical_correlation,
-            reason=fd.reason,
-            occurrence_count=fd.occurrence_count,
-        )
-
-    for fd in new:
-        if fd.name in merged:
-            existing_fd = merged[fd.name]
-            existing_fd.occurrence_count += fd.occurrence_count
-            # Merge correlated factors
-            seen = set(existing_fd.correlated_factors)
-            for cf in fd.correlated_factors:
-                if cf not in seen:
-                    existing_fd.correlated_factors.append(cf)
-                    seen.add(cf)
-            if len(existing_fd.correlated_factors) > 10:
-                existing_fd.correlated_factors = existing_fd.correlated_factors[-10:]
-            # Update correlation as weighted average
-            total_count = existing_fd.occurrence_count
-            if total_count > 0 and fd.typical_correlation > 0:
-                old_weight = (total_count - fd.occurrence_count) / total_count
-                new_weight = fd.occurrence_count / total_count
-                existing_fd.typical_correlation = (
-                    old_weight * existing_fd.typical_correlation
-                    + new_weight * fd.typical_correlation
-                )
-            if len(fd.reason) > len(existing_fd.reason):
-                existing_fd.reason = fd.reason
-        else:
-            merged[fd.name] = ForbiddenDirection(
-                name=fd.name,
-                description=fd.description,
-                correlated_factors=list(fd.correlated_factors),
-                typical_correlation=fd.typical_correlation,
-                reason=fd.reason,
-                occurrence_count=fd.occurrence_count,
-            )
-
-    return list(merged.values())
-
-
-def _merge_insights(
-    existing: List[StrategicInsight],
-    new: List[StrategicInsight],
-) -> List[StrategicInsight]:
-    """Merge new insights into existing, deduplicating similar ones.
-
-    Insights with substantially overlapping text are consolidated.
-    """
-    merged: List[StrategicInsight] = list(existing)
-
-    for new_insight in new:
-        is_duplicate = False
-        new_lower = new_insight.insight.lower()
-        for i, existing_insight in enumerate(merged):
-            existing_lower = existing_insight.insight.lower()
-            # Simple similarity: check if core words overlap significantly
-            new_words = set(new_lower.split())
-            existing_words = set(existing_lower.split())
-            if len(new_words) > 0 and len(existing_words) > 0:
-                overlap = len(new_words & existing_words)
-                max_len = max(len(new_words), len(existing_words))
-                if overlap / max_len > 0.6:
-                    # Keep the one from the more recent batch
-                    if new_insight.batch_source > existing_insight.batch_source:
-                        merged[i] = new_insight
-                    is_duplicate = True
-                    break
-        if not is_duplicate:
-            merged.append(new_insight)
-
-    return merged
-
-
-def _reclassify_patterns(
-    success_patterns: List[SuccessPattern],
-    forbidden_directions: List[ForbiddenDirection],
-    failure_threshold: int = 5,
-) -> tuple[List[SuccessPattern], List[ForbiddenDirection]]:
-    """Reclassify patterns that have changed behavior.
-
-    If a success pattern consistently appears in forbidden directions
-    (e.g., VWAP variant with rho=0.82), move it from success to forbidden.
-    """
-    forbidden_names = {fd.name for fd in forbidden_directions}
-
-    remaining_success: List[SuccessPattern] = []
-    new_forbidden: List[ForbiddenDirection] = []
-
-    for pat in success_patterns:
-        # Check if this pattern name overlaps with forbidden directions
-        should_reclassify = False
-        matching_forbidden: Optional[ForbiddenDirection] = None
-
-        for fd in forbidden_directions:
-            # Check for name overlap or keyword overlap
-            if _names_overlap(pat.name, fd.name):
-                if fd.occurrence_count >= failure_threshold:
-                    should_reclassify = True
-                    matching_forbidden = fd
-                    break
-
-        if should_reclassify and matching_forbidden is not None:
-            # Demote: success -> forbidden
-            new_forbidden.append(ForbiddenDirection(
-                name=pat.name,
-                description=f"Reclassified from success: {pat.description}",
-                correlated_factors=matching_forbidden.correlated_factors,
-                typical_correlation=matching_forbidden.typical_correlation,
-                reason=f"Initially promising but consistently produces correlated factors "
-                       f"(rho={matching_forbidden.typical_correlation:.2f})",
-                occurrence_count=matching_forbidden.occurrence_count,
-            ))
-        else:
-            remaining_success.append(pat)
-
-    all_forbidden = forbidden_directions + new_forbidden
-    return remaining_success, all_forbidden
-
-
-def _names_overlap(name_a: str, name_b: str) -> bool:
-    """Check if two pattern names refer to the same concept."""
-    a_words = set(name_a.lower().replace("/", " ").replace("_", " ").split())
-    b_words = set(name_b.lower().replace("/", " ").replace("_", " ").split())
-    # Remove common filler words
-    filler = {"the", "a", "an", "of", "in", "with", "for", "and", "or"}
-    a_words -= filler
-    b_words -= filler
-    if not a_words or not b_words:
-        return False
-    overlap = len(a_words & b_words)
-    return overlap >= min(2, min(len(a_words), len(b_words)))
-
-
-def _prune_low_utility(
-    success_patterns: List[SuccessPattern],
-    forbidden_directions: List[ForbiddenDirection],
-    insights: List[StrategicInsight],
-    min_occurrences: int = 1,
-) -> tuple[List[SuccessPattern], List[ForbiddenDirection], List[StrategicInsight]]:
-    """Remove entries with too few occurrences to be reliable.
-
-    Initial knowledge base entries (occurrence_count=0) are preserved.
-    """
-    pruned_success = [
-        p for p in success_patterns
-        if p.occurrence_count >= min_occurrences or p.occurrence_count == 0
-    ]
-    pruned_forbidden = [
-        f for f in forbidden_directions
-        if f.occurrence_count >= min_occurrences or f.occurrence_count == 0
-    ]
-    # Insights are lightweight, keep all
-    return pruned_success, pruned_forbidden, insights
-
-
-def _cap_memory_size(
-    success_patterns: List[SuccessPattern],
-    forbidden_directions: List[ForbiddenDirection],
-    insights: List[StrategicInsight],
-    max_success: int = 50,
-    max_forbidden: int = 100,
-    max_insights: int = 30,
-) -> tuple[List[SuccessPattern], List[ForbiddenDirection], List[StrategicInsight]]:
-    """Enforce maximum memory sizes by keeping the most useful entries."""
-    # Sort success patterns by occurrence count (most useful first)
-    if len(success_patterns) > max_success:
-        success_patterns = sorted(
-            success_patterns, key=lambda p: p.occurrence_count, reverse=True
-        )[:max_success]
-
-    # Sort forbidden directions by occurrence count
-    if len(forbidden_directions) > max_forbidden:
-        forbidden_directions = sorted(
-            forbidden_directions, key=lambda f: f.occurrence_count, reverse=True
-        )[:max_forbidden]
-
-    # Keep most recent insights
-    if len(insights) > max_insights:
-        insights = sorted(
-            insights, key=lambda i: i.batch_source, reverse=True
-        )[:max_insights]
-
-    return success_patterns, forbidden_directions, insights
-
-
-# ---------------------------------------------------------------------------
-# Public API: Memory Evolution
-# ---------------------------------------------------------------------------
-
-def evolve_memory(
-    memory: ExperienceMemory,
-    formed_memory: ExperienceMemory,
-    max_success_patterns: int = 50,
-    max_failure_patterns: int = 100,
-    max_insights: int = 30,
-) -> ExperienceMemory:
-    """Memory Evolution operator E(M, M_form).
-
-    Consolidates newly formed experience into the existing memory.
-
-    Parameters
-    ----------
-    memory : ExperienceMemory
-        Current persistent memory.
-    formed_memory : ExperienceMemory
-        Newly formed memory from the latest batch (output of form_memory).
-    max_success_patterns : int
-        Maximum number of success patterns to retain.
-    max_failure_patterns : int
-        Maximum number of forbidden directions to retain.
-    max_insights : int
-        Maximum number of strategic insights to retain.
-
-    Returns
-    -------
-    ExperienceMemory
-        Updated memory with consolidated experience.
-    """
-    # 1. Merge patterns
-    merged_success = _merge_success_patterns(
-        memory.success_patterns, formed_memory.success_patterns
-    )
-    merged_forbidden = _merge_forbidden_directions(
-        memory.forbidden_directions, formed_memory.forbidden_directions
-    )
-    merged_insights = _merge_insights(memory.insights, formed_memory.insights)
-
-    # 2. Reclassify patterns that have changed behavior
-    merged_success, merged_forbidden = _reclassify_patterns(
-        merged_success, merged_forbidden
-    )
-
-    # 3. Prune low-utility entries
-    merged_success, merged_forbidden, merged_insights = _prune_low_utility(
-        merged_success, merged_forbidden, merged_insights
-    )
-
-    # 4. Cap memory size
-    merged_success, merged_forbidden, merged_insights = _cap_memory_size(
-        merged_success, merged_forbidden, merged_insights,
-        max_success=max_success_patterns,
-        max_forbidden=max_failure_patterns,
-        max_insights=max_insights,
-    )
-
-    # 5. Update state
-    new_state = formed_memory.state
-
-    return ExperienceMemory(
-        state=new_state,
-        success_patterns=merged_success,
-        forbidden_directions=merged_forbidden,
-        insights=merged_insights,
-        version=memory.version + 1,
-    )
-
-
-# ---------------------------------------------------------------------------
-# Phase 2: Online confidence decay helpers (added for HelixFactor)
-# ---------------------------------------------------------------------------
-
-def apply_confidence_decay(
-    memory: "ExperienceMemory",
-    decay_factor: float = 0.99,
-    min_confidence: float = 0.05,
-) -> "ExperienceMemory":
-    """Return new ExperienceMemory with decayed pattern confidences.
-
-    Seed patterns (occurrence_count == 0) are immune to decay.
-    Patterns below min_confidence are pruned.
-
-    Parameters
-    ----------
-    memory:
-        Input ExperienceMemory (not mutated — immutable-style).
-    decay_factor:
-        Multiplicative decay per call (e.g. 0.99 = 1% decay per iteration).
-    min_confidence:
-        Patterns with confidence < this threshold after decay are removed.
-
-    Returns
-    -------
-    ExperienceMemory
-        New instance with decayed/pruned patterns.
-    """
-    import dataclasses
-
-    new_patterns = []
-    for p in memory.success_patterns:
-        if p.occurrence_count == 0:
-            # seed pattern — never decay
-            new_patterns.append(p)
-            continue
-        new_conf = getattr(p, "confidence", 1.0) * decay_factor
-        if new_conf >= min_confidence:
-            try:
-                new_patterns.append(dataclasses.replace(p, confidence=new_conf))
-            except TypeError:
-                new_patterns.append(p)
-
-    return dataclasses.replace(memory, success_patterns=new_patterns)
-
-
-def bump_pattern_confidence(
-    memory: "ExperienceMemory",
-    keywords: list,
-    boost: float = 0.05,
-    max_confidence: float = 1.0,
-) -> "ExperienceMemory":
-    """Return new ExperienceMemory with confidence boosted for matching patterns.
-
-    Patterns whose description or template contain any of the keywords receive
-    a confidence boost.
-
-    Parameters
-    ----------
-    memory:
-        Input ExperienceMemory (not mutated).
-    keywords:
-        List of strings to match against pattern descriptions.
-    boost:
-        Additive confidence increase for matching patterns.
-    max_confidence:
-        Confidence cap.
-
-    Returns
-    -------
-    ExperienceMemory
-        New instance with boosted pattern confidences.
-    """
-    import dataclasses
-
-    new_patterns = []
-    for p in memory.success_patterns:
-        desc = (getattr(p, "description", "") or "").lower()
-        tmpl = (getattr(p, "template", "") or "").lower()
-        matched = any(kw.lower() in desc or kw.lower() in tmpl for kw in keywords)
-        if matched:
-            new_conf = min(getattr(p, "confidence", 1.0) + boost, max_confidence)
-            try:
-                new_patterns.append(dataclasses.replace(p, confidence=new_conf))
-            except TypeError:
-                new_patterns.append(p)
-        else:
-            new_patterns.append(p)
-
-    return dataclasses.replace(memory, success_patterns=new_patterns)
-
-
-def penalise_pattern_confidence(
-    memory: "ExperienceMemory",
-    keywords: list,
-    penalty: float = 0.15,
-    min_confidence: float = 0.05,
-) -> "ExperienceMemory":
-    """Return new ExperienceMemory with confidence penalised for matching patterns.
-
-    Parameters
-    ----------
-    memory:
-        Input ExperienceMemory (not mutated).
-    keywords:
-        List of strings to match against pattern descriptions.
-    penalty:
-        Multiplicative penalty factor (confidence *= (1 - penalty)).
-    min_confidence:
-        Patterns below this threshold after penalty are pruned.
-
-    Returns
-    -------
-    ExperienceMemory
-        New instance with penalised/pruned pattern confidences.
-    """
-    import dataclasses
-
-    new_patterns = []
-    for p in memory.success_patterns:
-        desc = (getattr(p, "description", "") or "").lower()
-        tmpl = (getattr(p, "template", "") or "").lower()
-        matched = any(kw.lower() in desc or kw.lower() in tmpl for kw in keywords)
-        if matched:
-            new_conf = getattr(p, "confidence", 1.0) * (1.0 - penalty)
-            if new_conf < min_confidence:
-                continue  # prune
-            try:
-                new_patterns.append(dataclasses.replace(p, confidence=new_conf))
-            except TypeError:
-                new_patterns.append(p)
-        else:
-            new_patterns.append(p)
-
-    return dataclasses.replace(memory, success_patterns=new_patterns)
diff --git a/src/factorminer/factorminer/memory/experience_memory.py b/src/factorminer/factorminer/memory/experience_memory.py
deleted file mode 100644
index 04cfb71..0000000
--- a/src/factorminer/factorminer/memory/experience_memory.py
+++ /dev/null
@@ -1,594 +0,0 @@
-"""Main ExperienceMemory manager class.
-
-Provides the high-level API for the experience memory system:
-- Initializes with default patterns from the paper (Tables 4 and 5)
-- Persists to/from JSON
-- update(trajectory) orchestrates formation + evolution
-- retrieve(library_state) performs context-dependent retrieval
-- Optional knowledge graph and embedding support for Phase 2
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-from src.factorminer.factorminer.memory.memory_store import (
-    ExperienceMemory,
-    ForbiddenDirection,
-    MiningState,
-    StrategicInsight,
-    SuccessPattern,
-)
-from src.factorminer.factorminer.memory.formation import form_memory
-from src.factorminer.factorminer.memory.evolution import evolve_memory
-from src.factorminer.factorminer.memory.retrieval import retrieve_memory
-
-# Optional Phase 2 imports
-try:
-    from factorminer.memory.knowledge_graph import FactorKnowledgeGraph, FactorNode
-except ImportError:
-    FactorKnowledgeGraph = None  # type: ignore[assignment,misc]
-    FactorNode = None  # type: ignore[assignment,misc]
-
-try:
-    from factorminer.memory.embeddings import FormulaEmbedder
-except ImportError:
-    FormulaEmbedder = None  # type: ignore[assignment,misc]
-
-try:
-    from factorminer.memory.kg_retrieval import retrieve_memory_enhanced
-except ImportError:
-    retrieve_memory_enhanced = None  # type: ignore[assignment]
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Default knowledge base from the paper
-# ---------------------------------------------------------------------------
-
-def _default_success_patterns() -> List[SuccessPattern]:
-    """Initial success patterns from FactorMiner Table 4."""
-    return [
-        SuccessPattern(
-            name="Higher Moment Regimes",
-            description=(
-                "Use Skew/Kurt as IfElse conditions to route between different "
-                "factor computations. High-moment regime switching captures "
-                "non-linear market states effectively."
-            ),
-            template="IfElse(Skew($close, 20), <factor_a>, <factor_b>)",
-            success_rate="High",
-            example_factors=["HMR_001", "HMR_002"],
-            occurrence_count=0,
-        ),
-        SuccessPattern(
-            name="PV Corr Interaction",
-            description=(
-                "Price-volume correlation interaction: use rolling Corr($close, $volume) "
-                "as a signal or conditioning variable. Captures supply-demand imbalance "
-                "through price-volume divergence."
-            ),
-            template="CsRank(Corr($close, $volume, 20))",
-            success_rate="High",
-            example_factors=["PVC_001", "PVC_002"],
-            occurrence_count=0,
-        ),
-        SuccessPattern(
-            name="Robust Efficiency",
-            description=(
-                "Use Median for noise filtering instead of Mean. Rolling median "
-                "is more robust to outliers in intraday data, producing factors "
-                "with higher ICIR."
-            ),
-            template="CsRank(Div(Median($close, 10), Median($close, 60)))",
-            success_rate="High",
-            example_factors=["RE_001"],
-            occurrence_count=0,
-        ),
-        SuccessPattern(
-            name="Smoothed Efficiency Rank",
-            description=(
-                "Combine EMA smoothing with CsRank cross-sectional normalization. "
-                "EMA reduces noise while CsRank ensures cross-sectional comparability."
-            ),
-            template="CsRank(EMA(Div($close, Mean($close, 20)), 10))",
-            success_rate="High",
-            example_factors=["SER_001", "SER_002"],
-            occurrence_count=0,
-        ),
-        SuccessPattern(
-            name="Trend Regression Adaptive",
-            description=(
-                "Use TsLinRegSlope, TsLinRegResid, or rolling R-squared to capture "
-                "trend strength and mean reversion. Regression residuals identify "
-                "deviations from local trends."
-            ),
-            template="CsRank(TsLinRegSlope($close, 20))",
-            success_rate="High",
-            example_factors=["TRA_001", "TRA_002"],
-            occurrence_count=0,
-        ),
-        SuccessPattern(
-            name="Logical Or Extreme Regimes",
-            description=(
-                "Use Or/And with Greater/Less to combine multiple extreme-value "
-                "conditions. Captures compound regime states that single indicators miss."
-            ),
-            template="IfElse(Or(Greater(Skew($returns, 20), 1), Less(Kurt($returns, 20), -1)), <a>, <b>)",
-            success_rate="Medium",
-            example_factors=["LOR_001"],
-            occurrence_count=0,
-        ),
-        SuccessPattern(
-            name="Kurtosis Regime",
-            description=(
-                "Use rolling kurtosis to detect fat-tail regimes and switch "
-                "factor behavior accordingly. High kurtosis indicates regime "
-                "changes and trend breaks."
-            ),
-            template="IfElse(Kurt($returns, 20), CsRank(Std($returns, 10)), CsRank(Mean($returns, 10)))",
-            success_rate="Medium",
-            example_factors=["KR_001"],
-            occurrence_count=0,
-        ),
-        SuccessPattern(
-            name="Amt Efficiency Rank Interaction",
-            description=(
-                "Combine $amt (turnover) with efficiency ratios and CsRank. "
-                "Amount-weighted efficiency captures liquidity-adjusted momentum."
-            ),
-            template="CsRank(Div(EMA($amt, 5), EMA($amt, 20)))",
-            success_rate="Medium",
-            example_factors=["AER_001"],
-            occurrence_count=0,
-        ),
-    ]
-
-
-def _default_forbidden_directions() -> List[ForbiddenDirection]:
-    """Initial forbidden directions from FactorMiner Table 5."""
-    return [
-        ForbiddenDirection(
-            name="Standardized Returns/Amount",
-            description=(
-                "CsZScore or Std-normalized $returns and $amt variants. "
-                "These produce a cluster of highly correlated factors."
-            ),
-            correlated_factors=["std_ret_cluster"],
-            typical_correlation=0.6,
-            reason="Standardized return/amount variants cluster with rho > 0.6",
-            occurrence_count=0,
-        ),
-        ForbiddenDirection(
-            name="VWAP Deviation variants",
-            description=(
-                "Factors based on deviation from VWAP (Sub($close, $vwap) or "
-                "Delta($vwap)). All VWAP deviation variants converge to the "
-                "same signal."
-            ),
-            correlated_factors=["vwap_dev_cluster"],
-            typical_correlation=0.5,
-            reason="VWAP deviation variants produce highly correlated factors (rho > 0.5)",
-            occurrence_count=0,
-        ),
-        ForbiddenDirection(
-            name="Simple Delta Reversal",
-            description=(
-                "Simple price-change reversal factors using Delta($close) or "
-                "Neg(Return($close)). These are well-known and already "
-                "saturated in most factor libraries."
-            ),
-            correlated_factors=["delta_rev_cluster"],
-            typical_correlation=0.5,
-            reason="Simple delta-based reversal factors are redundant (rho > 0.5)",
-            occurrence_count=0,
-        ),
-        ForbiddenDirection(
-            name="WMA/EMA Smoothed Efficiency",
-            description=(
-                "Smoothing the same base signal with WMA, EMA, SMA, DEMA "
-                "produces nearly identical factors. Different smoothing methods "
-                "on the same input do not add diversity."
-            ),
-            correlated_factors=["smoothed_eff_cluster"],
-            typical_correlation=0.9,
-            reason="WMA/EMA/SMA smoothed efficiency variants nearly identical (rho > 0.9)",
-            occurrence_count=0,
-        ),
-    ]
-
-
-def _default_insights() -> List[StrategicInsight]:
-    """Initial strategic insights from the paper."""
-    return [
-        StrategicInsight(
-            insight="Non-linear transformations (IfElse, Skew, Kurt) outperform linear ones",
-            evidence="Paper finding: regime-switching factors consistently achieve higher IC",
-            batch_source=0,
-        ),
-        StrategicInsight(
-            insight="Cross-sectional ranking (CsRank) as final layer improves factor stability",
-            evidence="CsRank normalization reduces outlier sensitivity and improves ICIR",
-            batch_source=0,
-        ),
-        StrategicInsight(
-            insight="Combining operators from different categories produces more diverse factors",
-            evidence="Multi-category composition (e.g., Statistical + Logical + CrossSectional) "
-                     "reduces correlation with existing library members",
-            batch_source=0,
-        ),
-    ]
-
-
-# ---------------------------------------------------------------------------
-# Manager class
-# ---------------------------------------------------------------------------
-
-class ExperienceMemoryManager:
-    """High-level manager for the experience memory system.
-
-    Orchestrates formation, evolution, retrieval, and persistence of the
-    experience memory M across mining sessions.
-
-    Parameters
-    ----------
-    max_success_patterns : int
-        Maximum number of success patterns to retain.
-    max_failure_patterns : int
-        Maximum number of forbidden directions to retain.
-    max_insights : int
-        Maximum number of strategic insights to retain.
-    """
-
-    def __init__(
-        self,
-        max_success_patterns: int = 50,
-        max_failure_patterns: int = 100,
-        max_insights: int = 30,
-        enable_knowledge_graph: bool = False,
-        enable_embeddings: bool = False,
-    ) -> None:
-        self.max_success_patterns = max_success_patterns
-        self.max_failure_patterns = max_failure_patterns
-        self.max_insights = max_insights
-        self._batch_counter = 0
-
-        # Initialize with default knowledge base
-        self.memory = ExperienceMemory(
-            state=MiningState(),
-            success_patterns=_default_success_patterns(),
-            forbidden_directions=_default_forbidden_directions(),
-            insights=_default_insights(),
-            version=0,
-        )
-
-        # Phase 2: Optional knowledge graph
-        self.kg: Optional[FactorKnowledgeGraph] = None  # type: ignore[type-arg]
-        if enable_knowledge_graph:
-            if FactorKnowledgeGraph is not None:
-                self.kg = FactorKnowledgeGraph()
-            else:
-                logger.warning(
-                    "Knowledge graph requested but networkx is not installed. "
-                    "Install with: pip install networkx"
-                )
-
-        # Phase 2: Optional formula embedder
-        self.embedder: Optional[FormulaEmbedder] = None  # type: ignore[type-arg]
-        if enable_embeddings:
-            if FormulaEmbedder is not None:
-                self.embedder = FormulaEmbedder()
-            else:
-                logger.warning(
-                    "Embeddings requested but required packages are not installed. "
-                    "Install with: pip install sentence-transformers"
-                )
-
-    @property
-    def version(self) -> int:
-        return self.memory.version
-
-    def update(self, trajectory: List[dict]) -> Dict[str, Any]:
-        """Process a batch trajectory: formation + evolution.
-
-        Parameters
-        ----------
-        trajectory : list[dict]
-            Batch of evaluated candidates. Each dict should contain:
-            - formula: str - the DSL formula
-            - factor_id: str - unique identifier
-            - ic: float - information coefficient
-            - icir: float - IC information ratio
-            - max_correlation: float - max correlation with existing factors
-            - correlated_with: str - ID of most correlated existing factor
-            - admitted: bool - whether the factor was admitted
-            - rejection_reason: str - reason for rejection (if rejected)
-
-        Returns
-        -------
-        dict
-            Summary of the update: admitted_count, rejected_count,
-            new_patterns, new_forbidden, new_insights, version.
-        """
-        self._batch_counter += 1
-
-        # Formation: extract experience from trajectory
-        formed = form_memory(self.memory, trajectory, self._batch_counter)
-
-        # Evolution: merge formed experience into persistent memory
-        self.memory = evolve_memory(
-            self.memory,
-            formed,
-            max_success_patterns=self.max_success_patterns,
-            max_failure_patterns=self.max_failure_patterns,
-            max_insights=self.max_insights,
-        )
-
-        admitted_count = sum(1 for c in trajectory if c.get("admitted", False))
-        rejected_count = len(trajectory) - admitted_count
-
-        # Phase 2: Update knowledge graph with new factors
-        if self.kg is not None and FactorNode is not None:
-            self._update_knowledge_graph(trajectory)
-
-        return {
-            "batch": self._batch_counter,
-            "admitted_count": admitted_count,
-            "rejected_count": rejected_count,
-            "success_patterns": len(self.memory.success_patterns),
-            "forbidden_directions": len(self.memory.forbidden_directions),
-            "insights": len(self.memory.insights),
-            "version": self.memory.version,
-        }
-
-    def retrieve(
-        self,
-        library_state: Optional[Dict[str, Any]] = None,
-        max_success: int = 8,
-        max_forbidden: int = 10,
-        max_insights: int = 10,
-    ) -> Dict[str, Any]:
-        """Retrieve context-dependent memory signal for LLM prompt.
-
-        Parameters
-        ----------
-        library_state : dict, optional
-            Current library diagnostics. Keys: library_size,
-            domain_saturation, etc.
-        max_success : int
-            Maximum number of success patterns to include.
-        max_forbidden : int
-            Maximum number of forbidden directions to include.
-        max_insights : int
-            Maximum number of insights to include.
-
-        Returns
-        -------
-        dict
-            Memory signal m with keys: recommended_directions,
-            forbidden_directions, insights, library_state, prompt_text.
-        """
-        # Use enhanced retrieval if KG or embedder is available
-        if (self.kg is not None or self.embedder is not None) and retrieve_memory_enhanced is not None:
-            return retrieve_memory_enhanced(
-                self.memory,
-                library_state=library_state,
-                max_success=max_success,
-                max_forbidden=max_forbidden,
-                max_insights=max_insights,
-                kg=self.kg,
-                embedder=self.embedder,
-            )
-
-        return retrieve_memory(
-            self.memory,
-            library_state=library_state,
-            max_success=max_success,
-            max_forbidden=max_forbidden,
-            max_insights=max_insights,
-        )
-
-    def save(self, path: str | Path) -> None:
-        """Persist memory to a JSON file.
-
-        Also saves the knowledge graph to a sibling file
-        ``<stem>_kg.json`` if enabled.
-
-        Parameters
-        ----------
-        path : str or Path
-            Output file path (will be created/overwritten).
-        """
-        path = Path(path)
-        path.parent.mkdir(parents=True, exist_ok=True)
-
-        data = self.memory.to_dict()
-        data["_batch_counter"] = self._batch_counter
-        data["_config"] = {
-            "max_success_patterns": self.max_success_patterns,
-            "max_failure_patterns": self.max_failure_patterns,
-            "max_insights": self.max_insights,
-            "enable_knowledge_graph": self.kg is not None,
-            "enable_embeddings": self.embedder is not None,
-        }
-
-        with open(path, "w") as f:
-            json.dump(data, f, indent=2, ensure_ascii=False)
-
-        # Phase 2: Save knowledge graph alongside
-        if self.kg is not None:
-            kg_path = path.with_name(f"{path.stem}_kg.json")
-            self.kg.save(kg_path)
-
-    def load(self, path: str | Path) -> None:
-        """Load memory from a JSON file.
-
-        Also loads the knowledge graph from ``<stem>_kg.json`` if
-        the file exists and the KG feature is enabled.
-
-        Parameters
-        ----------
-        path : str or Path
-            Path to a previously saved memory file.
-        """
-        path = Path(path)
-        with open(path) as f:
-            data = json.load(f)
-
-        self.memory = ExperienceMemory.from_dict(data)
-        self._batch_counter = data.get("_batch_counter", 0)
-
-        config = data.get("_config", {})
-        if config:
-            self.max_success_patterns = config.get(
-                "max_success_patterns", self.max_success_patterns
-            )
-            self.max_failure_patterns = config.get(
-                "max_failure_patterns", self.max_failure_patterns
-            )
-            self.max_insights = config.get(
-                "max_insights", self.max_insights
-            )
-
-        # Phase 2: Load knowledge graph if available
-        kg_path = path.with_name(f"{path.stem}_kg.json")
-        if kg_path.exists() and FactorKnowledgeGraph is not None:
-            if self.kg is None:
-                # Enable KG if saved config says so, or if file exists
-                if config.get("enable_knowledge_graph", False):
-                    self.kg = FactorKnowledgeGraph.load(kg_path)
-            else:
-                self.kg = FactorKnowledgeGraph.load(kg_path)
-
-        # Re-enable embedder if config says so
-        if config.get("enable_embeddings", False) and self.embedder is None:
-            if FormulaEmbedder is not None:
-                self.embedder = FormulaEmbedder()
-
-    def get_stats(self) -> Dict[str, Any]:
-        """Return summary statistics about the current memory state.
-
-        Returns
-        -------
-        dict
-            Keys: version, batch_counter, library_size, success_patterns,
-            forbidden_directions, insights, domain_saturation,
-            recent_admission_rate, plus kg_* keys when KG is enabled.
-        """
-        recent_logs = self.memory.state.admission_log[-5:]
-        avg_rate = 0.0
-        if recent_logs:
-            avg_rate = sum(
-                log.get("admission_rate", 0) for log in recent_logs
-            ) / len(recent_logs)
-
-        stats: Dict[str, Any] = {
-            "version": self.memory.version,
-            "batch_counter": self._batch_counter,
-            "library_size": self.memory.state.library_size,
-            "success_patterns": len(self.memory.success_patterns),
-            "forbidden_directions": len(self.memory.forbidden_directions),
-            "insights": len(self.memory.insights),
-            "domain_saturation": dict(self.memory.state.domain_saturation),
-            "recent_admission_rate": round(avg_rate, 4),
-            "top_success_patterns": [
-                {"name": p.name, "rate": p.success_rate, "count": p.occurrence_count}
-                for p in sorted(
-                    self.memory.success_patterns,
-                    key=lambda p: p.occurrence_count,
-                    reverse=True,
-                )[:5]
-            ],
-            "top_forbidden_directions": [
-                {"name": f.name, "corr": f.typical_correlation, "count": f.occurrence_count}
-                for f in sorted(
-                    self.memory.forbidden_directions,
-                    key=lambda f: f.occurrence_count,
-                    reverse=True,
-                )[:5]
-            ],
-        }
-
-        # Phase 2: KG stats
-        if self.kg is not None:
-            stats["kg_factor_count"] = self.kg.get_factor_count()
-            stats["kg_edge_count"] = self.kg.get_edge_count()
-            saturated = self.kg.find_saturated_regions()
-            stats["kg_saturated_clusters"] = len(saturated)
-
-        return stats
-
-    def reset(self) -> None:
-        """Reset memory to initial state with default knowledge base."""
-        self._batch_counter = 0
-        self.memory = ExperienceMemory(
-            state=MiningState(),
-            success_patterns=_default_success_patterns(),
-            forbidden_directions=_default_forbidden_directions(),
-            insights=_default_insights(),
-            version=0,
-        )
-
-        # Phase 2: Reset KG and embedder
-        if self.kg is not None and FactorKnowledgeGraph is not None:
-            self.kg = FactorKnowledgeGraph()
-        if self.embedder is not None and FormulaEmbedder is not None:
-            self.embedder = FormulaEmbedder()
-
-    # ------------------------------------------------------------------
-    # Phase 2: Knowledge graph helpers
-    # ------------------------------------------------------------------
-
-    def _update_knowledge_graph(self, trajectory: List[dict]) -> None:
-        """Add factors from a trajectory to the knowledge graph.
-
-        Extracts operators from formulas, creates FactorNode instances,
-        and registers correlation edges between co-evaluated candidates.
-        """
-        import re
-
-        if self.kg is None or FactorNode is None:
-            return
-
-        op_pattern = re.compile(r"\b([A-Z][a-zA-Z]+)\(")
-        feat_pattern = re.compile(r"\$\w+")
-
-        factor_ids: List[str] = []
-
-        for candidate in trajectory:
-            fid = candidate.get("factor_id", "")
-            formula = candidate.get("formula", "")
-            if not fid or not formula:
-                continue
-
-            # Parse operators and features from formula
-            operators = op_pattern.findall(formula)
-            features = feat_pattern.findall(formula)
-
-            node = FactorNode(
-                factor_id=fid,
-                formula=formula,
-                ic_mean=candidate.get("ic", 0.0),
-                category=candidate.get("category", ""),
-                operators=operators,
-                features=features,
-                batch_number=self._batch_counter,
-                admitted=candidate.get("admitted", False),
-            )
-
-            # Embed if embedder is available
-            if self.embedder is not None:
-                node.embedding = self.embedder.embed(fid, formula)
-
-            self.kg.add_factor(node)
-            factor_ids.append(fid)
-
-            # Add correlation edge to existing library member
-            correlated_with = candidate.get("correlated_with", "")
-            max_corr = candidate.get("max_correlation", 0.0)
-            if correlated_with and max_corr > 0:
-                self.kg.add_correlation_edge(fid, correlated_with, max_corr)
diff --git a/src/factorminer/factorminer/memory/formation.py b/src/factorminer/factorminer/memory/formation.py
deleted file mode 100644
index 3fc35f7..0000000
--- a/src/factorminer/factorminer/memory/formation.py
+++ /dev/null
@@ -1,446 +0,0 @@
-"""Memory Formation operator F(M, tau).
-
-Analyzes a mining trajectory tau (batch of evaluated candidates with IC,
-correlation, admission results) and extracts new experience:
-- Successful patterns from admitted factors
-- Forbidden directions from high-correlation rejections
-- Strategic insights about what works across the batch
-"""
-
-from __future__ import annotations
-
-import re
-from collections import Counter, defaultdict
-from typing import Any, Dict, List, Optional, Tuple
-
-from src.factorminer.factorminer.memory.memory_store import (
-    ExperienceMemory,
-    ForbiddenDirection,
-    MiningState,
-    StrategicInsight,
-    SuccessPattern,
-)
-
-
-# ---------------------------------------------------------------------------
-# Operator-pattern detection helpers
-# ---------------------------------------------------------------------------
-
-# Maps operator substrings to pattern categories
-_PATTERN_SIGNATURES: Dict[str, List[str]] = {
-    "Higher Moment Regimes": ["Skew", "Kurt", "IfElse"],
-    "PV Corr Interaction": ["Corr", "$close", "$volume"],
-    "Robust Efficiency": ["Med", "Median"],
-    "Smoothed Efficiency Rank": ["EMA", "CsRank"],
-    "Trend Regression Adaptive": ["Rsquare", "Slope", "Resi", "TsLinReg"],
-    "Logical Or Extreme Regimes": ["Or", "Greater", "Less"],
-    "Kurtosis Regime": ["Kurt", "IfElse"],
-    "Amt Efficiency Rank Interaction": ["$amt", "CsRank"],
-}
-
-_FORBIDDEN_SIGNATURES: Dict[str, Dict[str, Any]] = {
-    "Standardized Returns/Amount": {
-        "keywords": ["CsZScore", "$returns", "$amt", "Std"],
-        "typical_corr": 0.6,
-        "reason": "Standardized return/amount variants cluster with rho > 0.6",
-    },
-    "VWAP Deviation variants": {
-        "keywords": ["$vwap", "Delta", "Sub", "$close"],
-        "typical_corr": 0.5,
-        "reason": "VWAP deviation variants produce highly correlated factors (rho > 0.5)",
-    },
-    "Simple Delta Reversal": {
-        "keywords": ["Delta", "$close", "Neg", "Return"],
-        "typical_corr": 0.5,
-        "reason": "Simple delta-based reversal factors are redundant (rho > 0.5)",
-    },
-    "WMA/EMA Smoothed Efficiency": {
-        "keywords": ["WMA", "EMA", "SMA"],
-        "typical_corr": 0.9,
-        "reason": "WMA/EMA smoothed efficiency variants nearly identical (rho > 0.9)",
-    },
-}
-
-
-def _extract_operators(formula: str) -> List[str]:
-    """Extract operator names from a DSL formula string."""
-    return re.findall(r"([A-Z][a-zA-Z]+)\(", formula)
-
-
-def _extract_features(formula: str) -> List[str]:
-    """Extract feature references from a DSL formula string."""
-    return re.findall(r"\$[a-z]+", formula)
-
-
-def _matches_pattern(formula: str, signature_keywords: List[str]) -> bool:
-    """Check if a formula matches a pattern based on keyword presence."""
-    formula_upper = formula.upper()
-    ops = _extract_operators(formula)
-    feats = _extract_features(formula)
-    all_tokens = [o.upper() for o in ops] + [f.upper() for f in feats]
-    match_count = sum(
-        1 for kw in signature_keywords
-        if any(kw.upper() in token for token in all_tokens)
-        or kw.upper() in formula_upper
-    )
-    # Require at least 2 keyword matches (or all if fewer than 2 keywords)
-    threshold = min(2, len(signature_keywords))
-    return match_count >= threshold
-
-
-def _classify_success_pattern(formula: str) -> Optional[str]:
-    """Try to classify a formula into a known success pattern category."""
-    for pattern_name, keywords in _PATTERN_SIGNATURES.items():
-        if _matches_pattern(formula, keywords):
-            return pattern_name
-    return None
-
-
-def _classify_forbidden_direction(formula: str) -> Optional[str]:
-    """Try to classify a formula into a known forbidden direction."""
-    for direction_name, info in _FORBIDDEN_SIGNATURES.items():
-        if _matches_pattern(formula, info["keywords"]):
-            return direction_name
-    return None
-
-
-# ---------------------------------------------------------------------------
-# Trajectory analysis
-# ---------------------------------------------------------------------------
-
-def _analyze_admissions(
-    trajectory: List[dict],
-) -> Tuple[List[dict], List[dict]]:
-    """Split trajectory into admitted and rejected candidates."""
-    admitted = []
-    rejected = []
-    for candidate in trajectory:
-        if candidate.get("admitted", False):
-            admitted.append(candidate)
-        else:
-            rejected.append(candidate)
-    return admitted, rejected
-
-
-def _extract_success_patterns(
-    admitted: List[dict],
-    existing_patterns: List[SuccessPattern],
-) -> List[SuccessPattern]:
-    """Extract new or reinforced success patterns from admitted factors."""
-    pattern_map: Dict[str, SuccessPattern] = {
-        p.name: SuccessPattern(
-            name=p.name,
-            description=p.description,
-            template=p.template,
-            success_rate=p.success_rate,
-            example_factors=list(p.example_factors),
-            occurrence_count=p.occurrence_count,
-        )
-        for p in existing_patterns
-    }
-
-    for candidate in admitted:
-        formula = candidate.get("formula", "")
-        factor_id = candidate.get("factor_id", formula[:60])
-        ic = candidate.get("ic", 0.0)
-
-        pattern_name = _classify_success_pattern(formula)
-        if pattern_name is None:
-            # Novel pattern: create a generic entry based on operators used
-            ops = _extract_operators(formula)
-            if len(ops) >= 2:
-                pattern_name = f"Novel: {'+'.join(ops[:3])}"
-            else:
-                continue
-
-        if pattern_name in pattern_map:
-            pat = pattern_map[pattern_name]
-            pat.occurrence_count += 1
-            if factor_id not in pat.example_factors:
-                pat.example_factors.append(factor_id)
-                # Keep example list bounded
-                if len(pat.example_factors) > 10:
-                    pat.example_factors = pat.example_factors[-10:]
-            # Upgrade success rate if consistently passing
-            if pat.occurrence_count >= 5 and pat.success_rate == "Medium":
-                pat.success_rate = "High"
-        else:
-            pattern_map[pattern_name] = SuccessPattern(
-                name=pattern_name,
-                description=f"Pattern derived from admitted factor with IC={ic:.4f}",
-                template=formula,
-                success_rate="Low",
-                example_factors=[factor_id],
-                occurrence_count=1,
-            )
-
-    return list(pattern_map.values())
-
-
-def _extract_forbidden_directions(
-    rejected: List[dict],
-    existing_forbidden: List[ForbiddenDirection],
-) -> List[ForbiddenDirection]:
-    """Extract new or reinforced forbidden directions from rejections."""
-    direction_map: Dict[str, ForbiddenDirection] = {
-        f.name: ForbiddenDirection(
-            name=f.name,
-            description=f.description,
-            correlated_factors=list(f.correlated_factors),
-            typical_correlation=f.typical_correlation,
-            reason=f.reason,
-            occurrence_count=f.occurrence_count,
-        )
-        for f in existing_forbidden
-    }
-
-    for candidate in rejected:
-        formula = candidate.get("formula", "")
-        factor_id = candidate.get("factor_id", formula[:60])
-        rejection_reason = candidate.get("rejection_reason", "")
-        max_corr = candidate.get("max_correlation", 0.0)
-        correlated_with = candidate.get("correlated_with", "")
-
-        # Only track correlation-based rejections
-        if max_corr < 0.4 and "correlation" not in rejection_reason.lower():
-            continue
-
-        direction_name = _classify_forbidden_direction(formula)
-        if direction_name is None:
-            # Detect generic high-correlation cluster
-            if max_corr >= 0.5:
-                ops = _extract_operators(formula)
-                feats = _extract_features(formula)
-                direction_name = f"HighCorr: {'+'.join(ops[:2])}({','.join(feats[:2])})"
-            else:
-                continue
-
-        if direction_name in direction_map:
-            d = direction_map[direction_name]
-            d.occurrence_count += 1
-            if correlated_with and correlated_with not in d.correlated_factors:
-                d.correlated_factors.append(correlated_with)
-                if len(d.correlated_factors) > 10:
-                    d.correlated_factors = d.correlated_factors[-10:]
-            # Update typical correlation as running average
-            if max_corr > 0:
-                d.typical_correlation = (
-                    d.typical_correlation * (d.occurrence_count - 1) + max_corr
-                ) / d.occurrence_count
-        else:
-            direction_map[direction_name] = ForbiddenDirection(
-                name=direction_name,
-                description=f"Rejected due to: {rejection_reason}",
-                correlated_factors=[correlated_with] if correlated_with else [],
-                typical_correlation=max_corr,
-                reason=rejection_reason or f"High correlation (rho={max_corr:.2f})",
-                occurrence_count=1,
-            )
-
-    return list(direction_map.values())
-
-
-def _derive_insights(
-    admitted: List[dict],
-    rejected: List[dict],
-    batch_number: int,
-) -> List[StrategicInsight]:
-    """Derive higher-level strategic insights from a batch."""
-    insights: List[StrategicInsight] = []
-    if not admitted and not rejected:
-        return insights
-
-    total = len(admitted) + len(rejected)
-    admission_rate = len(admitted) / total if total > 0 else 0.0
-
-    # Insight: overall batch success rate
-    if total >= 5:
-        if admission_rate > 0.3:
-            insights.append(StrategicInsight(
-                insight="Current direction is productive with high admission rate",
-                evidence=f"Batch {batch_number}: {len(admitted)}/{total} admitted ({admission_rate:.0%})",
-                batch_source=batch_number,
-            ))
-        elif admission_rate < 0.05:
-            insights.append(StrategicInsight(
-                insight="Current direction is exhausted, need to pivot to new operator combinations",
-                evidence=f"Batch {batch_number}: only {len(admitted)}/{total} admitted ({admission_rate:.0%})",
-                batch_source=batch_number,
-            ))
-
-    # Insight: operator frequency analysis
-    admitted_ops = Counter()
-    rejected_ops = Counter()
-    for c in admitted:
-        for op in _extract_operators(c.get("formula", "")):
-            admitted_ops[op] += 1
-    for c in rejected:
-        for op in _extract_operators(c.get("formula", "")):
-            rejected_ops[op] += 1
-
-    # Find operators that appear disproportionately in admitted vs rejected
-    for op, count in admitted_ops.most_common(5):
-        rej_count = rejected_ops.get(op, 0)
-        if count >= 3 and (rej_count == 0 or count / max(rej_count, 1) > 2.0):
-            insights.append(StrategicInsight(
-                insight=f"Operator '{op}' is highly productive in current search",
-                evidence=f"Appeared in {count} admitted vs {rej_count} rejected factors",
-                batch_source=batch_number,
-            ))
-
-    # Insight: feature analysis
-    admitted_feats = Counter()
-    for c in admitted:
-        for feat in _extract_features(c.get("formula", "")):
-            admitted_feats[feat] += 1
-
-    if admitted_feats:
-        top_feat, top_count = admitted_feats.most_common(1)[0]
-        if top_count >= 3:
-            insights.append(StrategicInsight(
-                insight=f"Feature '{top_feat}' appears frequently in successful factors",
-                evidence=f"Present in {top_count}/{len(admitted)} admitted factors",
-                batch_source=batch_number,
-            ))
-
-    # Insight: non-linear vs linear
-    nonlinear_ops = {"IfElse", "Skew", "Kurt", "Square", "Pow", "Log", "Or", "And"}
-    admitted_nonlinear = sum(
-        1 for c in admitted
-        if any(op in nonlinear_ops for op in _extract_operators(c.get("formula", "")))
-    )
-    if len(admitted) >= 3 and admitted_nonlinear / len(admitted) > 0.6:
-        insights.append(StrategicInsight(
-            insight="Non-linear transformations outperform linear ones in current regime",
-            evidence=f"{admitted_nonlinear}/{len(admitted)} admitted factors use non-linear operators",
-            batch_source=batch_number,
-        ))
-
-    return insights
-
-
-# ---------------------------------------------------------------------------
-# Public API: Memory Formation
-# ---------------------------------------------------------------------------
-
-def form_memory(
-    memory: ExperienceMemory,
-    trajectory: List[dict],
-    batch_number: int = 0,
-) -> ExperienceMemory:
-    """Memory Formation operator F(M, tau).
-
-    Analyzes the mining trajectory tau and forms new experience memories
-    to be merged into the existing memory via the evolution operator.
-
-    Parameters
-    ----------
-    memory : ExperienceMemory
-        Current memory state (used for context, not modified in place).
-    trajectory : list[dict]
-        Batch of evaluated candidates. Each dict should contain:
-        - formula: str - the DSL formula
-        - factor_id: str - unique identifier
-        - ic: float - information coefficient
-        - icir: float - IC information ratio
-        - max_correlation: float - max correlation with existing factors
-        - correlated_with: str - ID of most correlated existing factor
-        - admitted: bool - whether the factor was admitted to the library
-        - rejection_reason: str - reason for rejection (if rejected)
-    batch_number : int
-        Current batch/iteration number.
-
-    Returns
-    -------
-    ExperienceMemory
-        A *new* ExperienceMemory containing only the newly formed entries
-        (to be merged by the evolution operator).
-    """
-    admitted, rejected = _analyze_admissions(trajectory)
-
-    # Extract patterns
-    new_success = _extract_success_patterns(admitted, memory.success_patterns)
-    new_forbidden = _extract_forbidden_directions(rejected, memory.forbidden_directions)
-    new_insights = _derive_insights(admitted, rejected, batch_number)
-
-    # Build updated mining state
-    new_state = MiningState(
-        library_size=memory.state.library_size + len(admitted),
-        recent_admissions=[
-            {
-                "factor_id": c.get("factor_id", ""),
-                "formula": c.get("formula", ""),
-                "ic": c.get("ic", 0.0),
-                "batch": batch_number,
-            }
-            for c in admitted
-        ],
-        recent_rejections=[
-            {
-                "factor_id": c.get("factor_id", ""),
-                "formula": c.get("formula", ""),
-                "reason": c.get("rejection_reason", ""),
-                "max_correlation": c.get("max_correlation", 0.0),
-                "batch": batch_number,
-            }
-            for c in rejected[-20:]  # Keep only last 20 rejections
-        ],
-        domain_saturation=_compute_domain_saturation(
-            memory.state.domain_saturation, admitted, rejected
-        ),
-        admission_log=memory.state.admission_log + [
-            {
-                "batch": batch_number,
-                "admitted": len(admitted),
-                "rejected": len(rejected),
-                "admission_rate": len(admitted) / max(len(trajectory), 1),
-            }
-        ],
-    )
-
-    return ExperienceMemory(
-        state=new_state,
-        success_patterns=new_success,
-        forbidden_directions=new_forbidden,
-        insights=new_insights,
-        version=memory.version,
-    )
-
-
-def _compute_domain_saturation(
-    existing_saturation: Dict[str, float],
-    admitted: List[dict],
-    rejected: List[dict],
-) -> Dict[str, float]:
-    """Compute per-category domain saturation metrics.
-
-    Saturation increases when many candidates in a category are rejected
-    due to high correlation (the domain is "full").
-    """
-    saturation = dict(existing_saturation)
-
-    # Count category attempts and rejections
-    category_attempts: Dict[str, int] = defaultdict(int)
-    category_rejections: Dict[str, int] = defaultdict(int)
-
-    for candidate in admitted + rejected:
-        formula = candidate.get("formula", "")
-        category = _classify_success_pattern(formula) or "Other"
-        category_attempts[category] += 1
-
-    for candidate in rejected:
-        formula = candidate.get("formula", "")
-        max_corr = candidate.get("max_correlation", 0.0)
-        if max_corr >= 0.4:
-            category = _classify_success_pattern(formula) or "Other"
-            category_rejections[category] += 1
-
-    # Update saturation with exponential moving average
-    alpha = 0.3
-    for category, attempts in category_attempts.items():
-        if attempts > 0:
-            batch_saturation = category_rejections.get(category, 0) / attempts
-            old = saturation.get(category, 0.0)
-            saturation[category] = (1 - alpha) * old + alpha * batch_saturation
-
-    return saturation
diff --git a/src/factorminer/factorminer/memory/kg_retrieval.py b/src/factorminer/factorminer/memory/kg_retrieval.py
deleted file mode 100644
index 5006091..0000000
--- a/src/factorminer/factorminer/memory/kg_retrieval.py
+++ /dev/null
@@ -1,336 +0,0 @@
-"""Enhanced memory retrieval combining Knowledge Graph + Embeddings + flat memory."""
-
-from __future__ import annotations
-
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-from src.factorminer.factorminer.memory.memory_store import ExperienceMemory
-from src.factorminer.factorminer.memory.retrieval import retrieve_memory
-
-# Optional imports -- presence checked at call time
-try:
-    from factorminer.memory.knowledge_graph import FactorKnowledgeGraph
-except ImportError:
-    FactorKnowledgeGraph = None  # type: ignore[assignment,misc]
-
-try:
-    from factorminer.memory.embeddings import FormulaEmbedder
-except ImportError:
-    FormulaEmbedder = None  # type: ignore[assignment,misc]
-
-
-def retrieve_memory_enhanced(
-    memory: ExperienceMemory,
-    library_state: Optional[Dict[str, Any]] = None,
-    max_success: int = 8,
-    max_forbidden: int = 10,
-    max_insights: int = 10,
-    kg: Optional[FactorKnowledgeGraph] = None,  # type: ignore[type-arg]
-    embedder: Optional[FormulaEmbedder] = None,  # type: ignore[type-arg]
-) -> Dict[str, Any]:
-    """Enhanced memory retrieval operator R+(M, L, KG, E).
-
-    Calls the base :func:`retrieve_memory` first, then augments the
-    returned dict with additional prompt-oriented keys derived from the
-    knowledge graph and embedder.
-
-    Parameters
-    ----------
-    memory : ExperienceMemory
-        The flat experience memory.
-    library_state : dict, optional
-        Current library diagnostics.
-    max_success, max_forbidden, max_insights : int
-        Limits forwarded to the base retrieval.
-    kg : FactorKnowledgeGraph, optional
-        Knowledge graph instance.
-    embedder : FormulaEmbedder, optional
-        Formula embedder instance.
-
-    Returns
-    -------
-    dict
-        Base memory signal plus the four additional keys above.
-    """
-    # Base retrieval
-    result = retrieve_memory(
-        memory,
-        library_state=library_state,
-        max_success=max_success,
-        max_forbidden=max_forbidden,
-        max_insights=max_insights,
-    )
-
-    # Default augmented keys
-    result["complementary_patterns"] = []
-    result["conflict_warnings"] = []
-    result["operator_cooccurrence"] = []
-    result["semantic_neighbors"] = []
-    result["semantic_duplicates"] = []
-    result["semantic_gaps"] = []
-
-    # ----------------------------------------------------------------
-    # Knowledge-graph augmentations
-    # ----------------------------------------------------------------
-    if kg is not None:
-        # Complementary patterns: for each recently admitted factor,
-        # find structurally complementary neighbours.
-        complementary: List[str] = []
-        seen: Set[str] = set()
-        for admission in memory.state.recent_admissions[-5:]:
-            fid = admission.get("factor_id", "")
-            if not fid:
-                continue
-            for comp in kg.find_complementary_patterns(fid, max_hops=2):
-                if comp not in seen:
-                    seen.add(comp)
-                    complementary.append(_describe_factor_node(kg, comp))
-        result["complementary_patterns"] = complementary
-
-        # Conflict warnings: saturated regions
-        saturated_regions = kg.find_saturated_regions(threshold=0.5)
-        result["conflict_warnings"] = [
-            _describe_conflict_cluster(kg, region) for region in saturated_regions
-        ]
-
-        # Operator co-occurrence
-        cooc = kg.get_operator_cooccurrence()
-        # Sort by count descending, take top 20
-        top_cooc = sorted(cooc.items(), key=lambda x: x[1], reverse=True)[:20]
-        result["operator_cooccurrence"] = [
-            f"{a} + {b} (seen {count} times)" for (a, b), count in top_cooc
-        ]
-
-    # ----------------------------------------------------------------
-    # Embedding-based augmentations
-    # ----------------------------------------------------------------
-    if embedder is not None:
-        _seed_embedder_from_memory(memory, kg, embedder)
-        semantic_neighbors, semantic_duplicates = _collect_semantic_context(
-            memory=memory,
-            kg=kg,
-            embedder=embedder,
-        )
-        result["semantic_neighbors"] = semantic_neighbors
-        result["semantic_duplicates"] = semantic_duplicates
-        result["semantic_gaps"] = _find_semantic_gaps(memory, kg, embedder)
-
-    # ----------------------------------------------------------------
-    # Augment prompt text
-    # ----------------------------------------------------------------
-    extra_sections: List[str] = []
-
-    if result["complementary_patterns"]:
-        extra_sections.append("=== COMPLEMENTARY PATTERNS (explore) ===")
-        extra_sections.append(
-            "Factors structurally complementary to recent admissions:"
-        )
-        for fid in result["complementary_patterns"][:8]:
-            extra_sections.append(f"  - {fid}")
-        extra_sections.append("")
-
-    if result["conflict_warnings"]:
-        extra_sections.append("=== SATURATION WARNINGS ===")
-        extra_sections.append(
-            "The following factor clusters are highly correlated -- "
-            "avoid generating variants:"
-        )
-        for cluster in result["conflict_warnings"][:5]:
-            extra_sections.append(f"  Cluster: {', '.join(cluster[:6])}")
-        extra_sections.append("")
-
-    if result["semantic_gaps"]:
-        extra_sections.append("=== SEMANTIC GAPS (underexplored) ===")
-        extra_sections.append(
-            "Operators present in success patterns but underused in the library:"
-        )
-        for op in result["semantic_gaps"][:10]:
-            extra_sections.append(f"  - {op}")
-        extra_sections.append("")
-
-    if result["semantic_neighbors"]:
-        extra_sections.append("=== SEMANTIC NEIGHBORS (similar library factors) ===")
-        for item in result["semantic_neighbors"][:8]:
-            extra_sections.append(f"  - {item}")
-        extra_sections.append("")
-
-    if result["semantic_duplicates"]:
-        extra_sections.append("=== SEMANTIC DUPLICATES (near-duplicate risk) ===")
-        for item in result["semantic_duplicates"][:5]:
-            extra_sections.append(f"  - {item}")
-        extra_sections.append("")
-
-    if extra_sections:
-        result["prompt_text"] += "\n" + "\n".join(extra_sections)
-
-    return result
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _find_semantic_gaps(
-    memory: ExperienceMemory,
-    kg: Optional[FactorKnowledgeGraph],  # type: ignore[type-arg]
-    embedder: Optional[FormulaEmbedder],  # type: ignore[type-arg]
-) -> List[str]:
-    """Identify success-pattern operators with poor semantic coverage."""
-    import re
-
-    template_ops: Set[str] = set()
-    op_pattern = re.compile(r"\b([A-Z][a-zA-Z]+)\(")
-
-    for pat in memory.success_patterns:
-        for match in op_pattern.finditer(pat.template):
-            template_ops.add(match.group(1))
-
-    if not template_ops:
-        return []
-
-    if embedder is None:
-        return sorted(template_ops)
-
-    # A pattern is considered underexplored when it has no close semantic
-    # neighbors in the current library representation.
-    uncovered_ops: Set[str] = set()
-    anchors = list(memory.success_patterns[:10])
-    if not anchors:
-        return sorted(template_ops)
-
-    for pat in anchors:
-        nearest = embedder.find_nearest(pat.template, k=1)
-        best_similarity = nearest[0][1] if nearest else 0.0
-        if best_similarity < 0.72:
-            for match in op_pattern.finditer(pat.template):
-                uncovered_ops.add(match.group(1))
-
-    if not uncovered_ops and kg is None:
-        return sorted(template_ops)
-
-    if not uncovered_ops:
-        # Fall back to the operators that are entirely absent from the admitted set.
-        used_ops: Set[str] = set()
-        if kg is not None:
-            for node in kg.list_factor_nodes(admitted_only=True):
-                used_ops.update(node.operators)
-        uncovered_ops = template_ops - used_ops
-
-    return sorted(uncovered_ops or template_ops)
-
-
-def _seed_embedder_from_memory(
-    memory: ExperienceMemory,
-    kg: Optional[FactorKnowledgeGraph],  # type: ignore[type-arg]
-    embedder: FormulaEmbedder,  # type: ignore[type-arg]
-) -> None:
-    """Ensure the embedder cache reflects the current known factors."""
-    seen: Set[str] = set()
-
-    if kg is not None:
-        for node in kg.list_factor_nodes(admitted_only=True):
-            if node.factor_id and node.formula and node.factor_id not in seen:
-                embedder.embed(node.factor_id, node.formula)
-                seen.add(node.factor_id)
-
-    for admission in memory.state.recent_admissions[-10:]:
-        fid = admission.get("factor_id", "")
-        formula = admission.get("formula", "")
-        if fid and formula and fid not in seen:
-            embedder.embed(fid, formula)
-            seen.add(fid)
-
-
-def _collect_semantic_context(
-    memory: ExperienceMemory,
-    kg: Optional[FactorKnowledgeGraph],  # type: ignore[type-arg]
-    embedder: FormulaEmbedder,  # type: ignore[type-arg]
-    max_neighbors: int = 8,
-    similarity_threshold: float = 0.72,
-) -> Tuple[List[str], List[str]]:
-    """Collect semantically similar neighbors and duplicate warnings."""
-    anchors: List[Tuple[str, str, str]] = []
-    for admission in memory.state.recent_admissions[-5:]:
-        fid = admission.get("factor_id", "")
-        formula = admission.get("formula", "")
-        if fid and formula:
-            anchors.append(("recent admission", fid, formula))
-
-    if not anchors:
-        for pattern in memory.success_patterns[:5]:
-            if pattern.template:
-                anchors.append(("success pattern", pattern.name, pattern.template))
-
-    semantic_neighbors: List[str] = []
-    semantic_duplicates: List[str] = []
-    seen_matches: Set[Tuple[str, str]] = set()
-
-    if embedder.cache_size == 0:
-        return semantic_neighbors, semantic_duplicates
-
-    for anchor_kind, anchor_id, formula in anchors:
-        nearest = embedder.find_nearest(formula, k=min(5, embedder.cache_size))
-        for match_id, similarity in nearest:
-            if anchor_id == match_id:
-                continue
-            if similarity < similarity_threshold:
-                continue
-            match_key = (anchor_id, match_id)
-            if match_key in seen_matches:
-                continue
-            seen_matches.add(match_key)
-            match_desc = _describe_factor_node(kg, match_id)
-            if match_desc == match_id:
-                semantic_neighbors.append(
-                    f"{anchor_kind} {anchor_id} -> {match_id} (sim={similarity:.2f})"
-                )
-            else:
-                semantic_neighbors.append(
-                    f"{anchor_kind} {anchor_id} -> {match_desc} (sim={similarity:.2f})"
-                )
-            if similarity >= 0.90:
-                semantic_duplicates.append(
-                    f"{anchor_kind} {anchor_id} is very close to {match_id} "
-                    f"(sim={similarity:.2f})"
-                )
-            if len(semantic_neighbors) >= max_neighbors:
-                return semantic_neighbors, semantic_duplicates
-
-    return semantic_neighbors, semantic_duplicates
-
-
-def _describe_factor_node(
-    kg: FactorKnowledgeGraph,  # type: ignore[type-arg]
-    factor_id: str,
-) -> str:
-    """Render a factor node into short prompt-friendly text."""
-    if kg is None:
-        return factor_id
-
-    node = kg.get_factor_node(factor_id)
-    if node is None:
-        return factor_id
-
-    category = node.category or "unknown"
-    ic_mean = node.ic_mean
-    formula = node.formula
-    summary = factor_id
-    if category:
-        summary += f" [{category}]"
-    if ic_mean is not None:
-        summary += f" IC={float(ic_mean):.4f}"
-    if formula:
-        summary += f": {formula[:80]}"
-        if len(formula) > 80:
-            summary += "..."
-    return summary
-
-
-def _describe_conflict_cluster(
-    kg: FactorKnowledgeGraph,  # type: ignore[type-arg]
-    cluster: Set[str],
-) -> str:
-    """Render one saturated cluster into short text."""
-    described = [_describe_factor_node(kg, factor_id) for factor_id in sorted(cluster)]
-    return " | ".join(described[:3])
diff --git a/src/factorminer/factorminer/memory/knowledge_graph.py b/src/factorminer/factorminer/memory/knowledge_graph.py
deleted file mode 100644
index b782135..0000000
--- a/src/factorminer/factorminer/memory/knowledge_graph.py
+++ /dev/null
@@ -1,418 +0,0 @@
-"""Factor Knowledge Graph for lineage tracking and structural analysis.
-
-Uses a NetworkX DiGraph to model relationships between factors, operators,
-and feature inputs. Supports:
-- Factor derivation lineage (parent -> child mutations)
-- Correlation-based edges for saturation detection
-- Operator co-occurrence analysis for diversity guidance
-- Complementary pattern discovery via BFS
-"""
-
-from __future__ import annotations
-
-import json
-from collections import defaultdict
-from dataclasses import dataclass, field, asdict
-from enum import Enum
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
-
-import numpy as np
-
-try:
-    import networkx as nx
-except ImportError:
-    nx = None  # type: ignore[assignment]
-
-
-class EdgeType(Enum):
-    """Types of edges in the factor knowledge graph."""
-
-    DERIVED_FROM = "derived_from"
-    CORRELATED_WITH = "correlated_with"
-    USES_OPERATOR = "uses_operator"
-    COMPLEMENTARY = "complementary"
-    CONFLICTS = "conflicts"
-
-
-@dataclass
-class FactorNode:
-    """A node in the factor knowledge graph representing a single factor.
-
-    Attributes
-    ----------
-    factor_id : str
-        Unique identifier for the factor.
-    formula : str
-        DSL formula string.
-    ic_mean : float
-        Mean information coefficient.
-    category : str
-        Factor category (e.g., "momentum", "mean_reversion").
-    operators : list[str]
-        List of operator names used in the formula.
-    features : list[str]
-        List of input features (e.g., "$close", "$volume").
-    batch_number : int
-        Batch in which the factor was generated.
-    admitted : bool
-        Whether the factor was admitted to the library.
-    embedding : ndarray or None
-        Optional semantic embedding vector.
-    """
-
-    factor_id: str
-    formula: str
-    ic_mean: float = 0.0
-    category: str = ""
-    operators: List[str] = field(default_factory=list)
-    features: List[str] = field(default_factory=list)
-    batch_number: int = 0
-    admitted: bool = False
-    embedding: Optional[np.ndarray] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        d = asdict(self)
-        if self.embedding is not None:
-            d["embedding"] = self.embedding.tolist()
-        else:
-            d["embedding"] = None
-        return d
-
-    @classmethod
-    def from_dict(cls, d: Dict[str, Any]) -> FactorNode:
-        embedding = d.get("embedding")
-        if embedding is not None:
-            embedding = np.array(embedding, dtype=np.float32)
-        return cls(
-            factor_id=d["factor_id"],
-            formula=d.get("formula", ""),
-            ic_mean=d.get("ic_mean", 0.0),
-            category=d.get("category", ""),
-            operators=d.get("operators", []),
-            features=d.get("features", []),
-            batch_number=d.get("batch_number", 0),
-            admitted=d.get("admitted", False),
-            embedding=embedding,
-        )
-
-
-def _ensure_networkx() -> None:
-    """Raise a clear error if networkx is not installed."""
-    if nx is None:
-        raise ImportError(
-            "networkx is required for FactorKnowledgeGraph. "
-            "Install it with: pip install networkx"
-        )
-
-
-class FactorKnowledgeGraph:
-    """Directed graph tracking factor lineage and relationships.
-
-    Uses ``networkx.DiGraph`` internally. Factor nodes store a
-    :class:`FactorNode` dataclass; operator nodes are prefixed with
-    ``op:``. Factor metadata retains the declared features even though
-    the graph currently materializes operator structure explicitly.
-    """
-
-    def __init__(self) -> None:
-        _ensure_networkx()
-        self._graph: nx.DiGraph = nx.DiGraph()
-
-    # ------------------------------------------------------------------
-    # Node operations
-    # ------------------------------------------------------------------
-
-    def add_factor(self, node: FactorNode) -> None:
-        """Add or replace a factor node and auto-create USES_OPERATOR edges.
-
-        For each operator in ``node.operators``, an ``op:{name}`` node
-        is created (if absent) and a USES_OPERATOR edge is drawn from
-        the factor to that operator node.
-        """
-        if self._graph.has_node(node.factor_id):
-            self.remove_factor(node.factor_id)
-
-        self._graph.add_node(
-            node.factor_id,
-            node_type="factor",
-            data=node.to_dict(),
-        )
-
-        for op in node.operators:
-            op_id = f"op:{op}"
-            if not self._graph.has_node(op_id):
-                self._graph.add_node(op_id, node_type="operator")
-            self._graph.add_edge(
-                node.factor_id,
-                op_id,
-                edge_type=EdgeType.USES_OPERATOR.value,
-            )
-
-    def get_factor_node(self, factor_id: str) -> Optional[FactorNode]:
-        """Return a factor node by id, or ``None`` if missing."""
-        attrs = self._graph.nodes.get(factor_id)
-        if not attrs or attrs.get("node_type") != "factor":
-            return None
-        data = attrs.get("data", {})
-        if not isinstance(data, dict):
-            return None
-        try:
-            return FactorNode.from_dict(data)
-        except Exception:
-            return None
-
-    def iter_factor_nodes(
-        self,
-        admitted_only: bool = False,
-    ) -> Iterable[FactorNode]:
-        """Yield factor nodes currently present in the graph."""
-        for node_id, attrs in self._graph.nodes(data=True):
-            if attrs.get("node_type") != "factor":
-                continue
-            data = attrs.get("data", {})
-            if not isinstance(data, dict):
-                continue
-            if admitted_only and not data.get("admitted", False):
-                continue
-            try:
-                yield FactorNode.from_dict(data)
-            except Exception:
-                continue
-
-    def list_factor_nodes(self, admitted_only: bool = False) -> List[FactorNode]:
-        """Return all factor nodes as a list."""
-        return list(self.iter_factor_nodes(admitted_only=admitted_only))
-
-    # ------------------------------------------------------------------
-    # Edge operations
-    # ------------------------------------------------------------------
-
-    def add_correlation_edge(
-        self,
-        a: str,
-        b: str,
-        rho: float,
-        threshold: float = 0.4,
-    ) -> None:
-        """Add a CORRELATED_WITH edge if ``|rho| >= threshold``."""
-        if abs(rho) >= threshold:
-            self._graph.add_edge(
-                a,
-                b,
-                edge_type=EdgeType.CORRELATED_WITH.value,
-                rho=rho,
-            )
-            self._graph.add_edge(
-                b,
-                a,
-                edge_type=EdgeType.CORRELATED_WITH.value,
-                rho=rho,
-            )
-
-    def add_derivation_edge(
-        self,
-        child: str,
-        parent: str,
-        mutation_type: str = "",
-    ) -> None:
-        """Add a DERIVED_FROM edge from *child* to *parent*."""
-        self._graph.add_edge(
-            child,
-            parent,
-            edge_type=EdgeType.DERIVED_FROM.value,
-            mutation_type=mutation_type,
-        )
-
-    def remove_factor(self, factor_id: str) -> bool:
-        """Remove a factor and prune orphaned auxiliary nodes.
-
-        Returns ``True`` when the factor was present.
-        """
-        if not self._graph.has_node(factor_id):
-            return False
-
-        self._graph.remove_node(factor_id)
-        self._prune_orphan_aux_nodes()
-        return True
-
-    # ------------------------------------------------------------------
-    # Query operations
-    # ------------------------------------------------------------------
-
-    def find_complementary_patterns(
-        self,
-        factor_id: str,
-        max_hops: int = 2,
-    ) -> List[str]:
-        """Find factors complementary to *factor_id* via BFS.
-
-        A complementary factor is one that:
-        1. Is reachable within *max_hops* in the undirected view, and
-        2. Is NOT directly correlated with the source factor, and
-        3. Uses at least one different operator.
-
-        Returns a list of factor IDs.
-        """
-        if not self._graph.has_node(factor_id):
-            return []
-
-        # Collect correlated neighbours (direct CORRELATED_WITH edges)
-        correlated: Set[str] = set()
-        for _, nbr, data in self._graph.edges(factor_id, data=True):
-            if data.get("edge_type") == EdgeType.CORRELATED_WITH.value:
-                correlated.add(nbr)
-        for pred, _, data in self._graph.in_edges(factor_id, data=True):
-            if data.get("edge_type") == EdgeType.CORRELATED_WITH.value:
-                correlated.add(pred)
-
-        # Source operators
-        source_ops = self._get_operators(factor_id)
-
-        # BFS on undirected view
-        undirected = self._graph.to_undirected()
-        visited: Set[str] = {factor_id}
-        frontier: List[str] = [factor_id]
-        complementary: List[str] = []
-
-        for _ in range(max_hops):
-            next_frontier: List[str] = []
-            for node in frontier:
-                for nbr in undirected.neighbors(node):
-                    if nbr in visited:
-                        continue
-                    visited.add(nbr)
-                    next_frontier.append(nbr)
-
-                    # Only consider factor nodes
-                    if self._graph.nodes[nbr].get("node_type") != "factor":
-                        continue
-                    # Skip if correlated
-                    if nbr in correlated:
-                        continue
-                    # Must use at least one different operator
-                    nbr_ops = self._get_operators(nbr)
-                    if nbr_ops and source_ops and not nbr_ops.issubset(source_ops):
-                        complementary.append(nbr)
-            frontier = next_frontier
-
-        return complementary
-
-    def find_saturated_regions(
-        self,
-        threshold: float = 0.5,
-    ) -> List[Set[str]]:
-        """Find clusters of highly correlated factors.
-
-        Builds a subgraph of CORRELATED_WITH edges where
-        ``|rho| > threshold``, then returns connected components.
-        Each component is a set of factor IDs.
-        """
-        sub = nx.Graph()
-        for u, v, data in self._graph.edges(data=True):
-            if data.get("edge_type") != EdgeType.CORRELATED_WITH.value:
-                continue
-            rho = abs(data.get("rho", 0.0))
-            if rho > threshold:
-                # Only include factor nodes
-                if (
-                    self._graph.nodes.get(u, {}).get("node_type") == "factor"
-                    and self._graph.nodes.get(v, {}).get("node_type") == "factor"
-                ):
-                    sub.add_edge(u, v)
-
-        components = list(nx.connected_components(sub))
-        # Filter out singletons
-        return [c for c in components if len(c) > 1]
-
-    def get_operator_cooccurrence(self) -> Dict[Tuple[str, str], int]:
-        """Count operator pair co-occurrences across admitted factors.
-
-        Returns a dict mapping ``(op_a, op_b)`` (sorted tuple) to count.
-        """
-        cooccurrence: Dict[Tuple[str, str], int] = defaultdict(int)
-
-        for node_id, attrs in self._graph.nodes(data=True):
-            if attrs.get("node_type") != "factor":
-                continue
-            node_data = attrs.get("data", {})
-            if not node_data.get("admitted", False):
-                continue
-
-            ops = sorted(set(node_data.get("operators", [])))
-            for i in range(len(ops)):
-                for j in range(i + 1, len(ops)):
-                    pair = (ops[i], ops[j])
-                    cooccurrence[pair] += 1
-
-        return dict(cooccurrence)
-
-    # ------------------------------------------------------------------
-    # Stats
-    # ------------------------------------------------------------------
-
-    def get_factor_count(self) -> int:
-        """Return the number of factor nodes in the graph."""
-        return sum(
-            1
-            for _, d in self._graph.nodes(data=True)
-            if d.get("node_type") == "factor"
-        )
-
-    def get_edge_count(self) -> int:
-        """Return total number of edges in the graph."""
-        return self._graph.number_of_edges()
-
-    # ------------------------------------------------------------------
-    # Serialization
-    # ------------------------------------------------------------------
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Serialize to a JSON-compatible dict via ``nx.node_link_data``."""
-        return nx.node_link_data(self._graph, edges="links")
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> FactorKnowledgeGraph:
-        """Deserialize from a dict produced by :meth:`to_dict`."""
-        kg = cls()
-        kg._graph = nx.node_link_graph(data, edges="links")
-        return kg
-
-    def save(self, path: str | Path) -> None:
-        """Persist the graph to a JSON file."""
-        path = Path(path)
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with open(path, "w") as f:
-            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
-
-    @classmethod
-    def load(cls, path: str | Path) -> FactorKnowledgeGraph:
-        """Load a graph from a JSON file."""
-        path = Path(path)
-        with open(path) as f:
-            data = json.load(f)
-        return cls.from_dict(data)
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    def _get_operators(self, factor_id: str) -> Set[str]:
-        """Return the set of operator names used by a factor."""
-        ops: Set[str] = set()
-        for _, nbr, data in self._graph.edges(factor_id, data=True):
-            if data.get("edge_type") == EdgeType.USES_OPERATOR.value:
-                # Strip "op:" prefix
-                ops.add(nbr.removeprefix("op:"))
-        return ops
-
-    def _prune_orphan_aux_nodes(self) -> None:
-        """Remove operator nodes that are no longer referenced."""
-        orphan_nodes = [
-            node_id
-            for node_id, attrs in self._graph.nodes(data=True)
-            if attrs.get("node_type") in {"operator", "feature"}
-            and self._graph.degree(node_id) == 0
-        ]
-        if orphan_nodes:
-            self._graph.remove_nodes_from(orphan_nodes)
diff --git a/src/factorminer/factorminer/memory/memory_store.py b/src/factorminer/factorminer/memory/memory_store.py
deleted file mode 100644
index 12615a1..0000000
--- a/src/factorminer/factorminer/memory/memory_store.py
+++ /dev/null
@@ -1,165 +0,0 @@
-"""Data structures for the FactorMiner experience memory system.
-
-Implements the experience memory M = {S, P_succ, P_fail, I} where:
-- S: Mining state tracking global evolution of the factor library
-- P_succ: Success patterns (recommended mining directions)
-- P_fail: Forbidden directions (directions to avoid)
-- I: Strategic insights (high-level lessons)
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass, field, asdict
-from typing import Any, Dict, List, Optional
-
-
-@dataclass
-class MiningState:
-    """Tracks the global evolution of the factor library (S).
-
-    Captures a snapshot of the current library status including size,
-    recent admission/rejection history, and per-category saturation.
-    """
-
-    library_size: int = 0
-    recent_admissions: List[dict] = field(default_factory=list)
-    recent_rejections: List[dict] = field(default_factory=list)
-    domain_saturation: Dict[str, float] = field(default_factory=dict)
-    admission_log: List[dict] = field(default_factory=list)
-
-    def to_dict(self) -> dict:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, d: dict) -> MiningState:
-        return cls(
-            library_size=d.get("library_size", 0),
-            recent_admissions=d.get("recent_admissions", []),
-            recent_rejections=d.get("recent_rejections", []),
-            domain_saturation=d.get("domain_saturation", {}),
-            admission_log=d.get("admission_log", []),
-        )
-
-
-@dataclass
-class SuccessPattern:
-    """A recommended mining direction (P_succ).
-
-    Encodes a known-effective pattern for factor construction, including
-    a canonical formula template and tracked success rate.
-    """
-
-    name: str
-    description: str
-    template: str
-    success_rate: str  # "High", "Medium", "Low"
-    example_factors: List[str] = field(default_factory=list)
-    occurrence_count: int = 0
-
-    def to_dict(self) -> dict:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, d: dict) -> SuccessPattern:
-        return cls(
-            name=d["name"],
-            description=d["description"],
-            template=d["template"],
-            success_rate=d.get("success_rate", "Medium"),
-            example_factors=d.get("example_factors", []),
-            occurrence_count=d.get("occurrence_count", 0),
-        )
-
-
-@dataclass
-class ForbiddenDirection:
-    """A forbidden mining direction (P_fail).
-
-    Encodes a pattern that consistently produces factors too correlated
-    with existing library members or that fail quality thresholds.
-    """
-
-    name: str
-    description: str
-    correlated_factors: List[str] = field(default_factory=list)
-    typical_correlation: float = 0.0
-    reason: str = ""
-    occurrence_count: int = 0
-
-    def to_dict(self) -> dict:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, d: dict) -> ForbiddenDirection:
-        return cls(
-            name=d["name"],
-            description=d["description"],
-            correlated_factors=d.get("correlated_factors", []),
-            typical_correlation=d.get("typical_correlation", 0.0),
-            reason=d.get("reason", ""),
-            occurrence_count=d.get("occurrence_count", 0),
-        )
-
-
-@dataclass
-class StrategicInsight:
-    """High-level lesson from mining (I).
-
-    Captures abstract observations about what works and what doesn't,
-    derived from accumulated mining experience across batches.
-    """
-
-    insight: str
-    evidence: str
-    batch_source: int = 0
-
-    def to_dict(self) -> dict:
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, d: dict) -> StrategicInsight:
-        return cls(
-            insight=d["insight"],
-            evidence=d["evidence"],
-            batch_source=d.get("batch_source", 0),
-        )
-
-
-@dataclass
-class ExperienceMemory:
-    """The complete experience memory M = {S, P_succ, P_fail, I}.
-
-    Persists across mining sessions and evolves with each batch of
-    evaluated factor candidates.
-    """
-
-    state: MiningState = field(default_factory=MiningState)
-    success_patterns: List[SuccessPattern] = field(default_factory=list)
-    forbidden_directions: List[ForbiddenDirection] = field(default_factory=list)
-    insights: List[StrategicInsight] = field(default_factory=list)
-    version: int = 0
-
-    def to_dict(self) -> dict:
-        return {
-            "state": self.state.to_dict(),
-            "success_patterns": [p.to_dict() for p in self.success_patterns],
-            "forbidden_directions": [f.to_dict() for f in self.forbidden_directions],
-            "insights": [i.to_dict() for i in self.insights],
-            "version": self.version,
-        }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> ExperienceMemory:
-        return cls(
-            state=MiningState.from_dict(d.get("state", {})),
-            success_patterns=[
-                SuccessPattern.from_dict(p) for p in d.get("success_patterns", [])
-            ],
-            forbidden_directions=[
-                ForbiddenDirection.from_dict(f) for f in d.get("forbidden_directions", [])
-            ],
-            insights=[
-                StrategicInsight.from_dict(i) for i in d.get("insights", [])
-            ],
-            version=d.get("version", 0),
-        )
diff --git a/src/factorminer/factorminer/memory/online_regime_memory.py b/src/factorminer/factorminer/memory/online_regime_memory.py
deleted file mode 100644
index e8cf19a..0000000
--- a/src/factorminer/factorminer/memory/online_regime_memory.py
+++ /dev/null
@@ -1,1625 +0,0 @@
-"""Online regime-aware memory system for FactorMiner.
-
-Addresses FactorMiner's core limitation: static, offline-only memory that
-ignores regime changes.  This module provides:
-
-- ``RegimeSpecificPattern`` / ``RegimeSpecificPatternStore``
-  — per-regime success/failure pattern storage with IC-based scoring
-
-- ``OnlineMemoryUpdater``
-  — streaming memory update with exponential forgetting and regime-change hooks
-
-- ``RegimeTransitionForecaster``
-  — logistic-regression-based next-regime predictor for proactive memory prep
-
-- ``OnlineRegimeMemory``
-  — top-level orchestrator integrating all components
-
-- ``MemoryForgetCurve``
-  — snapshot tracker for visualising and analysing memory decay
-
-All components are:
-  * Thread-safe (``threading.RLock``)
-  * Serialisable (``to_dict`` / ``from_dict`` + ``pickle`` compatible)
-  * Streaming-fast (< 1 ms per ``update`` call with normal loads)
-  * Pure Python + NumPy + scikit-learn (no additional dependencies)
-"""
-
-from __future__ import annotations
-
-import copy
-import json
-import logging
-import math
-import pickle
-import threading
-import time
-from collections import defaultdict, deque
-from dataclasses import dataclass, field
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Any, Dict, FrozenSet, List, Optional, Tuple
-
-import numpy as np
-
-from src.factorminer.factorminer.evaluation.regime import RegimeState, StreamingRegimeDetector, StreamingRegimeConfig
-from src.factorminer.factorminer.memory.memory_store import (
-    ExperienceMemory,
-    StrategicInsight,
-    SuccessPattern,
-)
-from src.factorminer.factorminer.memory.evolution import (
-    apply_confidence_decay,
-    bump_pattern_confidence,
-    penalise_pattern_confidence,
-)
-from src.factorminer.factorminer.memory.retrieval import retrieve_memory
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# MemorySignal — returned by OnlineRegimeMemory.retrieve()
-# ---------------------------------------------------------------------------
-
-@dataclass
-class MemorySignal:
-    """Structured memory signal for LLM prompt injection.
-
-    Wraps the standard retrieval result with regime-specific additions.
-    """
-    recommended_directions: List[dict]
-    forbidden_directions: List[dict]
-    insights: List[dict]
-    library_state: dict
-    prompt_text: str
-    # Regime-specific additions
-    current_regime: RegimeState = field(default_factory=RegimeState)
-    regime_patterns: List[dict] = field(default_factory=list)
-    cross_regime_patterns: List[dict] = field(default_factory=list)
-    forecasted_regime: Optional[RegimeState] = None
-    forecast_confidence: float = 0.0
-
-    def to_dict(self) -> dict:
-        return {
-            "recommended_directions": self.recommended_directions,
-            "forbidden_directions": self.forbidden_directions,
-            "insights": self.insights,
-            "library_state": self.library_state,
-            "prompt_text": self.prompt_text,
-            "current_regime": self.current_regime.to_dict(),
-            "regime_patterns": self.regime_patterns,
-            "cross_regime_patterns": self.cross_regime_patterns,
-            "forecasted_regime": self.forecasted_regime.to_dict()
-                if self.forecasted_regime else None,
-            "forecast_confidence": self.forecast_confidence,
-        }
-
-
-# ---------------------------------------------------------------------------
-# RegimeSpecificPattern & RegimeSpecificPatternStore
-# ---------------------------------------------------------------------------
-
-@dataclass
-class RegimeSpecificPattern:
-    """A formula pattern with per-regime performance statistics.
-
-    Attributes
-    ----------
-    formula_template : str
-        DSL formula template (may contain ``{w}`` style placeholders).
-    regime : RegimeState
-        The regime context in which this pattern was discovered.
-    ic_in_regime : float
-        Mean IC when the current market regime matches ``self.regime``.
-    ic_out_of_regime : float
-        Mean IC when the current regime does not match.
-    regime_specificity : float
-        ``ic_in_regime / (|ic_out_of_regime| + 1e-8)``.  Values >> 1 indicate
-        strong regime specialisation.
-    discovery_date : datetime
-        UTC timestamp of first observation.
-    confidence : float
-        Normalised confidence in [0, 1] based on sample count.  Decays via
-        forgetting.
-    n_observations : int
-        Number of times this pattern has been observed.
-    n_in_regime : int
-        Observations when regime matched.
-    """
-    formula_template: str
-    regime: RegimeState
-    ic_in_regime: float = 0.0
-    ic_out_of_regime: float = 0.0
-    regime_specificity: float = 1.0
-    discovery_date: datetime = field(
-        default_factory=lambda: datetime.now(tz=timezone.utc)
-    )
-    confidence: float = 1.0
-    n_observations: int = 1
-    n_in_regime: int = 0
-
-    def update_ic(self, ic: float, in_regime: bool) -> None:
-        """Online update of IC statistics using an EW running mean."""
-        self.n_observations += 1
-        alpha = 2.0 / (min(self.n_observations, 50) + 1)
-        if in_regime:
-            self.n_in_regime += 1
-            self.ic_in_regime = (1 - alpha) * self.ic_in_regime + alpha * ic
-        else:
-            self.ic_out_of_regime = (1 - alpha) * self.ic_out_of_regime + alpha * ic
-        # Recompute specificity
-        self.regime_specificity = abs(self.ic_in_regime) / (
-            abs(self.ic_out_of_regime) + 1e-8
-        )
-
-    def to_dict(self) -> dict:
-        return {
-            "formula_template": self.formula_template,
-            "regime": self.regime.to_dict(),
-            "ic_in_regime": round(self.ic_in_regime, 6),
-            "ic_out_of_regime": round(self.ic_out_of_regime, 6),
-            "regime_specificity": round(self.regime_specificity, 4),
-            "discovery_date": self.discovery_date.isoformat(),
-            "confidence": round(self.confidence, 6),
-            "n_observations": self.n_observations,
-            "n_in_regime": self.n_in_regime,
-        }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "RegimeSpecificPattern":
-        discovery_date = datetime.fromisoformat(
-            d.get("discovery_date", datetime.now(tz=timezone.utc).isoformat())
-        )
-        if discovery_date.tzinfo is None:
-            discovery_date = discovery_date.replace(tzinfo=timezone.utc)
-        return cls(
-            formula_template=d["formula_template"],
-            regime=RegimeState.from_dict(d["regime"]),
-            ic_in_regime=d.get("ic_in_regime", 0.0),
-            ic_out_of_regime=d.get("ic_out_of_regime", 0.0),
-            regime_specificity=d.get("regime_specificity", 1.0),
-            discovery_date=discovery_date,
-            confidence=d.get("confidence", 1.0),
-            n_observations=d.get("n_observations", 1),
-            n_in_regime=d.get("n_in_regime", 0),
-        )
-
-
-class RegimeSpecificPatternStore:
-    """Thread-safe store for regime-specific formula patterns.
-
-    Patterns are keyed by ``(formula_template, regime_str)`` and indexed
-    for fast retrieval by regime similarity.
-
-    Parameters
-    ----------
-    max_patterns : int
-        Maximum total patterns retained.  When full, lowest-confidence
-        patterns are evicted.
-    min_ic : float
-        Minimum |IC| threshold; patterns consistently below this are pruned.
-    cross_regime_specificity_threshold : float
-        A pattern with ``regime_specificity < threshold`` is classified as
-        a cross-regime (general) pattern.
-    """
-
-    def __init__(
-        self,
-        max_patterns: int = 500,
-        min_ic: float = 0.02,
-        cross_regime_specificity_threshold: float = 1.5,
-    ) -> None:
-        self.max_patterns = max_patterns
-        self.min_ic = min_ic
-        self.cross_regime_threshold = cross_regime_specificity_threshold
-        self._lock = threading.RLock()
-        # key: (formula_template, regime_str)
-        self._patterns: Dict[Tuple[str, str], RegimeSpecificPattern] = {}
-
-    # --- public API ---
-
-    def add_pattern(
-        self,
-        formula: str,
-        regime: RegimeState,
-        ic: float,
-    ) -> None:
-        """Add or update a pattern observation.
-
-        If the (formula, regime) pair already exists, the IC statistics
-        are updated online.  Otherwise a new entry is created.
-
-        Parameters
-        ----------
-        formula : str
-        regime : RegimeState
-            The regime active when this IC was measured.
-        ic : float
-            Observed IC (signed).
-        """
-        with self._lock:
-            key = (formula, str(regime))
-            if key in self._patterns:
-                pat = self._patterns[key]
-                pat.update_ic(ic, in_regime=True)
-                pat.confidence = min(1.0, pat.confidence + 0.05)
-            else:
-                # Also update out-of-regime IC for all *existing* patterns
-                # with a different regime tag
-                for existing_key, existing_pat in self._patterns.items():
-                    if existing_key[0] == formula and existing_key[1] != str(regime):
-                        existing_pat.update_ic(ic, in_regime=False)
-
-                # Create new pattern
-                pat = RegimeSpecificPattern(
-                    formula_template=formula,
-                    regime=regime,
-                    ic_in_regime=ic,
-                    ic_out_of_regime=0.0,
-                    confidence=1.0,
-                    n_observations=1,
-                    n_in_regime=1,
-                )
-                pat.regime_specificity = abs(ic) / (abs(0.0) + 1e-8)
-                self._patterns[key] = pat
-
-            # Evict if over capacity
-            if len(self._patterns) > self.max_patterns:
-                self._evict_weakest()
-
-    def retrieve_for_regime(
-        self,
-        current_regime: RegimeState,
-        top_k: int = 10,
-        min_confidence: float = 0.1,
-    ) -> List[RegimeSpecificPattern]:
-        """Retrieve patterns most relevant to the current regime.
-
-        Patterns are scored as:
-            score = confidence * ic_in_regime * regime_similarity
-
-        where ``regime_similarity`` is the Jaccard similarity between the
-        pattern's tagged regime and ``current_regime``.
-
-        Parameters
-        ----------
-        current_regime : RegimeState
-        top_k : int
-        min_confidence : float
-            Minimum confidence to include.
-
-        Returns
-        -------
-        list[RegimeSpecificPattern]
-            Sorted by descending relevance score.
-        """
-        with self._lock:
-            scored: List[Tuple[float, RegimeSpecificPattern]] = []
-            for pat in self._patterns.values():
-                if pat.confidence < min_confidence:
-                    continue
-                sim = pat.regime.similarity(current_regime)
-                score = pat.confidence * abs(pat.ic_in_regime) * (0.2 + 0.8 * sim)
-                scored.append((score, pat))
-            scored.sort(key=lambda x: -x[0])
-            return [p for _, p in scored[:top_k]]
-
-    def get_cross_regime_patterns(
-        self,
-        top_k: int = 10,
-        min_confidence: float = 0.1,
-    ) -> List[RegimeSpecificPattern]:
-        """Return patterns that generalise well across regimes.
-
-        A pattern qualifies as cross-regime if its ``regime_specificity``
-        is below ``cross_regime_specificity_threshold`` *and* its absolute
-        IC is meaningfully positive (>= ``min_ic``).
-
-        Returns
-        -------
-        list[RegimeSpecificPattern]
-        """
-        with self._lock:
-            cross: List[Tuple[float, RegimeSpecificPattern]] = []
-            for pat in self._patterns.values():
-                if pat.confidence < min_confidence:
-                    continue
-                if pat.regime_specificity < self.cross_regime_threshold:
-                    avg_ic = (abs(pat.ic_in_regime) + abs(pat.ic_out_of_regime)) / 2.0
-                    if avg_ic >= self.min_ic:
-                        cross.append((avg_ic * pat.confidence, pat))
-            cross.sort(key=lambda x: -x[0])
-            return [p for _, p in cross[:top_k]]
-
-    def apply_decay(self, decay_factor: float) -> None:
-        """Multiply all pattern confidences by ``decay_factor`` and prune weak ones."""
-        with self._lock:
-            to_delete = []
-            for key, pat in self._patterns.items():
-                pat.confidence = max(0.0, pat.confidence * decay_factor)
-                if pat.confidence < 0.01 and pat.n_observations > 3:
-                    to_delete.append(key)
-            for key in to_delete:
-                del self._patterns[key]
-
-    def boost_regime_patterns(self, regime: RegimeState, boost: float = 0.1) -> None:
-        """Increase confidence of patterns tagged for ``regime``."""
-        with self._lock:
-            for pat in self._patterns.values():
-                if pat.regime == regime:
-                    pat.confidence = min(1.0, pat.confidence + boost)
-
-    def penalise_regime_patterns(self, regime: RegimeState, penalty: float = 0.3) -> None:
-        """Decrease confidence of patterns tagged for ``regime``."""
-        with self._lock:
-            for pat in self._patterns.values():
-                if pat.regime == regime:
-                    pat.confidence = max(0.0, pat.confidence - penalty)
-
-    def get_stats(self) -> dict:
-        """Return aggregate statistics."""
-        with self._lock:
-            n = len(self._patterns)
-            if n == 0:
-                return {
-                    "total_patterns": 0,
-                    "avg_confidence": 0.0,
-                    "avg_ic_in_regime": 0.0,
-                    "cross_regime_count": 0,
-                }
-            confs = [p.confidence for p in self._patterns.values()]
-            ics = [p.ic_in_regime for p in self._patterns.values()]
-            cross = len(self.get_cross_regime_patterns(top_k=n))
-            return {
-                "total_patterns": n,
-                "avg_confidence": float(np.mean(confs)),
-                "avg_ic_in_regime": float(np.mean(np.abs(ics))),
-                "cross_regime_count": cross,
-            }
-
-    def to_dict(self) -> dict:
-        with self._lock:
-            return {
-                "max_patterns": self.max_patterns,
-                "min_ic": self.min_ic,
-                "cross_regime_threshold": self.cross_regime_threshold,
-                "patterns": [p.to_dict() for p in self._patterns.values()],
-            }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "RegimeSpecificPatternStore":
-        store = cls(
-            max_patterns=d.get("max_patterns", 500),
-            min_ic=d.get("min_ic", 0.02),
-            cross_regime_specificity_threshold=d.get("cross_regime_threshold", 1.5),
-        )
-        for pd in d.get("patterns", []):
-            pat = RegimeSpecificPattern.from_dict(pd)
-            key = (pat.formula_template, str(pat.regime))
-            store._patterns[key] = pat
-        return store
-
-    # --- internals ---
-
-    def _evict_weakest(self) -> None:
-        """Remove the single weakest (lowest confidence * ic) pattern."""
-        if not self._patterns:
-            return
-        worst_key = min(
-            self._patterns,
-            key=lambda k: (
-                self._patterns[k].confidence
-                * (abs(self._patterns[k].ic_in_regime) + 1e-8)
-            ),
-        )
-        del self._patterns[worst_key]
-
-
-# ---------------------------------------------------------------------------
-# OnlineMemoryUpdater
-# ---------------------------------------------------------------------------
-
-class OnlineMemoryUpdater:
-    """Streaming experience-memory updater with exponential forgetting.
-
-    Integrates with the base ``ExperienceMemory`` and the
-    ``RegimeSpecificPatternStore`` to maintain an up-to-date picture of
-    what works in the current market regime.
-
-    Thread safety
-    -------------
-    All mutating operations acquire ``self._lock`` (``threading.RLock``).
-    The ``base_memory`` is replaced atomically so readers always see a
-    consistent snapshot.
-
-    Parameters
-    ----------
-    base_memory : ExperienceMemory
-        The underlying experience memory (will be mutated in place via
-        evolution helpers).
-    forgetting_rate : float
-        Per-iteration exponential decay rate applied to pattern confidence.
-    regime_sensitivity : float
-        Weight given to regime-specific IC boosts vs generic boosts.
-        0 = ignore regime, 1 = fully regime-sensitive.
-    min_confidence : float
-        Patterns with normalised confidence below this are pruned during
-        forgetting.
-    regime_boost : float
-        Confidence increment when a pattern's regime matches the current one.
-    regime_penalty : float
-        Confidence decrement when the regime changes away from a pattern's home.
-    """
-
-    def __init__(
-        self,
-        base_memory: ExperienceMemory,
-        forgetting_rate: float = 0.01,
-        regime_sensitivity: float = 0.5,
-        min_confidence: float = 0.05,
-        regime_boost: float = 0.1,
-        regime_penalty: float = 0.3,
-    ) -> None:
-        self.forgetting_rate = forgetting_rate
-        self.regime_sensitivity = regime_sensitivity
-        self.min_confidence = min_confidence
-        self.regime_boost = regime_boost
-        self.regime_penalty = regime_penalty
-
-        self._lock = threading.RLock()
-        self._base_memory: ExperienceMemory = base_memory
-
-        # Counters
-        self._iteration: int = 0
-        self._last_decay_iteration: int = 0
-
-        # Per-regime IC accumulators: regime_str -> deque of ICs
-        self._regime_ic_history: Dict[str, deque] = defaultdict(
-            lambda: deque(maxlen=200)
-        )
-
-        # Outcome stats
-        self._outcome_counts: Dict[str, int] = defaultdict(int)
-        self._formula_regime_map: Dict[str, RegimeState] = {}
-
-    # --- public API ---
-
-    @property
-    def base_memory(self) -> ExperienceMemory:
-        """Thread-safe read of the current base memory snapshot."""
-        with self._lock:
-            return self._base_memory
-
-    def on_factor_evaluated(
-        self,
-        formula: str,
-        ic: float,
-        regime: RegimeState,
-        outcome: str,
-    ) -> None:
-        """Called immediately after each factor evaluation.
-
-        Parameters
-        ----------
-        formula : str
-            DSL formula of the evaluated candidate.
-        ic : float
-            Observed IC (signed).
-        regime : RegimeState
-            Active market regime at evaluation time.
-        outcome : str
-            One of: ``'admitted'``, ``'rejected_ic'``,
-            ``'rejected_correlation'``, ``'replaced'``.
-        """
-        t0 = time.perf_counter()
-        with self._lock:
-            self._iteration += 1
-            self._outcome_counts[outcome] += 1
-            self._formula_regime_map[formula] = regime
-            regime_key = str(regime)
-            self._regime_ic_history[regime_key].append(ic)
-
-            # Boost success patterns that match admitted factors
-            if outcome == "admitted" and abs(ic) >= 0.03:
-                boost_factor = 1 + int(
-                    self.regime_sensitivity * 2 * abs(ic) / 0.1
-                )
-                # Try to match formula against existing success pattern templates
-                for pat in self._base_memory.success_patterns:
-                    if _formula_matches_template(formula, pat.template):
-                        self._base_memory = bump_pattern_confidence(
-                            self._base_memory, pat.name, boost=boost_factor
-                        )
-
-        elapsed_ms = (time.perf_counter() - t0) * 1000
-        if elapsed_ms > 1.0:
-            logger.debug(
-                "on_factor_evaluated took %.2f ms (target < 1 ms)", elapsed_ms
-            )
-
-    def apply_forgetting(self, iterations_elapsed: int = 1) -> None:
-        """Exponentially decay pattern confidence and prune stale entries.
-
-        Parameters
-        ----------
-        iterations_elapsed : int
-            Number of mining iterations since last call to this method.
-        """
-        with self._lock:
-            self._base_memory = apply_confidence_decay(
-                self._base_memory,
-                forgetting_rate=self.forgetting_rate,
-                iterations_elapsed=iterations_elapsed,
-                min_confidence=self.min_confidence,
-            )
-            self._last_decay_iteration = self._iteration
-
-    def on_regime_change(
-        self,
-        old_regime: RegimeState,
-        new_regime: RegimeState,
-    ) -> None:
-        """React to a detected regime transition.
-
-        Actions performed:
-        1. Boost confidence of success patterns tagged for ``new_regime``.
-        2. Down-weight success patterns tagged for ``old_regime``.
-        3. Insert a regime-transition ``StrategicInsight`` into base memory.
-
-        Parameters
-        ----------
-        old_regime : RegimeState
-        new_regime : RegimeState
-        """
-        with self._lock:
-            # Boost / penalise patterns in base memory by tag matching
-            for pat in self._base_memory.success_patterns:
-                tag_new = str(new_regime)
-                tag_old = str(old_regime)
-                # We tag patterns heuristically via their description keywords
-                desc_lower = pat.description.lower()
-                name_lower = pat.name.lower()
-                new_labels_lower = {lbl.lower() for lbl in new_regime.labels}
-                old_labels_lower = {lbl.lower() for lbl in old_regime.labels}
-
-                if any(lbl in desc_lower or lbl in name_lower for lbl in new_labels_lower):
-                    self._base_memory = bump_pattern_confidence(
-                        self._base_memory, pat.name,
-                        boost=int(self.regime_boost * 10)
-                    )
-                elif any(lbl in desc_lower or lbl in name_lower for lbl in old_labels_lower):
-                    self._base_memory = penalise_pattern_confidence(
-                        self._base_memory, pat.name,
-                        penalty=self.regime_penalty,
-                    )
-
-            # Add a strategic insight about the regime transition
-            insight_text = (
-                f"Regime transition detected: {old_regime} -> {new_regime} "
-                f"at iteration {self._iteration}"
-            )
-            evidence = (
-                f"Based on EW streaming statistics. New regime labels: "
-                f"{new_regime.labels}. Old: {old_regime.labels}."
-            )
-            new_insight = StrategicInsight(
-                insight=insight_text,
-                evidence=evidence,
-                batch_source=self._iteration,
-            )
-            # Avoid duplicate back-to-back transition insights
-            if not self._base_memory.insights or (
-                self._base_memory.insights[-1].insight != insight_text
-            ):
-                self._base_memory.insights.append(new_insight)
-                # Cap insights at 50 to avoid unbounded growth
-                if len(self._base_memory.insights) > 50:
-                    self._base_memory.insights = self._base_memory.insights[-50:]
-
-    def get_memory_health_stats(self) -> dict:
-        """Return comprehensive health statistics for the memory system.
-
-        Returns
-        -------
-        dict
-            Keys: ``active_patterns``, ``avg_confidence``,
-            ``regime_distribution``, ``staleness_score``,
-            ``outcome_counts``, ``total_iterations``.
-        """
-        with self._lock:
-            mem = self._base_memory
-            all_counts = [
-                p.occurrence_count for p in mem.success_patterns
-            ] + [
-                f.occurrence_count for f in mem.forbidden_directions
-            ]
-            max_c = max(all_counts) if all_counts else 1
-            if max_c == 0:
-                max_c = 1
-            norm_confs = [c / max_c for c in all_counts]
-            avg_conf = float(np.mean(norm_confs)) if norm_confs else 0.0
-
-            # Regime distribution from IC history
-            regime_dist = {
-                k: len(v) for k, v in self._regime_ic_history.items()
-            }
-
-            # Staleness: fraction of patterns with count 0 (never updated)
-            n_patterns = len(mem.success_patterns) + len(mem.forbidden_directions)
-            n_zero = sum(1 for c in all_counts if c == 0)
-            staleness = n_zero / max(n_patterns, 1)
-
-            return {
-                "active_patterns": n_patterns,
-                "avg_confidence": round(avg_conf, 4),
-                "regime_distribution": regime_dist,
-                "staleness_score": round(staleness, 4),
-                "outcome_counts": dict(self._outcome_counts),
-                "total_iterations": self._iteration,
-                "last_decay_iteration": self._last_decay_iteration,
-                "version": mem.version,
-            }
-
-    def to_dict(self) -> dict:
-        with self._lock:
-            return {
-                "forgetting_rate": self.forgetting_rate,
-                "regime_sensitivity": self.regime_sensitivity,
-                "min_confidence": self.min_confidence,
-                "regime_boost": self.regime_boost,
-                "regime_penalty": self.regime_penalty,
-                "iteration": self._iteration,
-                "last_decay_iteration": self._last_decay_iteration,
-                "outcome_counts": dict(self._outcome_counts),
-                "base_memory": self._base_memory.to_dict(),
-                # Regime IC history stores last N ICs per regime
-                "regime_ic_history": {
-                    k: list(v) for k, v in self._regime_ic_history.items()
-                },
-            }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "OnlineMemoryUpdater":
-        mem = ExperienceMemory.from_dict(d["base_memory"])
-        updater = cls(
-            base_memory=mem,
-            forgetting_rate=d.get("forgetting_rate", 0.01),
-            regime_sensitivity=d.get("regime_sensitivity", 0.5),
-            min_confidence=d.get("min_confidence", 0.05),
-            regime_boost=d.get("regime_boost", 0.1),
-            regime_penalty=d.get("regime_penalty", 0.3),
-        )
-        updater._iteration = d.get("iteration", 0)
-        updater._last_decay_iteration = d.get("last_decay_iteration", 0)
-        updater._outcome_counts.update(d.get("outcome_counts", {}))
-        for regime_key, ic_list in d.get("regime_ic_history", {}).items():
-            updater._regime_ic_history[regime_key] = deque(ic_list, maxlen=200)
-        return updater
-
-
-# ---------------------------------------------------------------------------
-# RegimeTransitionForecaster
-# ---------------------------------------------------------------------------
-
-class RegimeTransitionForecaster:
-    """Logistic-regression forecaster for regime transitions.
-
-    Trains on the sequence of (feature_vector, next_regime_label) pairs
-    accumulated during live trading / mining.  Used to proactively load
-    regime-specific patterns *before* a transition occurs.
-
-    The feature vector is constructed inside ``_build_feature_vector`` and
-    encodes recent EW statistics (mean, vol, Hurst proxy) concatenated with
-    a one-hot encoding of the current regime dimensions.
-
-    Parameters
-    ----------
-    n_regime_classes : int
-        Number of distinct regime label combinations tracked.  Set to a
-        small number (e.g. 8 or 16) to keep the model tractable.
-    min_samples_to_fit : int
-        Minimum labelled samples before the model is fitted.
-    refit_every : int
-        Re-train every N calls to ``predict_next_regime``.
-    """
-
-    # Feature dimension: 3 (ew stats) + 3 (trend one-hot) + 3 (vol one-hot)
-    #                   + 3 (mean_rev one-hot) = 12
-    _FEATURE_DIM = 12
-
-    def __init__(
-        self,
-        min_samples_to_fit: int = 30,
-        refit_every: int = 20,
-    ) -> None:
-        self.min_samples_to_fit = min_samples_to_fit
-        self.refit_every = refit_every
-
-        self._lock = threading.RLock()
-        self._feature_history: List[np.ndarray] = []
-        self._regime_history: List[RegimeState] = []
-        self._next_regime_labels: List[str] = []  # shifted by 1
-
-        self._model = None  # sklearn LogisticRegression, lazy init
-        self._label_encoder: Dict[str, int] = {}
-        self._inv_label_encoder: Dict[int, str] = {}
-        self._predict_call_count: int = 0
-        self._fitted: bool = False
-
-        # Cache of unique regime states seen during training
-        self._known_regimes: Dict[str, RegimeState] = {}
-
-    def record_observation(
-        self,
-        regime: RegimeState,
-        features: np.ndarray,
-    ) -> None:
-        """Append one (features, regime) observation to the training buffer.
-
-        Should be called once per bar/update with the current streaming
-        feature vector and the corresponding regime.
-
-        Parameters
-        ----------
-        regime : RegimeState
-        features : np.ndarray, shape (``_FEATURE_DIM``,)
-        """
-        with self._lock:
-            self._feature_history.append(features.copy())
-            self._regime_history.append(regime)
-            regime_str = str(regime)
-            self._known_regimes[regime_str] = regime
-
-            # Build (X, y) where y[t] = regime_str[t+1]
-            if len(self._regime_history) >= 2:
-                # The label for the *previous* observation is the current regime
-                self._next_regime_labels.append(regime_str)
-
-    def fit(
-        self,
-        regime_history: Optional[List[RegimeState]] = None,
-        feature_history: Optional[np.ndarray] = None,
-    ) -> None:
-        """Fit (or re-fit) the logistic regression model.
-
-        Can be called with external data (for back-testing) or with no
-        arguments to use the internally accumulated buffer.
-
-        Parameters
-        ----------
-        regime_history : list[RegimeState] or None
-            Optional external regime sequence (length T).
-        feature_history : np.ndarray or None, shape (T, _FEATURE_DIM)
-            Optional external feature matrix.
-        """
-        with self._lock:
-            if regime_history is not None and feature_history is not None:
-                assert len(regime_history) == len(feature_history)
-                feats = feature_history
-                regimes = regime_history
-                labels = [str(r) for r in regimes[1:]]
-                X = feats[:-1]
-            else:
-                if len(self._next_regime_labels) < self.min_samples_to_fit:
-                    return
-                X = np.array(self._feature_history[:-1])
-                labels = self._next_regime_labels
-
-            unique_labels = list(set(labels))
-            if len(unique_labels) < 2:
-                return  # Cannot fit with only one class
-
-            self._label_encoder = {lbl: i for i, lbl in enumerate(unique_labels)}
-            self._inv_label_encoder = {i: lbl for lbl, i in self._label_encoder.items()}
-
-            y = np.array([self._label_encoder[lbl] for lbl in labels])
-
-            try:
-                from sklearn.linear_model import LogisticRegression
-                from sklearn.preprocessing import StandardScaler
-
-                scaler = StandardScaler()
-                X_scaled = scaler.fit_transform(X)
-
-                model = LogisticRegression(
-                    max_iter=500,
-                    solver="lbfgs",
-                    C=1.0,
-                    random_state=42,
-                )
-                model.fit(X_scaled, y)
-                self._model = (scaler, model)
-                self._fitted = True
-            except Exception as e:
-                logger.warning("RegimeTransitionForecaster fit failed: %s", e)
-                self._fitted = False
-
-    def predict_next_regime(
-        self,
-        current_features: np.ndarray,
-    ) -> Tuple[RegimeState, float]:
-        """Predict the most probable next regime.
-
-        Parameters
-        ----------
-        current_features : np.ndarray, shape (``_FEATURE_DIM``,)
-
-        Returns
-        -------
-        (RegimeState, float)
-            Predicted regime and probability.  Returns (current regime, 0.0)
-            if the model is not yet fitted.
-        """
-        with self._lock:
-            self._predict_call_count += 1
-            if self._predict_call_count % self.refit_every == 0:
-                self.fit()
-
-            if not self._fitted or self._model is None:
-                # Fall back to current regime
-                current = (
-                    self._regime_history[-1]
-                    if self._regime_history
-                    else RegimeState()
-                )
-                return current, 0.0
-
-            scaler, model = self._model
-            try:
-                X = scaler.transform(current_features.reshape(1, -1))
-                proba = model.predict_proba(X)[0]
-                best_class = int(np.argmax(proba))
-                best_prob = float(proba[best_class])
-                best_label = self._inv_label_encoder.get(best_class, "")
-                best_regime = self._known_regimes.get(best_label, RegimeState())
-                return best_regime, best_prob
-            except Exception as e:
-                logger.warning("RegimeTransitionForecaster predict failed: %s", e)
-                return RegimeState(), 0.0
-
-    def prepare_memory_for_transition(
-        self,
-        predicted_regime: RegimeState,
-        pattern_store: RegimeSpecificPatternStore,
-        boost: float = 0.15,
-    ) -> None:
-        """Pre-load (boost confidence of) patterns for the predicted regime.
-
-        Parameters
-        ----------
-        predicted_regime : RegimeState
-        pattern_store : RegimeSpecificPatternStore
-        boost : float
-            Confidence boost applied to matching patterns.
-        """
-        pattern_store.boost_regime_patterns(predicted_regime, boost=boost)
-
-    @staticmethod
-    def build_feature_vector(
-        ew_mean: float,
-        ew_std: float,
-        hurst_proxy: float,
-        regime: RegimeState,
-    ) -> np.ndarray:
-        """Build a fixed-length feature vector from streaming statistics.
-
-        Layout (12 elements):
-        [0]  ew_mean
-        [1]  ew_std
-        [2]  hurst_proxy
-        [3-5]  trend one-hot (BULL, BEAR, NEUTRAL)
-        [6-8]  vol one-hot (HIGH_VOL, LOW_VOL, NORMAL_VOL)
-        [9-11] mean_rev one-hot (TRENDING, MEAN_REVERTING, RANDOM_WALK)
-
-        Parameters
-        ----------
-        ew_mean, ew_std, hurst_proxy : float
-        regime : RegimeState
-
-        Returns
-        -------
-        np.ndarray, shape (12,)
-        """
-        from factorminer.evaluation.regime import TrendRegime, VolRegime, MeanRevRegime
-
-        trend_oh = [
-            float(regime.trend == TrendRegime.BULL),
-            float(regime.trend == TrendRegime.BEAR),
-            float(regime.trend == TrendRegime.NEUTRAL),
-        ]
-        vol_oh = [
-            float(regime.vol == VolRegime.HIGH_VOL),
-            float(regime.vol == VolRegime.LOW_VOL),
-            float(regime.vol == VolRegime.NORMAL_VOL),
-        ]
-        mr_oh = [
-            float(regime.mean_rev == MeanRevRegime.TRENDING),
-            float(regime.mean_rev == MeanRevRegime.MEAN_REVERTING),
-            float(regime.mean_rev == MeanRevRegime.RANDOM_WALK),
-        ]
-        return np.array(
-            [ew_mean, ew_std, hurst_proxy] + trend_oh + vol_oh + mr_oh,
-            dtype=np.float64,
-        )
-
-    def to_dict(self) -> dict:
-        with self._lock:
-            return {
-                "min_samples_to_fit": self.min_samples_to_fit,
-                "refit_every": self.refit_every,
-                "predict_call_count": self._predict_call_count,
-                "fitted": self._fitted,
-                "feature_history": [f.tolist() for f in self._feature_history[-500:]],
-                "regime_history": [r.to_dict() for r in self._regime_history[-500:]],
-                "next_regime_labels": self._next_regime_labels[-500:],
-                "known_regimes": {k: v.to_dict() for k, v in self._known_regimes.items()},
-            }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "RegimeTransitionForecaster":
-        forecaster = cls(
-            min_samples_to_fit=d.get("min_samples_to_fit", 30),
-            refit_every=d.get("refit_every", 20),
-        )
-        forecaster._feature_history = [
-            np.array(f, dtype=np.float64) for f in d.get("feature_history", [])
-        ]
-        forecaster._regime_history = [
-            RegimeState.from_dict(r) for r in d.get("regime_history", [])
-        ]
-        forecaster._next_regime_labels = d.get("next_regime_labels", [])
-        forecaster._known_regimes = {
-            k: RegimeState.from_dict(v)
-            for k, v in d.get("known_regimes", {}).items()
-        }
-        forecaster._predict_call_count = d.get("predict_call_count", 0)
-        if d.get("fitted", False):
-            forecaster.fit()
-        return forecaster
-
-
-# ---------------------------------------------------------------------------
-# OnlineRegimeMemory — main orchestrator
-# ---------------------------------------------------------------------------
-
-class OnlineRegimeMemory:
-    """Full online regime-aware memory system.
-
-    Integrates:
-    - ``StreamingRegimeDetector`` for bar-by-bar regime classification
-    - ``RegimeSpecificPatternStore`` for per-regime IC tracking
-    - ``OnlineMemoryUpdater`` for streaming forgetting and regime-change hooks
-    - ``RegimeTransitionForecaster`` for proactive memory preparation
-
-    Usage
-    -----
-    ::
-
-        from factorminer.memory.online_regime_memory import OnlineRegimeMemory
-        from factorminer.memory.memory_store import ExperienceMemory
-
-        mem = OnlineRegimeMemory(base_memory=ExperienceMemory(), config={})
-
-        # In the mining loop, after each bar of market data:
-        mem.update_market(returns=bar_returns)
-
-        # After each factor evaluation:
-        mem.update(formula, signals, ic, market_data, outcome)
-
-        # At generation time:
-        signal = mem.retrieve(library_state, market_data)
-        print(signal.prompt_text)
-
-    Parameters
-    ----------
-    base_memory : ExperienceMemory
-    config : dict
-        Optional configuration overrides.  Keys and defaults:
-
-        - ``forgetting_rate`` (0.01): per-iteration decay
-        - ``regime_sensitivity`` (0.5): how much to weight regime-specific patterns
-        - ``min_confidence`` (0.05): pruning threshold
-        - ``forget_every_n_iterations`` (10): call ``apply_forgetting`` every N evals
-        - ``max_regime_patterns`` (500): capacity of regime pattern store
-        - ``streaming_config`` ({}): forwarded to ``StreamingRegimeConfig``
-    """
-
-    def __init__(
-        self,
-        base_memory: Optional[ExperienceMemory] = None,
-        config: Optional[dict] = None,
-    ) -> None:
-        cfg = config or {}
-        if base_memory is None:
-            base_memory = ExperienceMemory()
-
-        streaming_cfg = StreamingRegimeConfig(
-            **{k: v for k, v in cfg.get("streaming_config", {}).items()
-               if k in StreamingRegimeConfig.__dataclass_fields__}
-        )
-        self._detector = StreamingRegimeDetector(config=streaming_cfg)
-        self._pattern_store = RegimeSpecificPatternStore(
-            max_patterns=cfg.get("max_regime_patterns", 500),
-        )
-        self._updater = OnlineMemoryUpdater(
-            base_memory=base_memory,
-            forgetting_rate=cfg.get("forgetting_rate", 0.01),
-            regime_sensitivity=cfg.get("regime_sensitivity", 0.5),
-            min_confidence=cfg.get("min_confidence", 0.05),
-        )
-        self._forecaster = RegimeTransitionForecaster()
-        self._forget_every = cfg.get("forget_every_n_iterations", 10)
-        self._iteration_count: int = 0
-        self._current_regime: RegimeState = RegimeState()
-        self._lock = threading.RLock()
-
-        # Track last regime for change detection
-        self._prev_regime: RegimeState = RegimeState()
-
-    # ------------------------------------------------------------------
-    # Primary API
-    # ------------------------------------------------------------------
-
-    def update_market(
-        self,
-        returns: np.ndarray,
-        volumes: Optional[np.ndarray] = None,
-    ) -> RegimeState:
-        """Process one bar of market data and update the regime state.
-
-        Call this *before* ``update()`` on any factors evaluated at this bar.
-
-        Parameters
-        ----------
-        returns : np.ndarray, shape (M,)
-        volumes : np.ndarray or None
-
-        Returns
-        -------
-        RegimeState
-            Updated current regime.
-        """
-        with self._lock:
-            new_regime = self._detector.update(returns, volumes)
-            prev = self._current_regime
-
-            if new_regime != prev:
-                self._updater.on_regime_change(prev, new_regime)
-                self._pattern_store.boost_regime_patterns(new_regime, boost=0.1)
-                self._pattern_store.penalise_regime_patterns(prev, penalty=0.15)
-
-                # Prepare memory proactively
-                feat = self._build_feature_vector(new_regime)
-                predicted, prob = self._forecaster.predict_next_regime(feat)
-                if prob > 0.5:
-                    self._forecaster.prepare_memory_for_transition(
-                        predicted, self._pattern_store
-                    )
-
-            self._prev_regime = prev
-            self._current_regime = new_regime
-
-            # Record for forecaster
-            feat = self._build_feature_vector(new_regime)
-            self._forecaster.record_observation(new_regime, feat)
-
-            return new_regime
-
-    def update(
-        self,
-        formula: str,
-        signals: np.ndarray,
-        ic: float,
-        market_data: Optional[dict] = None,
-        outcome: str = "admitted",
-    ) -> None:
-        """Single update call: detect regime from market_data, update patterns.
-
-        This is the main hook called inside the mining loop after each factor
-        evaluation.  It orchestrates:
-        1. Regime detection from ``market_data`` (if provided)
-        2. Regime-specific pattern update
-        3. Base memory update (online updater)
-        4. Periodic forgetting
-
-        Parameters
-        ----------
-        formula : str
-            DSL formula string.
-        signals : np.ndarray
-            Factor signal matrix (used only for future extension).
-        ic : float
-            Observed IC.
-        market_data : dict or None
-            Optional dict with key ``'returns'`` (np.ndarray).
-        outcome : str
-        """
-        with self._lock:
-            regime = self._current_regime
-
-            # If market_data provided, do an inline regime update
-            if market_data is not None and "returns" in market_data:
-                regime = self.update_market(
-                    market_data["returns"],
-                    market_data.get("volumes"),
-                )
-
-            # Update regime-specific pattern store
-            if abs(ic) >= 0.02:
-                self._pattern_store.add_pattern(formula, regime, ic)
-
-            # Notify online updater
-            self._updater.on_factor_evaluated(formula, ic, regime, outcome)
-
-            self._iteration_count += 1
-
-            # Periodic forgetting
-            if self._iteration_count % self._forget_every == 0:
-                self._updater.apply_forgetting(
-                    iterations_elapsed=self._forget_every
-                )
-                decay = (1.0 - self._updater.forgetting_rate) ** self._forget_every
-                self._pattern_store.apply_decay(decay)
-
-    def retrieve(
-        self,
-        library_state: Optional[dict] = None,
-        market_data: Optional[dict] = None,
-        max_success: int = 8,
-        max_forbidden: int = 10,
-        max_insights: int = 10,
-        top_regime_patterns: int = 5,
-    ) -> MemorySignal:
-        """Regime-aware memory retrieval.
-
-        Combines the standard base-memory retrieval with regime-specific
-        pattern selection and a next-regime forecast.
-
-        Parameters
-        ----------
-        library_state : dict or None
-        market_data : dict or None
-        max_success : int
-        max_forbidden : int
-        max_insights : int
-        top_regime_patterns : int
-
-        Returns
-        -------
-        MemorySignal
-        """
-        with self._lock:
-            current_regime = self._current_regime
-
-            # Update regime if market data provided
-            if market_data is not None and "returns" in market_data:
-                current_regime = self.update_market(
-                    market_data["returns"], market_data.get("volumes")
-                )
-
-            # 1. Base retrieval
-            base_result = retrieve_memory(
-                self._updater.base_memory,
-                library_state=library_state,
-                max_success=max_success,
-                max_forbidden=max_forbidden,
-                max_insights=max_insights,
-            )
-
-            # 2. Regime-specific patterns
-            regime_pats = self._pattern_store.retrieve_for_regime(
-                current_regime, top_k=top_regime_patterns
-            )
-            cross_pats = self._pattern_store.get_cross_regime_patterns(
-                top_k=top_regime_patterns // 2 + 1
-            )
-
-            # 3. Forecast next regime
-            feat = self._build_feature_vector(current_regime)
-            predicted_regime, forecast_conf = self._forecaster.predict_next_regime(feat)
-
-            # 4. Build enriched prompt text
-            regime_section = self._format_regime_section(
-                current_regime, regime_pats, cross_pats, predicted_regime, forecast_conf
-            )
-            prompt_text = base_result["prompt_text"] + "\n" + regime_section
-
-            return MemorySignal(
-                recommended_directions=base_result["recommended_directions"],
-                forbidden_directions=base_result["forbidden_directions"],
-                insights=base_result["insights"],
-                library_state=base_result["library_state"],
-                prompt_text=prompt_text,
-                current_regime=current_regime,
-                regime_patterns=[p.to_dict() for p in regime_pats],
-                cross_regime_patterns=[p.to_dict() for p in cross_pats],
-                forecasted_regime=predicted_regime if forecast_conf > 0.0 else None,
-                forecast_confidence=forecast_conf,
-            )
-
-    def get_full_status(self) -> dict:
-        """Comprehensive status: regime, patterns, health, forecasts.
-
-        Returns
-        -------
-        dict
-            Keys: ``current_regime``, ``regime_history``, ``transition_probs``,
-            ``pattern_store_stats``, ``memory_health``, ``forecasted_regime``,
-            ``forecast_confidence``, ``iteration_count``.
-        """
-        with self._lock:
-            current = self._current_regime
-            feat = self._build_feature_vector(current)
-            predicted, conf = self._forecaster.predict_next_regime(feat)
-            history = self._detector.get_regime_history(lookback=20)
-            return {
-                "current_regime": current.to_dict(),
-                "regime_history": [r.to_dict() for r in history],
-                "transition_probs": self._detector.regime_transition_probability(),
-                "pattern_store_stats": self._pattern_store.get_stats(),
-                "memory_health": self._updater.get_memory_health_stats(),
-                "forecasted_regime": predicted.to_dict() if conf > 0.0 else None,
-                "forecast_confidence": round(conf, 4),
-                "iteration_count": self._iteration_count,
-            }
-
-    # ------------------------------------------------------------------
-    # Persistence
-    # ------------------------------------------------------------------
-
-    def save(self, path: str | Path) -> None:
-        """Serialise to JSON.
-
-        Parameters
-        ----------
-        path : str or Path
-        """
-        path = Path(path)
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with self._lock:
-            data = self.to_dict()
-        with open(path, "w") as f:
-            json.dump(data, f, indent=2, ensure_ascii=False)
-
-    def load(self, path: str | Path) -> None:
-        """Deserialise from JSON.
-
-        Parameters
-        ----------
-        path : str or Path
-        """
-        with open(path) as f:
-            data = json.load(f)
-        with self._lock:
-            self._from_dict_inplace(data)
-
-    def to_dict(self) -> dict:
-        with self._lock:
-            return {
-                "_version": 1,
-                "iteration_count": self._iteration_count,
-                "current_regime": self._current_regime.to_dict(),
-                "prev_regime": self._prev_regime.to_dict(),
-                "forget_every": self._forget_every,
-                "updater": self._updater.to_dict(),
-                "pattern_store": self._pattern_store.to_dict(),
-                "forecaster": self._forecaster.to_dict(),
-            }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "OnlineRegimeMemory":
-        mem_data = d["updater"]["base_memory"]
-        base_mem = ExperienceMemory.from_dict(mem_data)
-        cfg = {"forget_every_n_iterations": d.get("forget_every", 10)}
-        obj = cls(base_memory=base_mem, config=cfg)
-        obj._from_dict_inplace(d)
-        return obj
-
-    def _from_dict_inplace(self, d: dict) -> None:
-        self._iteration_count = d.get("iteration_count", 0)
-        self._current_regime = RegimeState.from_dict(
-            d.get("current_regime", {})
-        )
-        self._prev_regime = RegimeState.from_dict(
-            d.get("prev_regime", {})
-        )
-        self._forget_every = d.get("forget_every", 10)
-        self._updater = OnlineMemoryUpdater.from_dict(d["updater"])
-        self._pattern_store = RegimeSpecificPatternStore.from_dict(
-            d["pattern_store"]
-        )
-        self._forecaster = RegimeTransitionForecaster.from_dict(
-            d["forecaster"]
-        )
-
-    # pickle support
-    def __getstate__(self) -> dict:
-        return self.to_dict()
-
-    def __setstate__(self, state: dict) -> None:
-        # Minimal init to avoid __init__ side effects
-        self._lock = threading.RLock()
-        self._from_dict_inplace(state)
-        # Rebuild detector (streaming state is not persisted)
-        self._detector = StreamingRegimeDetector()
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    def _build_feature_vector(self, regime: RegimeState) -> np.ndarray:
-        """Build a 12-element feature vector from the detector's EW state."""
-        ew_mean = self._detector._ew_mean
-        ew_std = float(np.sqrt(max(self._detector._ew_var, 0.0)))
-        # Use the ratio of fast/slow variance as a Hurst proxy
-        slow_var = max(self._detector._ew_var_slow, 1e-16)
-        fast_var = max(self._detector._ew_var, 1e-16)
-        hurst_proxy = float(np.clip(
-            0.5 + 0.5 * math.log(fast_var / slow_var + 1e-10) / (math.log(20) + 1e-10),
-            0.0, 1.0
-        ))
-        return RegimeTransitionForecaster.build_feature_vector(
-            ew_mean, ew_std, hurst_proxy, regime
-        )
-
-    @staticmethod
-    def _format_regime_section(
-        current: RegimeState,
-        regime_patterns: List[RegimeSpecificPattern],
-        cross_patterns: List[RegimeSpecificPattern],
-        predicted: RegimeState,
-        forecast_conf: float,
-    ) -> str:
-        lines = [
-            "=== REGIME-AWARE MEMORY ===",
-            f"Current regime: {current}",
-        ]
-        if forecast_conf > 0.3:
-            lines.append(
-                f"Forecasted next regime: {predicted} "
-                f"(confidence {forecast_conf:.1%})"
-            )
-        if regime_patterns:
-            lines.append("\nTop patterns for current regime:")
-            for i, p in enumerate(regime_patterns, 1):
-                lines.append(
-                    f"  {i}. {p.formula_template[:80]}  "
-                    f"[IC={p.ic_in_regime:.3f}, "
-                    f"spec={p.regime_specificity:.2f}, "
-                    f"conf={p.confidence:.2f}]"
-                )
-        if cross_patterns:
-            lines.append("\nCross-regime (universal) patterns:")
-            for i, p in enumerate(cross_patterns, 1):
-                lines.append(
-                    f"  {i}. {p.formula_template[:80]}  "
-                    f"[avg_IC={abs(p.ic_in_regime):.3f}, "
-                    f"conf={p.confidence:.2f}]"
-                )
-        lines.append("")
-        return "\n".join(lines)
-
-
-# ---------------------------------------------------------------------------
-# MemoryForgetCurve
-# ---------------------------------------------------------------------------
-
-@dataclass
-class _MemorySnapshot:
-    """Internal snapshot used by MemoryForgetCurve."""
-    iteration: int
-    timestamp: float
-    active_patterns: int
-    avg_confidence: float
-    n_regime_patterns: int
-    staleness_score: float
-    pattern_confidences: List[float]
-
-
-class MemoryForgetCurve:
-    """Track and visualise how memory evolves (and forgets) over mining iterations.
-
-    Parameters
-    ----------
-    max_snapshots : int
-        Maximum snapshots to retain in memory.
-    """
-
-    def __init__(self, max_snapshots: int = 1000) -> None:
-        self.max_snapshots = max_snapshots
-        self._snapshots: List[_MemorySnapshot] = []
-        self._lock = threading.RLock()
-
-    def record_snapshot(
-        self,
-        memory: OnlineRegimeMemory,
-        iteration: int,
-    ) -> None:
-        """Record a snapshot of the current memory state.
-
-        Parameters
-        ----------
-        memory : OnlineRegimeMemory
-        iteration : int
-            Current mining iteration number (used as x-axis).
-        """
-        status = memory.get_full_status()
-        health = status["memory_health"]
-        ps = status["pattern_store_stats"]
-
-        # Collect per-pattern confidences from the regime pattern store
-        with memory._lock:
-            confs = [
-                p.confidence
-                for p in memory._pattern_store._patterns.values()
-            ]
-
-        snap = _MemorySnapshot(
-            iteration=iteration,
-            timestamp=time.time(),
-            active_patterns=health["active_patterns"],
-            avg_confidence=health["avg_confidence"],
-            n_regime_patterns=ps["total_patterns"],
-            staleness_score=health["staleness_score"],
-            pattern_confidences=confs,
-        )
-        with self._lock:
-            self._snapshots.append(snap)
-            if len(self._snapshots) > self.max_snapshots:
-                self._snapshots = self._snapshots[-self.max_snapshots:]
-
-    def get_pattern_lifetimes(self) -> List[float]:
-        """Estimate pattern lifetimes (iterations survived) from snapshot series.
-
-        Returns
-        -------
-        list[float]
-            One entry per 'pattern birth' estimated from count increases.
-            Uses the number of iterations between when a pattern first appears
-            (count > 0) and drops below min_confidence.
-
-        Note: this is an approximation based on the active count trajectory.
-        """
-        with self._lock:
-            if len(self._snapshots) < 2:
-                return []
-            counts = [s.n_regime_patterns for s in self._snapshots]
-            iterations = [s.iteration for s in self._snapshots]
-            lifetimes = []
-            # Simple heuristic: measure spans between count peaks and troughs
-            for i in range(1, len(counts) - 1):
-                if counts[i] > counts[i - 1] and counts[i] > counts[i + 1]:
-                    # Local peak: estimate lifetime as distance to next trough
-                    for j in range(i + 1, len(counts)):
-                        if counts[j] < counts[i] * 0.5:
-                            lifetimes.append(float(iterations[j] - iterations[i]))
-                            break
-            return lifetimes
-
-    def plot_confidence_decay(self) -> None:
-        """Plot confidence decay and pattern count over iterations.
-
-        Requires matplotlib to be installed.  If not available, prints a
-        summary table instead.
-        """
-        with self._lock:
-            snapshots = list(self._snapshots)
-
-        if not snapshots:
-            print("No snapshots recorded yet.")
-            return
-
-        iterations = [s.iteration for s in snapshots]
-        avg_confs = [s.avg_confidence for s in snapshots]
-        active = [s.active_patterns for s in snapshots]
-        regime_counts = [s.n_regime_patterns for s in snapshots]
-        staleness = [s.staleness_score for s in snapshots]
-
-        try:
-            import matplotlib.pyplot as plt
-
-            fig, axes = plt.subplots(2, 2, figsize=(12, 8))
-            fig.suptitle("Memory Forget Curve", fontsize=14)
-
-            ax = axes[0, 0]
-            ax.plot(iterations, avg_confs, "b-o", markersize=3)
-            ax.set_title("Average Pattern Confidence")
-            ax.set_xlabel("Iteration")
-            ax.set_ylabel("Confidence")
-            ax.grid(True, alpha=0.3)
-
-            ax = axes[0, 1]
-            ax.plot(iterations, active, "g-o", markersize=3)
-            ax.set_title("Active Patterns (base memory)")
-            ax.set_xlabel("Iteration")
-            ax.set_ylabel("Count")
-            ax.grid(True, alpha=0.3)
-
-            ax = axes[1, 0]
-            ax.plot(iterations, regime_counts, "r-o", markersize=3)
-            ax.set_title("Regime-Specific Patterns")
-            ax.set_xlabel("Iteration")
-            ax.set_ylabel("Count")
-            ax.grid(True, alpha=0.3)
-
-            ax = axes[1, 1]
-            ax.plot(iterations, staleness, "m-o", markersize=3)
-            ax.set_title("Staleness Score (fraction of zero-count patterns)")
-            ax.set_xlabel("Iteration")
-            ax.set_ylabel("Staleness")
-            ax.grid(True, alpha=0.3)
-
-            plt.tight_layout()
-            plt.show()
-
-        except ImportError:
-            # Fallback: ASCII table
-            print(
-                f"{'Iter':>8} {'AvgConf':>10} {'Active':>8} "
-                f"{'RegimePats':>12} {'Staleness':>10}"
-            )
-            print("-" * 52)
-            for s in snapshots[::max(1, len(snapshots) // 20)]:
-                print(
-                    f"{s.iteration:>8} {s.avg_confidence:>10.4f} "
-                    f"{s.active_patterns:>8} {s.n_regime_patterns:>12} "
-                    f"{s.staleness_score:>10.4f}"
-                )
-
-    def to_dict(self) -> dict:
-        with self._lock:
-            return {
-                "max_snapshots": self.max_snapshots,
-                "snapshots": [
-                    {
-                        "iteration": s.iteration,
-                        "timestamp": s.timestamp,
-                        "active_patterns": s.active_patterns,
-                        "avg_confidence": s.avg_confidence,
-                        "n_regime_patterns": s.n_regime_patterns,
-                        "staleness_score": s.staleness_score,
-                    }
-                    for s in self._snapshots
-                ],
-            }
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "MemoryForgetCurve":
-        curve = cls(max_snapshots=d.get("max_snapshots", 1000))
-        for sd in d.get("snapshots", []):
-            snap = _MemorySnapshot(
-                iteration=sd["iteration"],
-                timestamp=sd["timestamp"],
-                active_patterns=sd["active_patterns"],
-                avg_confidence=sd["avg_confidence"],
-                n_regime_patterns=sd["n_regime_patterns"],
-                staleness_score=sd["staleness_score"],
-                pattern_confidences=[],
-            )
-            curve._snapshots.append(snap)
-        return curve
-
-
-# ---------------------------------------------------------------------------
-# Utility helpers
-# ---------------------------------------------------------------------------
-
-def _formula_matches_template(formula: str, template: str) -> bool:
-    """Heuristic check: does a formula share structural operators with a template?
-
-    Extracts capitalised operator names from both strings and tests for
-    meaningful overlap (>= 1 shared operator, or substring containment).
-    """
-    import re
-    op_re = re.compile(r"\b([A-Z][a-zA-Z]+)\(")
-    f_ops = set(op_re.findall(formula))
-    t_ops = set(op_re.findall(template))
-    if not f_ops or not t_ops:
-        return False
-    overlap = f_ops & t_ops
-    # At least 1 operator shared AND at least half of template ops present
-    return (
-        len(overlap) >= 1
-        and len(overlap) / max(len(t_ops), 1) >= 0.4
-    )
diff --git a/src/factorminer/factorminer/memory/retrieval.py b/src/factorminer/factorminer/memory/retrieval.py
deleted file mode 100644
index 2fd8519..0000000
--- a/src/factorminer/factorminer/memory/retrieval.py
+++ /dev/null
@@ -1,288 +0,0 @@
-"""Memory Retrieval operator R(M, L).
-
-Context-dependent retrieval of experience memory, producing a structured
-memory signal m for injection into the LLM generation prompt.
-
-The retrieval considers the current library state (domain saturation,
-recent rejections) to select the most relevant patterns and insights.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Dict, List, Optional
-
-from src.factorminer.factorminer.memory.memory_store import (
-    ExperienceMemory,
-    ForbiddenDirection,
-    MiningState,
-    StrategicInsight,
-    SuccessPattern,
-)
-
-
-def _score_success_pattern(
-    pattern: SuccessPattern,
-    domain_saturation: Dict[str, float],
-    saturated_threshold: float = 0.7,
-) -> float:
-    """Score a success pattern for relevance given current library state.
-
-    Patterns in saturated domains score lower; high success-rate patterns
-    with many occurrences score higher.
-    """
-    base_score = 1.0
-
-    # Success rate bonus
-    rate_bonus = {"High": 2.0, "Medium": 1.0, "Low": 0.5}
-    base_score *= rate_bonus.get(pattern.success_rate, 1.0)
-
-    # Occurrence count bonus (log scale to avoid runaway)
-    if pattern.occurrence_count > 0:
-        import math
-        base_score *= 1.0 + math.log1p(pattern.occurrence_count)
-
-    # Domain saturation penalty
-    saturation = domain_saturation.get(pattern.name, 0.0)
-    if saturation >= saturated_threshold:
-        base_score *= 0.2  # Heavily penalize saturated domains
-    elif saturation >= 0.5:
-        base_score *= 0.6
-
-    return base_score
-
-
-def _score_forbidden_direction(
-    direction: ForbiddenDirection,
-    recent_rejection_reasons: List[str],
-) -> float:
-    """Score a forbidden direction for relevance.
-
-    Directions matching recent rejection reasons score higher (more
-    important to communicate to the LLM).
-    """
-    base_score = 1.0
-
-    # Higher correlation = more important to avoid
-    base_score *= 1.0 + direction.typical_correlation
-
-    # Occurrence count: frequently encountered = important warning
-    if direction.occurrence_count > 0:
-        import math
-        base_score *= 1.0 + math.log1p(direction.occurrence_count)
-
-    # Boost if matching recent rejections
-    direction_lower = direction.name.lower()
-    for reason in recent_rejection_reasons:
-        if any(
-            word in reason.lower()
-            for word in direction_lower.split()
-            if len(word) > 3
-        ):
-            base_score *= 1.5
-            break
-
-    return base_score
-
-
-def _select_relevant_success(
-    patterns: List[SuccessPattern],
-    domain_saturation: Dict[str, float],
-    max_patterns: int = 8,
-) -> List[SuccessPattern]:
-    """Select the most relevant success patterns for the current context."""
-    if not patterns:
-        return []
-
-    scored = [
-        (pat, _score_success_pattern(pat, domain_saturation))
-        for pat in patterns
-    ]
-    scored.sort(key=lambda x: x[1], reverse=True)
-    return [pat for pat, _ in scored[:max_patterns]]
-
-
-def _select_relevant_forbidden(
-    directions: List[ForbiddenDirection],
-    recent_rejections: List[dict],
-    max_directions: int = 10,
-) -> List[ForbiddenDirection]:
-    """Select the most relevant forbidden directions for the current context."""
-    if not directions:
-        return []
-
-    recent_reasons = [
-        r.get("reason", "") for r in recent_rejections
-    ]
-    scored = [
-        (d, _score_forbidden_direction(d, recent_reasons))
-        for d in directions
-    ]
-    scored.sort(key=lambda x: x[1], reverse=True)
-    return [d for d, _ in scored[:max_directions]]
-
-
-def _format_library_state(state: MiningState) -> Dict[str, Any]:
-    """Format mining state as structured context for LLM prompt."""
-    # Identify saturated domains
-    saturated = {
-        domain: sat
-        for domain, sat in state.domain_saturation.items()
-        if sat >= 0.5
-    }
-
-    # Recent admission rate trend
-    recent_logs = state.admission_log[-5:] if state.admission_log else []
-    avg_rate = 0.0
-    if recent_logs:
-        avg_rate = sum(log.get("admission_rate", 0) for log in recent_logs) / len(recent_logs)
-
-    return {
-        "library_size": state.library_size,
-        "recent_admission_rate": round(avg_rate, 3),
-        "saturated_domains": saturated,
-        "recent_admissions_count": len(state.recent_admissions),
-        "recent_rejections_count": len(state.recent_rejections),
-    }
-
-
-def _format_for_prompt(
-    success_patterns: List[SuccessPattern],
-    forbidden_directions: List[ForbiddenDirection],
-    insights: List[StrategicInsight],
-    library_state: Dict[str, Any],
-) -> str:
-    """Format the memory signal as structured text for LLM injection.
-
-    Produces a human-readable prompt section that can be inserted into
-    the factor generation prompt to guide the LLM.
-    """
-    sections = []
-
-    # Library state
-    sections.append("=== CURRENT LIBRARY STATE ===")
-    sections.append(f"Library size: {library_state['library_size']} factors")
-    sections.append(f"Recent admission rate: {library_state['recent_admission_rate']:.1%}")
-    if library_state.get("saturated_domains"):
-        sections.append("Saturated domains (avoid):")
-        for domain, sat in library_state["saturated_domains"].items():
-            sections.append(f"  - {domain}: {sat:.0%} saturated")
-    sections.append("")
-
-    # Recommended directions
-    if success_patterns:
-        sections.append("=== RECOMMENDED DIRECTIONS (P_succ) ===")
-        for i, pat in enumerate(success_patterns, 1):
-            sections.append(f"{i}. {pat.name} [{pat.success_rate}]")
-            sections.append(f"   {pat.description}")
-            sections.append(f"   Template: {pat.template}")
-            if pat.example_factors:
-                sections.append(f"   Examples: {', '.join(pat.example_factors[:3])}")
-        sections.append("")
-
-    # Forbidden directions
-    if forbidden_directions:
-        sections.append("=== FORBIDDEN DIRECTIONS (P_fail) ===")
-        sections.append("DO NOT generate factors using these patterns:")
-        for i, fd in enumerate(forbidden_directions, 1):
-            sections.append(f"{i}. {fd.name} (rho > {fd.typical_correlation:.2f})")
-            sections.append(f"   Reason: {fd.reason}")
-            if fd.correlated_factors:
-                sections.append(f"   Correlated with: {', '.join(fd.correlated_factors[:3])}")
-        sections.append("")
-
-    # Strategic insights
-    if insights:
-        sections.append("=== STRATEGIC INSIGHTS ===")
-        for insight in insights:
-            sections.append(f"- {insight.insight}")
-            sections.append(f"  Evidence: {insight.evidence}")
-        sections.append("")
-
-    return "\n".join(sections)
-
-
-# ---------------------------------------------------------------------------
-# Public API: Memory Retrieval
-# ---------------------------------------------------------------------------
-
-def retrieve_memory(
-    memory: ExperienceMemory,
-    library_state: Optional[Dict[str, Any]] = None,
-    max_success: int = 8,
-    max_forbidden: int = 10,
-    max_insights: int = 10,
-) -> Dict[str, Any]:
-    """Memory Retrieval operator R(M, L).
-
-    Performs context-dependent retrieval matching against the current
-    library state, returning a memory signal m suitable for LLM prompt
-    injection.
-
-    Parameters
-    ----------
-    memory : ExperienceMemory
-        The experience memory to retrieve from.
-    library_state : dict, optional
-        Current library diagnostics. If None, uses the state from memory.
-        Expected keys: library_size, domain_saturation, etc.
-    max_success : int
-        Maximum number of success patterns to include.
-    max_forbidden : int
-        Maximum number of forbidden directions to include.
-    max_insights : int
-        Maximum number of insights to include.
-
-    Returns
-    -------
-    dict
-        Memory signal m with keys:
-        - recommended_directions: list of success pattern dicts
-        - forbidden_directions: list of forbidden direction dicts
-        - insights: list of insight dicts
-        - library_state: dict of library state info
-        - prompt_text: str - formatted text for LLM prompt injection
-    """
-    # Use provided library state or fall back to memory's state
-    if library_state is not None:
-        # Update memory state with external library info
-        state = MiningState(
-            library_size=library_state.get("library_size", memory.state.library_size),
-            recent_admissions=memory.state.recent_admissions,
-            recent_rejections=memory.state.recent_rejections,
-            domain_saturation=library_state.get(
-                "domain_saturation", memory.state.domain_saturation
-            ),
-            admission_log=memory.state.admission_log,
-        )
-    else:
-        state = memory.state
-
-    # Select relevant patterns
-    relevant_success = _select_relevant_success(
-        memory.success_patterns, state.domain_saturation, max_success
-    )
-    relevant_forbidden = _select_relevant_forbidden(
-        memory.forbidden_directions, state.recent_rejections, max_forbidden
-    )
-
-    # Select most recent insights (up to limit)
-    sorted_insights = sorted(
-        memory.insights, key=lambda i: i.batch_source, reverse=True
-    )
-    relevant_insights = sorted_insights[:max_insights]
-
-    # Format library state
-    lib_state_info = _format_library_state(state)
-
-    # Format as prompt text
-    prompt_text = _format_for_prompt(
-        relevant_success, relevant_forbidden, relevant_insights, lib_state_info
-    )
-
-    return {
-        "recommended_directions": [p.to_dict() for p in relevant_success],
-        "forbidden_directions": [f.to_dict() for f in relevant_forbidden],
-        "insights": [i.to_dict() for i in relevant_insights],
-        "library_state": lib_state_info,
-        "prompt_text": prompt_text,
-    }
diff --git a/src/factorminer/factorminer/operators/__init__.py b/src/factorminer/factorminer/operators/__init__.py
deleted file mode 100644
index 75b9f56..0000000
--- a/src/factorminer/factorminer/operators/__init__.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""Financial operators for factor expression evaluation.
-
-Exports the central registry and all operator category modules.
-"""
-
-from src.factorminer.factorminer.operators.registry import (
-    OPERATOR_REGISTRY,
-    execute_operator,
-    get_impl,
-    get_operator,
-    implemented_operators,
-    list_operators,
-)
-from src.factorminer.factorminer.operators.gpu_backend import (
-    DeviceManager,
-    batch_execute,
-    device_manager,
-    to_numpy,
-    to_tensor,
-    torch_available,
-)
-from src.factorminer.factorminer.operators.auto_inventor import (
-    OperatorInventor,
-    ProposedOperator,
-    ValidationResult,
-)
-from src.factorminer.factorminer.operators.custom import (
-    CustomOperator,
-    CustomOperatorStore,
-)
-
-__all__ = [
-    # Registry
-    "OPERATOR_REGISTRY",
-    "execute_operator",
-    "get_impl",
-    "get_operator",
-    "implemented_operators",
-    "list_operators",
-    # GPU
-    "DeviceManager",
-    "batch_execute",
-    "device_manager",
-    "to_numpy",
-    "to_tensor",
-    "torch_available",
-    # Auto-inventor
-    "OperatorInventor",
-    "ProposedOperator",
-    "ValidationResult",
-    # Custom operators
-    "CustomOperator",
-    "CustomOperatorStore",
-]
diff --git a/src/factorminer/factorminer/operators/arithmetic.py b/src/factorminer/factorminer/operators/arithmetic.py
deleted file mode 100644
index de9bc7b..0000000
--- a/src/factorminer/factorminer/operators/arithmetic.py
+++ /dev/null
@@ -1,223 +0,0 @@
-"""Element-wise arithmetic operators (unary and binary).
-
-Every function accepts arrays of shape ``(M, T)`` and returns the same shape.
-Both NumPy and PyTorch implementations are provided.
-"""
-
-from __future__ import annotations
-
-from typing import Union
-
-import numpy as np
-
-try:
-    import torch
-except ImportError:
-    torch = None  # type: ignore[assignment]
-
-Array = Union[np.ndarray, "torch.Tensor"]
-
-# ---- helpers ---------------------------------------------------------------
-
-_EPS_NP = np.float32(1e-10)
-
-
-def _eps(x: Array) -> float:
-    return 1e-10
-
-
-# ---- NumPy implementations ------------------------------------------------
-
-def add_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    return np.add(x, y)
-
-
-def sub_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    return np.subtract(x, y)
-
-
-def mul_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    return np.multiply(x, y)
-
-
-def div_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    mask = np.abs(y) > _EPS_NP
-    out[mask] = x[mask] / y[mask]
-    return out
-
-
-def neg_np(x: np.ndarray) -> np.ndarray:
-    return np.negative(x)
-
-
-def abs_np(x: np.ndarray) -> np.ndarray:
-    return np.abs(x)
-
-
-def sign_np(x: np.ndarray) -> np.ndarray:
-    return np.sign(x)
-
-
-def log_np(x: np.ndarray) -> np.ndarray:
-    """log(1 + |x|) * sign(x) -- safe log that handles negatives."""
-    return np.log1p(np.abs(x)) * np.sign(x)
-
-
-def sqrt_np(x: np.ndarray) -> np.ndarray:
-    """sqrt(|x|) * sign(x)."""
-    return np.sqrt(np.abs(x)) * np.sign(x)
-
-
-def square_np(x: np.ndarray) -> np.ndarray:
-    return np.square(x)
-
-
-def inv_np(x: np.ndarray) -> np.ndarray:
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    mask = np.abs(x) > _EPS_NP
-    out[mask] = 1.0 / x[mask]
-    return out
-
-
-def pow_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    """x^y with safe handling."""
-    with np.errstate(invalid="ignore", divide="ignore"):
-        return np.where(np.isnan(x) | np.isnan(y), np.nan, np.power(np.abs(x), y) * np.sign(x))
-
-
-def max_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    return np.fmax(x, y)
-
-
-def min_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    return np.fmin(x, y)
-
-
-def clip_np(x: np.ndarray, lower: float = -3.0, upper: float = 3.0) -> np.ndarray:
-    return np.clip(x, lower, upper)
-
-
-def exp_np(x: np.ndarray) -> np.ndarray:
-    """Clamped exp to avoid overflow."""
-    return np.exp(np.clip(x, -50.0, 50.0))
-
-
-def tanh_np(x: np.ndarray) -> np.ndarray:
-    return np.tanh(x)
-
-
-def signed_power_np(x: np.ndarray, e: float = 2.0) -> np.ndarray:
-    return np.sign(x) * np.power(np.abs(x), e)
-
-
-def power_np(x: np.ndarray, e: float = 2.0) -> np.ndarray:
-    with np.errstate(invalid="ignore"):
-        return np.power(x, e)
-
-
-# ---- PyTorch (GPU) implementations ----------------------------------------
-
-def add_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    return x + y
-
-
-def sub_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    return x - y
-
-
-def mul_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    return x * y
-
-
-def div_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    mask = y.abs() > 1e-10
-    out = torch.full_like(x, float("nan"))
-    out[mask] = x[mask] / y[mask]
-    return out
-
-
-def neg_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return -x
-
-
-def abs_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x.abs()
-
-
-def sign_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x.sign()
-
-
-def log_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return torch.log1p(x.abs()) * x.sign()
-
-
-def sqrt_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x.abs().sqrt() * x.sign()
-
-
-def square_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x * x
-
-
-def inv_torch(x: "torch.Tensor") -> "torch.Tensor":
-    mask = x.abs() > 1e-10
-    out = torch.full_like(x, float("nan"))
-    out[mask] = 1.0 / x[mask]
-    return out
-
-
-def pow_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    safe = x.abs().pow(y) * x.sign()
-    return torch.where(torch.isnan(x) | torch.isnan(y), torch.tensor(float("nan"), device=x.device), safe)
-
-
-def max_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    return torch.fmax(x, y)
-
-
-def min_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    return torch.fmin(x, y)
-
-
-def clip_torch(x: "torch.Tensor", lower: float = -3.0, upper: float = 3.0) -> "torch.Tensor":
-    return x.clamp(lower, upper)
-
-
-def exp_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return torch.exp(x.clamp(-50.0, 50.0))
-
-
-def tanh_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x.tanh()
-
-
-def signed_power_torch(x: "torch.Tensor", e: float = 2.0) -> "torch.Tensor":
-    return x.sign() * x.abs().pow(e)
-
-
-def power_torch(x: "torch.Tensor", e: float = 2.0) -> "torch.Tensor":
-    return x.pow(e)
-
-
-# ---- Registration table ----------------------------------------------------
-# Maps operator name -> (numpy_fn, torch_fn)
-
-ARITHMETIC_OPS = {
-    "Add": (add_np, add_torch),
-    "Sub": (sub_np, sub_torch),
-    "Mul": (mul_np, mul_torch),
-    "Div": (div_np, div_torch),
-    "Neg": (neg_np, neg_torch),
-    "Abs": (abs_np, abs_torch),
-    "Sign": (sign_np, sign_torch),
-    "Log": (log_np, log_torch),
-    "Sqrt": (sqrt_np, sqrt_torch),
-    "Square": (square_np, square_torch),
-    "Inv": (inv_np, inv_torch),
-    "Pow": (pow_np, pow_torch),
-    "Max": (max_np, max_torch),
-    "Min": (min_np, min_torch),
-    "Clip": (clip_np, clip_torch),
-}
diff --git a/src/factorminer/factorminer/operators/auto_inventor.py b/src/factorminer/factorminer/operators/auto_inventor.py
deleted file mode 100644
index 41f90e1..0000000
--- a/src/factorminer/factorminer/operators/auto_inventor.py
+++ /dev/null
@@ -1,547 +0,0 @@
-"""Automated operator invention via LLM-guided proposal and validation.
-
-Uses an LLM to propose novel operator definitions (as NumPy functions),
-validates them in a sandboxed environment, and checks for differentiation
-from existing operators and IC contribution.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import re
-from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-import numpy as np
-
-from src.factorminer.factorminer.agent.llm_interface import LLMProvider
-from src.factorminer.factorminer.core.types import OPERATOR_REGISTRY, OperatorSpec
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Data classes
-# ---------------------------------------------------------------------------
-
-@dataclass
-class ProposedOperator:
-    """A single operator proposal generated by the LLM.
-
-    Attributes
-    ----------
-    name : str
-        Canonical name for the operator (e.g. ``"ExpDecayDiff"``).
-    arity : int
-        Number of expression children (1 = unary, 2 = binary).
-    description : str
-        Short human-readable description.
-    numpy_code : str
-        Python source defining a function called ``compute``.
-        The function signature must accept (M, T)-shaped ndarrays and
-        return an (M, T)-shaped ndarray.
-    param_names : tuple of str
-        Names of extra numeric parameters.
-    param_defaults : dict
-        Default value for each parameter.
-    param_ranges : dict
-        Valid (inclusive) range for each parameter.
-    rationale : str
-        Why this operator might be useful for alpha factor construction.
-    based_on : list of str
-        Existing operators that inspired this proposal.
-    """
-
-    name: str
-    arity: int
-    description: str
-    numpy_code: str
-    param_names: Tuple[str, ...] = ()
-    param_defaults: Dict[str, float] = field(default_factory=dict)
-    param_ranges: Dict[str, Tuple[float, float]] = field(default_factory=dict)
-    rationale: str = ""
-    based_on: List[str] = field(default_factory=list)
-
-
-@dataclass
-class ValidationResult:
-    """Result of validating a proposed operator.
-
-    Attributes
-    ----------
-    valid : bool
-        True if the operator passed all validation checks.
-    error : str
-        Error message if validation failed; empty string on success.
-    output_shape_ok : bool
-        True if the operator output has the correct (M, T) shape.
-    nan_ratio : float
-        Fraction of NaN values in the operator output.
-    differentiates_from_existing : bool
-        True if the operator is sufficiently different from all existing operators.
-    ic_contribution : float
-        Information coefficient of a simple factor using this operator.
-    """
-
-    valid: bool
-    error: str = ""
-    output_shape_ok: bool = False
-    nan_ratio: float = 1.0
-    differentiates_from_existing: bool = False
-    ic_contribution: float = 0.0
-
-
-# ---------------------------------------------------------------------------
-# Sandbox security: allowed names in exec()
-# ---------------------------------------------------------------------------
-
-_SAFE_GLOBALS: Dict[str, Any] = {
-    "np": np,
-    "numpy": np,
-    "__builtins__": {},
-}
-
-# Explicitly blocked tokens in submitted code.  If any of these appear in the
-# raw source string, the code is rejected *before* exec().
-_BLOCKED_TOKENS: Tuple[str, ...] = (
-    "import ",
-    "__import__",
-    "os.",
-    "sys.",
-    "subprocess",
-    "open(",
-    "exec(",
-    "eval(",
-    "compile(",
-    "getattr(",
-    "setattr(",
-    "delattr(",
-    "globals(",
-    "locals(",
-    "__class__",
-    "__subclasses__",
-    "__bases__",
-    "__mro__",
-    "breakpoint(",
-    "exit(",
-    "quit(",
-)
-
-
-# ---------------------------------------------------------------------------
-# OperatorInventor
-# ---------------------------------------------------------------------------
-
-class OperatorInventor:
-    """Proposes and validates new operators using an LLM.
-
-    Parameters
-    ----------
-    llm_provider : LLMProvider
-        LLM backend used to generate proposals.
-    data_tensor : np.ndarray
-        Shape ``(M, T, F)`` -- M stocks, T time steps, F features.
-        Used as test data for validation.
-    returns : np.ndarray
-        Shape ``(M, T)`` -- forward returns for IC measurement.
-    max_proposals_per_round : int
-        Maximum number of proposals to request per LLM call.
-    """
-
-    _SYSTEM_PROMPT = (
-        "You are an expert in quantitative finance operator design. "
-        "Your task is to invent novel numerical operators that transform "
-        "stock market time-series data (shape: stocks x time) into alpha "
-        "signals. Each operator is a pure NumPy function.\n\n"
-        "RULES:\n"
-        "1. Each operator must define a function called `compute`.\n"
-        "2. The function receives ndarray inputs of shape (M, T) and must "
-        "return an ndarray of shape (M, T).\n"
-        "3. You may ONLY use numpy (imported as `np`). No other imports.\n"
-        "4. Handle NaN values gracefully (use np.nan, np.nanmean, etc.).\n"
-        "5. Avoid division by zero -- use np.where or add epsilon.\n"
-        "6. Operators should be economically meaningful for alpha factor "
-        "construction.\n"
-        "7. Do NOT use os, sys, subprocess, open, exec, eval, or any "
-        "filesystem/network access.\n"
-    )
-
-    def __init__(
-        self,
-        llm_provider: LLMProvider,
-        data_tensor: np.ndarray,
-        returns: np.ndarray,
-        max_proposals_per_round: int = 5,
-    ) -> None:
-        if data_tensor.ndim != 3:
-            raise ValueError(
-                f"data_tensor must be 3-D (M, T, F), got shape {data_tensor.shape}"
-            )
-        if returns.ndim != 2:
-            raise ValueError(
-                f"returns must be 2-D (M, T), got shape {returns.shape}"
-            )
-        self.llm = llm_provider
-        self.data_tensor = data_tensor
-        self.returns = returns
-        self.max_proposals = max_proposals_per_round
-        # Pre-compute test slices for validation
-        self._test_x = data_tensor[:, :, 0]  # first feature as default input
-        self._test_y = (
-            data_tensor[:, :, 1] if data_tensor.shape[2] > 1 else data_tensor[:, :, 0]
-        )
-
-    # ------------------------------------------------------------------
-    # Public API
-    # ------------------------------------------------------------------
-
-    def propose_operators(
-        self,
-        existing_operators: Dict[str, OperatorSpec],
-        successful_patterns: Optional[List[str]] = None,
-    ) -> List[ProposedOperator]:
-        """Ask the LLM to propose new operators.
-
-        Parameters
-        ----------
-        existing_operators : dict
-            Mapping of name -> OperatorSpec for already-registered operators.
-        successful_patterns : list of str, optional
-            Natural-language descriptions of patterns that have worked well.
-
-        Returns
-        -------
-        list of ProposedOperator
-        """
-        successful_patterns = successful_patterns or []
-        prompt = self._build_proposal_prompt(existing_operators, successful_patterns)
-        logger.info(
-            "Requesting %d operator proposals from %s",
-            self.max_proposals,
-            self.llm.provider_name,
-        )
-        raw = self.llm.generate(
-            system_prompt=self._SYSTEM_PROMPT,
-            user_prompt=prompt,
-            temperature=0.9,
-            max_tokens=8192,
-        )
-        proposals = self._parse_proposals(raw)
-        logger.info("Parsed %d proposals from LLM output", len(proposals))
-        return proposals
-
-    def validate_operator(self, proposal: ProposedOperator) -> ValidationResult:
-        """Validate a proposed operator through compilation, execution, and IC check.
-
-        Parameters
-        ----------
-        proposal : ProposedOperator
-
-        Returns
-        -------
-        ValidationResult
-        """
-        # Step 1: compile safely
-        fn = self._compile_safely(proposal.numpy_code)
-        if fn is None:
-            return ValidationResult(valid=False, error="Compilation failed or blocked code detected")
-
-        # Step 2: execute on test data, check output shape
-        try:
-            if proposal.arity == 1:
-                output = fn(self._test_x)
-            elif proposal.arity >= 2:
-                output = fn(self._test_x, self._test_y)
-            else:
-                output = fn(self._test_x)
-        except Exception as exc:
-            return ValidationResult(valid=False, error=f"Execution error: {exc}")
-
-        if not isinstance(output, np.ndarray):
-            return ValidationResult(valid=False, error="Output is not an ndarray")
-
-        M, T = self._test_x.shape
-        if output.shape != (M, T):
-            return ValidationResult(
-                valid=False,
-                error=f"Shape mismatch: expected ({M}, {T}), got {output.shape}",
-                output_shape_ok=False,
-            )
-
-        # Step 3: NaN ratio
-        nan_ratio = float(np.isnan(output).sum()) / output.size if output.size > 0 else 1.0
-        if nan_ratio > 0.5:
-            return ValidationResult(
-                valid=False,
-                error=f"NaN ratio too high: {nan_ratio:.2%}",
-                output_shape_ok=True,
-                nan_ratio=nan_ratio,
-            )
-
-        # Step 4: differentiation from existing operators
-        differentiates = self._check_differentiation(fn, proposal)
-
-        # Step 5: IC contribution
-        ic = self._measure_ic_contribution(fn, proposal)
-
-        valid = differentiates  # must differentiate; IC is informational
-        error = "" if valid else "Too correlated with existing operators (r > 0.9)"
-
-        return ValidationResult(
-            valid=valid,
-            error=error,
-            output_shape_ok=True,
-            nan_ratio=nan_ratio,
-            differentiates_from_existing=differentiates,
-            ic_contribution=ic,
-        )
-
-    # ------------------------------------------------------------------
-    # Internal: prompt building & parsing
-    # ------------------------------------------------------------------
-
-    def _build_proposal_prompt(
-        self,
-        existing_ops: Dict[str, OperatorSpec],
-        patterns: List[str],
-    ) -> str:
-        """Format the user prompt for operator proposals."""
-        lines: List[str] = []
-
-        # Existing operators summary
-        lines.append("## EXISTING OPERATORS (do NOT duplicate these)")
-        for name, spec in sorted(existing_ops.items()):
-            lines.append(f"- {name} (arity={spec.arity}): {spec.description}")
-
-        # Successful patterns
-        if patterns:
-            lines.append("\n## SUCCESSFUL PATTERNS (build on these themes)")
-            for p in patterns:
-                lines.append(f"  * {p}")
-
-        # Request
-        lines.append(f"\n## REQUEST")
-        lines.append(
-            f"Propose exactly {self.max_proposals} new operators. "
-            f"For each operator, output a JSON object with these fields:"
-        )
-        lines.append(
-            '  {"name": "OpName", "arity": 1, "description": "...", '
-            '"numpy_code": "def compute(x):\\n    ...", '
-            '"param_names": [], "param_defaults": {}, "param_ranges": {}, '
-            '"rationale": "...", "based_on": ["ExistingOp1"]}'
-        )
-        lines.append(
-            "\nOutput each JSON object on a separate line, preceded by "
-            "the line number (e.g., '1. {...}')."
-        )
-        lines.append(
-            "\nIMPORTANT: The `numpy_code` field must define a function "
-            "called `compute` that accepts ndarray(s) of shape (M, T) "
-            "and returns an ndarray of shape (M, T). Use only numpy (as np)."
-        )
-        return "\n".join(lines)
-
-    def _parse_proposals(self, raw: str) -> List[ProposedOperator]:
-        """Parse LLM output into ProposedOperator objects."""
-        proposals: List[ProposedOperator] = []
-
-        # Try to find JSON objects in the text
-        # Pattern: optional number prefix, then a JSON object
-        json_pattern = re.compile(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}')
-        matches = json_pattern.findall(raw)
-
-        for match in matches:
-            try:
-                data = json.loads(match)
-            except json.JSONDecodeError:
-                logger.debug("Failed to parse JSON: %s", match[:100])
-                continue
-
-            name = data.get("name", "")
-            if not name:
-                continue
-
-            # Normalize numpy_code: replace \\n with actual newlines
-            numpy_code = data.get("numpy_code", "")
-            if "\\n" in numpy_code:
-                numpy_code = numpy_code.replace("\\n", "\n")
-
-            proposal = ProposedOperator(
-                name=name,
-                arity=int(data.get("arity", 1)),
-                description=data.get("description", ""),
-                numpy_code=numpy_code,
-                param_names=tuple(data.get("param_names", [])),
-                param_defaults=data.get("param_defaults", {}),
-                param_ranges={
-                    k: tuple(v) for k, v in data.get("param_ranges", {}).items()
-                },
-                rationale=data.get("rationale", ""),
-                based_on=data.get("based_on", []),
-            )
-            proposals.append(proposal)
-
-            if len(proposals) >= self.max_proposals:
-                break
-
-        return proposals
-
-    # ------------------------------------------------------------------
-    # Internal: safe compilation & validation helpers
-    # ------------------------------------------------------------------
-
-    def _compile_safely(self, code: str) -> Optional[Callable]:
-        """Compile operator code in a restricted sandbox.
-
-        SECURITY: Only numpy is available. No filesystem, network,
-        subprocess, or introspection access is permitted.
-
-        Parameters
-        ----------
-        code : str
-            Python source defining a function called ``compute``.
-
-        Returns
-        -------
-        Callable or None
-            The compiled ``compute`` function, or None if compilation
-            failed or the code was rejected for security reasons.
-        """
-        # Pre-scan for blocked tokens
-        code_lower = code.lower()
-        for token in _BLOCKED_TOKENS:
-            if token.lower() in code_lower:
-                logger.warning("Blocked token '%s' found in operator code", token)
-                return None
-
-        # Restricted exec
-        safe_ns: Dict[str, Any] = dict(_SAFE_GLOBALS)
-        try:
-            exec(code, safe_ns)  # noqa: S102 -- intentional sandboxed exec
-        except Exception as exc:
-            logger.warning("Operator compilation failed: %s", exc)
-            return None
-
-        fn = safe_ns.get("compute")
-        if fn is None or not callable(fn):
-            logger.warning("No callable 'compute' found in operator code")
-            return None
-
-        return fn
-
-    def _check_differentiation(self, fn: Callable, proposal: ProposedOperator) -> bool:
-        """Check that the operator output is not too correlated with existing operators.
-
-        Computes the new operator on test data and correlates the result with
-        outputs from a sample of existing operators.  If max |correlation| > 0.9,
-        the operator is considered redundant.
-
-        Returns
-        -------
-        bool
-            True if the operator differentiates sufficiently.
-        """
-        try:
-            if proposal.arity == 1:
-                new_output = fn(self._test_x)
-            else:
-                new_output = fn(self._test_x, self._test_y)
-        except Exception:
-            return False
-
-        new_flat = new_output.flatten()
-        valid_mask = ~np.isnan(new_flat)
-        if valid_mask.sum() < 10:
-            return False
-
-        # Compare against a sample of existing operator implementations
-        from factorminer.operators.registry import OPERATOR_REGISTRY as RUNTIME_REG
-
-        sample_names = list(RUNTIME_REG.keys())[:20]  # sample for efficiency
-        for name in sample_names:
-            entry = RUNTIME_REG.get(name)
-            if entry is None:
-                continue
-            spec, np_fn, _ = entry
-            if np_fn is None:
-                continue
-            try:
-                if spec.arity == 1:
-                    existing_output = np_fn(self._test_x)
-                elif spec.arity == 2:
-                    existing_output = np_fn(self._test_x, self._test_y)
-                else:
-                    continue
-            except Exception:
-                continue
-
-            if existing_output.shape != new_output.shape:
-                continue
-
-            ex_flat = existing_output.flatten()
-            both_valid = valid_mask & ~np.isnan(ex_flat)
-            if both_valid.sum() < 10:
-                continue
-
-            corr = np.corrcoef(new_flat[both_valid], ex_flat[both_valid])[0, 1]
-            if abs(corr) > 0.9:
-                logger.info(
-                    "Proposed operator '%s' too correlated with '%s' (r=%.3f)",
-                    proposal.name,
-                    name,
-                    corr,
-                )
-                return False
-
-        return True
-
-    def _measure_ic_contribution(self, fn: Callable, proposal: ProposedOperator) -> float:
-        """Measure the Information Coefficient of a simple factor using the operator.
-
-        Constructs a basic factor as CsRank(NewOp(data)) and computes
-        the rank IC against forward returns.
-
-        Returns
-        -------
-        float
-            Mean rank IC across time steps.
-        """
-        try:
-            if proposal.arity == 1:
-                raw = fn(self._test_x)
-            else:
-                raw = fn(self._test_x, self._test_y)
-        except Exception:
-            return 0.0
-
-        if raw.shape != self.returns.shape:
-            return 0.0
-
-        # Cross-sectional rank at each time step
-        M, T = raw.shape
-        ranked = np.full_like(raw, np.nan)
-        for t in range(T):
-            col = raw[:, t]
-            valid = ~np.isnan(col)
-            if valid.sum() < 3:
-                continue
-            order = col[valid].argsort().argsort()
-            ranked[valid, t] = order / (valid.sum() - 1)  # percentile rank
-
-        # Rank IC per time step
-        ics: List[float] = []
-        for t in range(T):
-            factor_col = ranked[:, t]
-            ret_col = self.returns[:, t]
-            both_valid = ~np.isnan(factor_col) & ~np.isnan(ret_col)
-            if both_valid.sum() < 5:
-                continue
-            corr = np.corrcoef(factor_col[both_valid], ret_col[both_valid])[0, 1]
-            if not np.isnan(corr):
-                ics.append(corr)
-
-        if not ics:
-            return 0.0
-        return float(np.mean(ics))
diff --git a/src/factorminer/factorminer/operators/crosssectional.py b/src/factorminer/factorminer/operators/crosssectional.py
deleted file mode 100644
index 84ebf12..0000000
--- a/src/factorminer/factorminer/operators/crosssectional.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""Cross-sectional operators (across M assets at each time step t).
-
-Input shape: ``(M, T)`` -> output shape ``(M, T)``.
-Operations are performed along axis=0 (the asset dimension) for every column.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-try:
-    import torch
-except ImportError:
-    torch = None  # type: ignore[assignment]
-
-
-# ===========================================================================
-# NumPy implementations
-# ===========================================================================
-
-def cs_rank_np(x: np.ndarray) -> np.ndarray:
-    """Cross-sectional percentile rank -- key GPU target (26x speedup).
-
-    For each time step, rank assets from 0 to 1.  NaN inputs get NaN rank.
-    """
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(T):
-        col = x[:, t]
-        valid = ~np.isnan(col)
-        n = valid.sum()
-        if n < 2:
-            continue
-        order = col[valid].argsort().argsort().astype(np.float64)
-        out[valid, t] = order / (n - 1)
-    return out
-
-
-def cs_zscore_np(x: np.ndarray) -> np.ndarray:
-    """Cross-sectional z-score."""
-    m = np.nanmean(x, axis=0, keepdims=True)
-    s = np.nanstd(x, axis=0, keepdims=True, ddof=0)
-    with np.errstate(invalid="ignore", divide="ignore"):
-        return np.where(s > 1e-10, (x - m) / s, np.nan)
-
-
-def cs_demean_np(x: np.ndarray) -> np.ndarray:
-    """Subtract cross-sectional mean."""
-    return x - np.nanmean(x, axis=0, keepdims=True)
-
-
-def cs_scale_np(x: np.ndarray) -> np.ndarray:
-    """Scale to unit L1 norm cross-sectionally."""
-    l1 = np.nansum(np.abs(x), axis=0, keepdims=True)
-    with np.errstate(invalid="ignore", divide="ignore"):
-        return np.where(l1 > 1e-10, x / l1, np.nan)
-
-
-def cs_neutralize_np(x: np.ndarray) -> np.ndarray:
-    """Industry-neutralize (simplified: demean)."""
-    return cs_demean_np(x)
-
-
-def cs_quantile_np(x: np.ndarray, n_bins: int = 5) -> np.ndarray:
-    """Assign each asset to a quantile bin (0 .. n_bins-1) cross-sectionally."""
-    n_bins = int(n_bins)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    for t in range(T):
-        col = x[:, t]
-        valid = ~np.isnan(col)
-        n = valid.sum()
-        if n < 2:
-            continue
-        order = col[valid].argsort().argsort().astype(np.float64)
-        out[valid, t] = np.floor(order / n * n_bins).clip(0, n_bins - 1)
-    return out
-
-
-# ===========================================================================
-# PyTorch implementations
-# ===========================================================================
-
-def cs_rank_torch(x: "torch.Tensor") -> "torch.Tensor":
-    """Cross-sectional percentile rank -- fully vectorized for GPU."""
-    M, T = x.shape
-    not_nan = ~torch.isnan(x)
-    # Replace NaN with very large value so they sort last
-    filled = x.clone()
-    filled[~not_nan] = float("inf")
-    # argsort twice gives rank
-    ranks = filled.argsort(dim=0).argsort(dim=0).float()
-    # Count valid per column
-    n_valid = not_nan.sum(dim=0, keepdim=True).float()
-    result = ranks / (n_valid - 1).clamp(min=1)
-    result[~not_nan] = float("nan")
-    # Clamp ranks for entries that got inf-sorted
-    result = result.clamp(0.0, 1.0)
-    result[~not_nan] = float("nan")
-    return result
-
-
-def cs_zscore_torch(x: "torch.Tensor") -> "torch.Tensor":
-    m = x.nanmean(dim=0, keepdim=True)
-    d = x - m
-    not_nan = ~torch.isnan(x)
-    n = not_nan.sum(dim=0, keepdim=True).float()
-    s = (d.nan_to_num(0.0).pow(2).sum(dim=0, keepdim=True) / n.clamp(min=1)).sqrt()
-    result = torch.where(s > 1e-10, d / s, torch.tensor(float("nan"), device=x.device))
-    result[~not_nan] = float("nan")
-    return result
-
-
-def cs_demean_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x - x.nanmean(dim=0, keepdim=True)
-
-
-def cs_scale_torch(x: "torch.Tensor") -> "torch.Tensor":
-    l1 = x.abs().nansum(dim=0, keepdim=True)
-    return torch.where(l1 > 1e-10, x / l1, torch.tensor(float("nan"), device=x.device))
-
-
-def cs_neutralize_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return cs_demean_torch(x)
-
-
-def cs_quantile_torch(x: "torch.Tensor", n_bins: int = 5) -> "torch.Tensor":
-    n_bins = int(n_bins)
-    M, T = x.shape
-    not_nan = ~torch.isnan(x)
-    filled = x.clone()
-    filled[~not_nan] = float("inf")
-    ranks = filled.argsort(dim=0).argsort(dim=0).float()
-    n_valid = not_nan.sum(dim=0, keepdim=True).float()
-    result = (ranks / n_valid * n_bins).floor().clamp(0, n_bins - 1)
-    result[~not_nan] = float("nan")
-    return result
-
-
-# ===========================================================================
-# Registration table
-# ===========================================================================
-
-CROSSSECTIONAL_OPS = {
-    "CsRank": (cs_rank_np, cs_rank_torch),
-    "CsZScore": (cs_zscore_np, cs_zscore_torch),
-    "CsDemean": (cs_demean_np, cs_demean_torch),
-    "CsScale": (cs_scale_np, cs_scale_torch),
-    "CsNeutralize": (cs_neutralize_np, cs_neutralize_torch),
-    "CsQuantile": (cs_quantile_np, cs_quantile_torch),
-}
diff --git a/src/factorminer/factorminer/operators/custom.py b/src/factorminer/factorminer/operators/custom.py
deleted file mode 100644
index ba93b78..0000000
--- a/src/factorminer/factorminer/operators/custom.py
+++ /dev/null
@@ -1,251 +0,0 @@
-"""Custom operator storage, registration, and persistence.
-
-Manages operators invented by the auto-inventor: registers them into the
-global operator registry at runtime, and persists them to disk as JSON
-metadata plus Python source files for reload across sessions.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import os
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-import numpy as np
-
-from src.factorminer.factorminer.core.types import (
-    OPERATOR_REGISTRY as SPEC_REGISTRY,
-    OperatorSpec,
-    OperatorType,
-    SignatureType,
-)
-from src.factorminer.factorminer.operators.registry import OPERATOR_REGISTRY as RUNTIME_REGISTRY
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------------------------------------------------------
-# Safe compilation (shared with auto_inventor.py)
-# ---------------------------------------------------------------------------
-
-_SAFE_GLOBALS: Dict[str, Any] = {
-    "np": np,
-    "numpy": np,
-    "__builtins__": {},
-}
-
-
-def _compile_operator_code(code: str) -> Optional[Callable]:
-    """Compile operator code in a restricted sandbox.
-
-    Returns the ``compute`` function or None on failure.
-    """
-    safe_ns: Dict[str, Any] = dict(_SAFE_GLOBALS)
-    try:
-        exec(code, safe_ns)  # noqa: S102 -- sandboxed exec
-    except Exception as exc:
-        logger.warning("Failed to compile custom operator code: %s", exc)
-        return None
-    fn = safe_ns.get("compute")
-    if fn is None or not callable(fn):
-        return None
-    return fn
-
-
-# ---------------------------------------------------------------------------
-# CustomOperator
-# ---------------------------------------------------------------------------
-
-@dataclass
-class CustomOperator:
-    """A validated, auto-invented operator ready for registration.
-
-    Attributes
-    ----------
-    name : str
-        Canonical operator name.
-    spec : OperatorSpec
-        Immutable specification matching the type system.
-    numpy_code : str
-        Python source defining ``compute``.
-    numpy_fn : Callable
-        Compiled compute function (not persisted; recompiled on load).
-    validation_ic : float
-        Information coefficient measured during validation.
-    invention_iteration : int
-        The search iteration in which this operator was invented.
-    rationale : str
-        Why this operator was proposed.
-    """
-
-    name: str
-    spec: OperatorSpec
-    numpy_code: str
-    numpy_fn: Callable
-    validation_ic: float = 0.0
-    invention_iteration: int = 0
-    rationale: str = ""
-
-
-# ---------------------------------------------------------------------------
-# CustomOperatorStore
-# ---------------------------------------------------------------------------
-
-class CustomOperatorStore:
-    """Manages custom operator lifecycle: register, persist, and reload.
-
-    Parameters
-    ----------
-    store_dir : str
-        Directory for persisting operator metadata and source files.
-    """
-
-    def __init__(self, store_dir: str = "./output/custom_operators") -> None:
-        self._store_dir = Path(store_dir)
-        self._operators: Dict[str, CustomOperator] = {}
-
-    # ------------------------------------------------------------------
-    # Public API
-    # ------------------------------------------------------------------
-
-    def register(self, op: CustomOperator) -> None:
-        """Register a custom operator into both global registries.
-
-        Adds the operator to:
-        1. ``core.types.OPERATOR_REGISTRY`` (spec-only registry)
-        2. ``operators.registry.OPERATOR_REGISTRY`` (runtime registry with impl)
-
-        Parameters
-        ----------
-        op : CustomOperator
-        """
-        # Add to spec registry
-        SPEC_REGISTRY[op.name] = op.spec
-
-        # Add to runtime registry (spec, numpy_fn, torch_fn=None)
-        RUNTIME_REGISTRY[op.name] = (op.spec, op.numpy_fn, None)
-
-        # Track internally
-        self._operators[op.name] = op
-        logger.info(
-            "Registered custom operator '%s' (IC=%.4f, iteration=%d)",
-            op.name,
-            op.validation_ic,
-            op.invention_iteration,
-        )
-
-    def save(self) -> None:
-        """Persist all custom operators to disk.
-
-        Creates ``store_dir/`` with:
-        - ``index.json``: metadata for all operators
-        - ``<name>.py``: Python source for each operator
-        """
-        self._store_dir.mkdir(parents=True, exist_ok=True)
-
-        index: List[Dict[str, Any]] = []
-        for name, op in self._operators.items():
-            # Save Python source
-            src_path = self._store_dir / f"{name}.py"
-            src_path.write_text(op.numpy_code, encoding="utf-8")
-
-            # Build metadata entry
-            entry = {
-                "name": op.name,
-                "arity": op.spec.arity,
-                "category": op.spec.category.name,
-                "signature": op.spec.signature.name,
-                "param_names": list(op.spec.param_names),
-                "param_defaults": op.spec.param_defaults,
-                "param_ranges": {
-                    k: list(v) for k, v in op.spec.param_ranges.items()
-                },
-                "description": op.spec.description,
-                "validation_ic": op.validation_ic,
-                "invention_iteration": op.invention_iteration,
-                "rationale": op.rationale,
-            }
-            index.append(entry)
-
-        index_path = self._store_dir / "index.json"
-        index_path.write_text(
-            json.dumps(index, indent=2, ensure_ascii=False),
-            encoding="utf-8",
-        )
-        logger.info(
-            "Saved %d custom operators to %s", len(index), self._store_dir
-        )
-
-    def load(self) -> None:
-        """Load custom operators from disk, recompile, and re-register.
-
-        Reads ``store_dir/index.json`` and corresponding ``.py`` source files.
-        Operators that fail recompilation are skipped with a warning.
-        """
-        index_path = self._store_dir / "index.json"
-        if not index_path.exists():
-            logger.debug("No custom operator index at %s", index_path)
-            return
-
-        with open(index_path, "r", encoding="utf-8") as f:
-            index: List[Dict[str, Any]] = json.load(f)
-
-        loaded = 0
-        for entry in index:
-            name = entry["name"]
-            src_path = self._store_dir / f"{name}.py"
-            if not src_path.exists():
-                logger.warning("Source file missing for custom operator '%s'", name)
-                continue
-
-            numpy_code = src_path.read_text(encoding="utf-8")
-            fn = _compile_operator_code(numpy_code)
-            if fn is None:
-                logger.warning(
-                    "Failed to recompile custom operator '%s'; skipping", name
-                )
-                continue
-
-            spec = OperatorSpec(
-                name=name,
-                arity=entry["arity"],
-                category=OperatorType[entry["category"]],
-                signature=SignatureType[entry["signature"]],
-                param_names=tuple(entry.get("param_names", [])),
-                param_defaults=entry.get("param_defaults", {}),
-                param_ranges={
-                    k: tuple(v)
-                    for k, v in entry.get("param_ranges", {}).items()
-                },
-                description=entry.get("description", ""),
-            )
-
-            op = CustomOperator(
-                name=name,
-                spec=spec,
-                numpy_code=numpy_code,
-                numpy_fn=fn,
-                validation_ic=entry.get("validation_ic", 0.0),
-                invention_iteration=entry.get("invention_iteration", 0),
-                rationale=entry.get("rationale", ""),
-            )
-            self.register(op)
-            loaded += 1
-
-        logger.info("Loaded %d / %d custom operators from %s", loaded, len(index), self._store_dir)
-
-    def list_operators(self) -> List[str]:
-        """Return names of all registered custom operators."""
-        return sorted(self._operators.keys())
-
-    def get_operator(self, name: str) -> Optional[CustomOperator]:
-        """Look up a custom operator by name.
-
-        Returns
-        -------
-        CustomOperator or None
-        """
-        return self._operators.get(name)
diff --git a/src/factorminer/factorminer/operators/gpu_backend.py b/src/factorminer/factorminer/operators/gpu_backend.py
deleted file mode 100644
index d28e3cb..0000000
--- a/src/factorminer/factorminer/operators/gpu_backend.py
+++ /dev/null
@@ -1,110 +0,0 @@
-"""GPU acceleration utilities for FactorMiner operators.
-
-Provides device management, tensor conversion helpers, and batch execution
-for parallel factor evaluation on CUDA GPUs with automatic CPU fallback.
-"""
-
-from __future__ import annotations
-
-from typing import Optional, Union
-
-import numpy as np
-
-try:
-    import torch
-
-    _TORCH_AVAILABLE = True
-except ImportError:
-    torch = None  # type: ignore[assignment]
-    _TORCH_AVAILABLE = False
-
-
-# ---------------------------------------------------------------------------
-# Device management
-# ---------------------------------------------------------------------------
-
-class DeviceManager:
-    """Singleton-style helper that picks the best available device."""
-
-    def __init__(self) -> None:
-        self._device: Optional["torch.device"] = None
-
-    @property
-    def device(self) -> "torch.device":
-        if self._device is None:
-            self._device = self._select_device()
-        return self._device
-
-    @device.setter
-    def device(self, dev: Union[str, "torch.device"]) -> None:
-        if not _TORCH_AVAILABLE:
-            raise RuntimeError("PyTorch is not installed")
-        self._device = torch.device(dev)
-
-    @staticmethod
-    def _select_device() -> "torch.device":
-        if not _TORCH_AVAILABLE:
-            raise RuntimeError("PyTorch is not installed")
-        if torch.cuda.is_available():
-            return torch.device("cuda")
-        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            return torch.device("mps")
-        return torch.device("cpu")
-
-    @property
-    def is_gpu(self) -> bool:
-        return self.device.type in ("cuda", "mps")
-
-    def reset(self) -> None:
-        self._device = None
-
-
-device_manager = DeviceManager()
-
-
-# ---------------------------------------------------------------------------
-# Conversion helpers
-# ---------------------------------------------------------------------------
-
-def to_tensor(
-    arr: np.ndarray,
-    device: Optional["torch.device"] = None,
-    dtype: Optional["torch.dtype"] = None,
-) -> "torch.Tensor":
-    """Convert a NumPy array to a PyTorch tensor on the target device."""
-    if not _TORCH_AVAILABLE:
-        raise RuntimeError("PyTorch is not installed")
-    dev = device or device_manager.device
-    dt = dtype or torch.float32
-    return torch.as_tensor(np.ascontiguousarray(arr), dtype=dt, device=torch.device("cpu")).to(dev)
-
-
-def to_numpy(tensor: "torch.Tensor") -> np.ndarray:
-    """Convert a PyTorch tensor back to a NumPy array."""
-    return tensor.detach().cpu().numpy()
-
-
-# ---------------------------------------------------------------------------
-# Batch execution helper
-# ---------------------------------------------------------------------------
-
-def batch_execute(
-    fn,
-    inputs: list,
-    params_list: list[dict],
-    backend: str = "numpy",
-) -> list:
-    """Execute a function over multiple parameter sets.
-
-    Useful for evaluating many factors in parallel on the GPU by batching
-    the inputs into a single large tensor operation.
-    """
-    results = []
-    for params in params_list:
-        results.append(fn(*inputs, **params))
-    return results
-
-
-def torch_available() -> bool:
-    """Return True if PyTorch is importable."""
-    return _TORCH_AVAILABLE
diff --git a/src/factorminer/factorminer/operators/logical.py b/src/factorminer/factorminer/operators/logical.py
deleted file mode 100644
index c6d16f1..0000000
--- a/src/factorminer/factorminer/operators/logical.py
+++ /dev/null
@@ -1,185 +0,0 @@
-"""Conditional and comparison operators (element-wise).
-
-All operators are element-wise on ``(M, T)`` arrays.
-Boolean-like outputs use ``1.0`` / ``0.0`` (float), not Python bool.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-try:
-    import torch
-except ImportError:
-    torch = None  # type: ignore[assignment]
-
-
-# ===========================================================================
-# NumPy implementations
-# ===========================================================================
-
-def if_else_np(cond: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    """Where cond > 0 return x, else y.  NaN in cond -> NaN."""
-    result = np.where(cond > 0, x, y)
-    result[np.isnan(cond)] = np.nan
-    return result
-
-
-def greater_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.where(x > y, 1.0, 0.0)
-    out[np.isnan(x) | np.isnan(y)] = np.nan
-    return out
-
-
-def less_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.where(x < y, 1.0, 0.0)
-    out[np.isnan(x) | np.isnan(y)] = np.nan
-    return out
-
-
-def greater_equal_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.where(x >= y, 1.0, 0.0)
-    out[np.isnan(x) | np.isnan(y)] = np.nan
-    return out
-
-
-def less_equal_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.where(x <= y, 1.0, 0.0)
-    out[np.isnan(x) | np.isnan(y)] = np.nan
-    return out
-
-
-def equal_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.where(np.abs(x - y) < 1e-10, 1.0, 0.0)
-    out[np.isnan(x) | np.isnan(y)] = np.nan
-    return out
-
-
-def and_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.where((x > 0) & (y > 0), 1.0, 0.0)
-    out[np.isnan(x) | np.isnan(y)] = np.nan
-    return out
-
-
-def or_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.where((x > 0) | (y > 0), 1.0, 0.0)
-    out[np.isnan(x) | np.isnan(y)] = np.nan
-    return out
-
-
-def not_np(x: np.ndarray) -> np.ndarray:
-    out = np.where(x > 0, 0.0, 1.0)
-    out[np.isnan(x)] = np.nan
-    return out
-
-
-def sign_np(x: np.ndarray) -> np.ndarray:
-    return np.sign(x)
-
-
-def max2_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    return np.fmax(x, y)
-
-
-def min2_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    return np.fmin(x, y)
-
-
-def ne_np(x: np.ndarray, y: np.ndarray) -> np.ndarray:
-    out = np.where(np.abs(x - y) >= 1e-10, 1.0, 0.0)
-    out[np.isnan(x) | np.isnan(y)] = np.nan
-    return out
-
-
-# ===========================================================================
-# PyTorch implementations
-# ===========================================================================
-
-def if_else_torch(cond: "torch.Tensor", x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    result = torch.where(cond > 0, x, y)
-    result[torch.isnan(cond)] = float("nan")
-    return result
-
-
-def greater_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where(x > y, 1.0, 0.0)
-    out[torch.isnan(x) | torch.isnan(y)] = float("nan")
-    return out
-
-
-def less_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where(x < y, 1.0, 0.0)
-    out[torch.isnan(x) | torch.isnan(y)] = float("nan")
-    return out
-
-
-def greater_equal_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where(x >= y, 1.0, 0.0)
-    out[torch.isnan(x) | torch.isnan(y)] = float("nan")
-    return out
-
-
-def less_equal_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where(x <= y, 1.0, 0.0)
-    out[torch.isnan(x) | torch.isnan(y)] = float("nan")
-    return out
-
-
-def equal_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where((x - y).abs() < 1e-10, 1.0, 0.0)
-    out[torch.isnan(x) | torch.isnan(y)] = float("nan")
-    return out
-
-
-def and_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where((x > 0) & (y > 0), 1.0, 0.0)
-    out[torch.isnan(x) | torch.isnan(y)] = float("nan")
-    return out
-
-
-def or_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where((x > 0) | (y > 0), 1.0, 0.0)
-    out[torch.isnan(x) | torch.isnan(y)] = float("nan")
-    return out
-
-
-def not_torch(x: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where(x > 0, 0.0, 1.0)
-    out[torch.isnan(x)] = float("nan")
-    return out
-
-
-def sign_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x.sign()
-
-
-def max2_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    return torch.fmax(x, y)
-
-
-def min2_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    return torch.fmin(x, y)
-
-
-def ne_torch(x: "torch.Tensor", y: "torch.Tensor") -> "torch.Tensor":
-    out = torch.where((x - y).abs() >= 1e-10, 1.0, 0.0)
-    out[torch.isnan(x) | torch.isnan(y)] = float("nan")
-    return out
-
-
-# ===========================================================================
-# Registration table
-# ===========================================================================
-
-LOGICAL_OPS = {
-    "IfElse": (if_else_np, if_else_torch),
-    "Greater": (greater_np, greater_torch),
-    "GreaterEqual": (greater_equal_np, greater_equal_torch),
-    "Less": (less_np, less_torch),
-    "LessEqual": (less_equal_np, less_equal_torch),
-    "Equal": (equal_np, equal_torch),
-    "Ne": (ne_np, ne_torch),
-    "And": (and_np, and_torch),
-    "Or": (or_np, or_torch),
-    "Not": (not_np, not_torch),
-}
diff --git a/src/factorminer/factorminer/operators/neuro_symbolic.py b/src/factorminer/factorminer/operators/neuro_symbolic.py
deleted file mode 100644
index 95f58c5..0000000
--- a/src/factorminer/factorminer/operators/neuro_symbolic.py
+++ /dev/null
@@ -1,1614 +0,0 @@
-"""Hybrid neural-symbolic operators for HelixFactor.
-
-WHY THIS MODULE EXISTS
-----------------------
-Symbolic expression trees give us interpretability and generalizability, but
-they are limited by the vocabulary of hand-coded operators.  Neural leaves
-bridge that gap: a tiny MLP trained on historical market data can discover
-non-linear interaction patterns (e.g. volume-price divergence under high
-intraday volatility) that no single hand-written formula captures.
-
-The workflow is:
-  1. Train a NeuralLeaf on historical data to maximise IC with next-period
-     returns.  The leaf sees a rolling window of all available features.
-  2. Insert the trained leaf into an expression tree as a NeuralLeafNode.
-     It behaves like any other operator: (M, T) in -> (M, T) out.
-  3. After validation, run distill_to_symbolic() to find the symbolic
-     formula from the existing operator library that best approximates
-     the neural leaf.  This restores interpretability while keeping the
-     discovered signal.
-  4. Replace NeuralLeafNode with the distilled formula for production.
-
-Architecture constraints
-------------------------
-- Each NeuralLeaf has < 5 000 parameters (fits on CPU, fast inference).
-- 2-layer MLP: input -> 32 hidden -> 1, with LayerNorm and GELU.
-- Input: flattened rolling window of F features over the last W time steps.
-- Output: scalar signal per (asset, time) pair, shape (M, T).
-- Training uses a differentiable Pearson-IC proxy loss.
-"""
-
-from __future__ import annotations
-
-import logging
-import os
-import warnings
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-
-logger = logging.getLogger(__name__)
-
-# ---------------------------------------------------------------------------
-# Optional PyTorch import — graceful degradation
-# ---------------------------------------------------------------------------
-
-try:
-    import torch
-    import torch.nn as nn
-    import torch.optim as optim
-
-    _TORCH_AVAILABLE = True
-except ImportError:  # pragma: no cover
-    torch = None  # type: ignore[assignment]
-    nn = None  # type: ignore[assignment]
-    optim = None  # type: ignore[assignment]
-    _TORCH_AVAILABLE = False
-    warnings.warn(
-        "PyTorch is not installed.  NeuralLeaf training and inference will be "
-        "unavailable.  Install torch to enable neuro-symbolic operators.",
-        ImportWarning,
-        stacklevel=1,
-    )
-
-
-# ---------------------------------------------------------------------------
-# Constants
-# ---------------------------------------------------------------------------
-
-# Canonical feature order — must match factorminer.core.types.FEATURES
-_DEFAULT_FEATURES: List[str] = [
-    "$open",
-    "$high",
-    "$low",
-    "$close",
-    "$volume",
-    "$amt",
-    "$vwap",
-    "$returns",
-]
-
-_HIDDEN_DIM: int = 32  # keeps param count ~2 000 for window=10, F=8
-
-
-# ===========================================================================
-# NeuralLeaf — the learnable micro-model
-# ===========================================================================
-
-if _TORCH_AVAILABLE:
-
-    class NeuralLeaf(nn.Module):
-        """Tiny MLP operating on a rolling window of market features.
-
-        Parameters
-        ----------
-        window_size : int
-            Number of look-back time steps fed to the model.
-        n_features : int
-            Number of input feature channels (e.g. 8 for the standard OHLCV set).
-        hidden_dim : int
-            Width of the single hidden layer (default: 32).
-        name : str
-            Human-readable identifier used in DSL strings and logging.
-
-        Input / Output shapes
-        ---------------------
-        ``forward`` expects a tensor of shape ``(M * T_valid, window_size * n_features)``
-        where rows where the window is fully available have been pre-selected.
-        It returns a tensor of shape ``(M * T_valid,)``.
-
-        The public ``evaluate()`` method handles the full (M, T) -> (M, T) pipeline
-        including NaN masking and output assembly.
-
-        Parameter count
-        ---------------
-        With defaults (window=10, F=8, hidden=32):
-            input_dim  = 10 * 8 = 80
-            layer 1    = 80 * 32 + 32 = 2 592
-            layer 2    = 32 * 1  + 1  = 33
-            LayerNorm  = 2 * 80 + 2 * 32 = 224
-            total      ≈ 2 849  (well under 5 000)
-        """
-
-        def __init__(
-            self,
-            window_size: int = 10,
-            n_features: int = 8,
-            hidden_dim: int = _HIDDEN_DIM,
-            name: str = "NeuralLeaf",
-        ) -> None:
-            super().__init__()
-            self.window_size = window_size
-            self.n_features = n_features
-            self.hidden_dim = hidden_dim
-            self.name = name
-
-            input_dim = window_size * n_features
-
-            self.net = nn.Sequential(
-                nn.LayerNorm(input_dim),
-                nn.Linear(input_dim, hidden_dim),
-                nn.GELU(),
-                nn.LayerNorm(hidden_dim),
-                nn.Linear(hidden_dim, 1),
-            )
-            # Xavier init for stability
-            for module in self.net.modules():
-                if isinstance(module, nn.Linear):
-                    nn.init.xavier_uniform_(module.weight)
-                    nn.init.zeros_(module.bias)
-
-        # ------------------------------------------------------------------
-        # Core PyTorch forward
-        # ------------------------------------------------------------------
-
-        def forward(self, x: "torch.Tensor") -> "torch.Tensor":
-            """Map ``(N, window_size * n_features)`` -> ``(N,)``.
-
-            Parameters
-            ----------
-            x : torch.Tensor, shape (N, window_size * n_features)
-
-            Returns
-            -------
-            torch.Tensor, shape (N,)
-            """
-            return self.net(x).squeeze(-1)
-
-        # ------------------------------------------------------------------
-        # High-level evaluation: (M, T, F) -> (M, T) with NaN handling
-        # ------------------------------------------------------------------
-
-        def evaluate(
-            self,
-            features_3d: np.ndarray,
-            device: Optional["torch.device"] = None,
-        ) -> np.ndarray:
-            """Evaluate the leaf on a full (M, T, F) market tensor.
-
-            For each (asset, time) pair where a full window is available,
-            the flattened window is fed through the MLP.  Positions where
-            the window is not yet complete (the first ``window_size - 1``
-            time steps) are filled with NaN.
-
-            Parameters
-            ----------
-            features_3d : np.ndarray, shape (M, T, F)
-                Stack of feature arrays, F channels, in the order given at
-                construction time.
-            device : torch.device, optional
-                Where to place tensors.  Defaults to CPU.
-
-            Returns
-            -------
-            np.ndarray, shape (M, T)
-            """
-            if not _TORCH_AVAILABLE:
-                return np.full(features_3d.shape[:2], np.nan)
-
-            device = device or torch.device("cpu")
-            M, T, F = features_3d.shape
-            W = self.window_size
-
-            out = np.full((M, T), np.nan, dtype=np.float64)
-
-            if T < W:
-                return out
-
-            # Build input matrix: (M, T - W + 1, W * F)
-            # stride-trick to avoid copies
-            X_windows = _build_windows_np(features_3d, W)  # (M, T-W+1, W*F)
-
-            # Reshape to (M * (T-W+1), W*F)
-            n_windows = T - W + 1
-            X_flat = X_windows.reshape(M * n_windows, W * F).astype(np.float32)
-
-            # Mask out rows that contain any NaN
-            nan_mask = np.isnan(X_flat).any(axis=1)  # (M * n_windows,)
-
-            X_valid = X_flat[~nan_mask]
-            if X_valid.shape[0] == 0:
-                return out
-
-            self.eval()
-            with torch.no_grad():
-                x_tensor = torch.from_numpy(X_valid).to(device)
-                preds = self.forward(x_tensor).cpu().numpy().astype(np.float64)
-
-            # Scatter predictions back
-            result_flat = np.full(M * n_windows, np.nan, dtype=np.float64)
-            result_flat[~nan_mask] = preds
-            result_2d = result_flat.reshape(M, n_windows)
-
-            # Place into the last T - W + 1 columns of the output
-            out[:, W - 1 :] = result_2d
-
-            return out
-
-        # ------------------------------------------------------------------
-        # Utilities
-        # ------------------------------------------------------------------
-
-        def param_count(self) -> int:
-            """Return the total number of trainable parameters."""
-            return sum(p.numel() for p in self.parameters() if p.requires_grad)
-
-        def __repr__(self) -> str:
-            return (
-                f"NeuralLeaf(name={self.name!r}, window={self.window_size}, "
-                f"features={self.n_features}, params={self.param_count()})"
-            )
-
-else:
-    # Stub when torch is unavailable so type annotations still resolve.
-    class NeuralLeaf:  # type: ignore[no-redef]
-        """Stub NeuralLeaf (PyTorch unavailable)."""
-
-        def __init__(self, *args: Any, **kwargs: Any) -> None:
-            self.window_size = kwargs.get("window_size", 10)
-            self.n_features = kwargs.get("n_features", 8)
-            self.name = kwargs.get("name", "NeuralLeaf")
-
-        def evaluate(self, features_3d: np.ndarray, **kwargs: Any) -> np.ndarray:
-            return np.full(features_3d.shape[:2], np.nan)
-
-        def param_count(self) -> int:
-            return 0
-
-
-# ===========================================================================
-# Window construction helper (NumPy, no copy when F is contiguous)
-# ===========================================================================
-
-def _build_windows_np(x: np.ndarray, window: int) -> np.ndarray:
-    """Create sliding windows from a (M, T, F) array.
-
-    Returns
-    -------
-    np.ndarray, shape (M, T - window + 1, window * F)
-        Each row is the flattened window of shape (window, F).
-    """
-    M, T, F = x.shape
-    n = T - window + 1
-    # Use stride tricks for zero-copy view
-    s_m, s_t, s_f = x.strides
-    shape = (M, n, window, F)
-    strides = (s_m, s_t, s_t, s_f)
-    windows = np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)
-    return windows.reshape(M, n, window * F)
-
-
-# ===========================================================================
-# IC loss helpers (differentiable Pearson proxy)
-# ===========================================================================
-
-def _pearson_ic_loss(
-    pred: "torch.Tensor",
-    target: "torch.Tensor",
-    eps: float = 1e-8,
-) -> "torch.Tensor":
-    """Negative Pearson cross-sectional IC averaged over time steps.
-
-    Both tensors must be shape ``(M,)`` (one time slice) or ``(N,)``
-    (flattened batch).  The loss is ``-IC`` so gradient descent maximises IC.
-
-    Parameters
-    ----------
-    pred : torch.Tensor, shape (N,)
-    target : torch.Tensor, shape (N,)
-    eps : float
-        Denominator stabiliser.
-
-    Returns
-    -------
-    torch.Tensor scalar
-    """
-    pred_m = pred - pred.mean()
-    tgt_m = target - target.mean()
-    cov = (pred_m * tgt_m).mean()
-    denom = pred_m.std(unbiased=False).clamp(min=eps) * tgt_m.std(unbiased=False).clamp(min=eps)
-    ic = cov / denom
-    return -ic
-
-
-def _l2_regularisation(model: "NeuralLeaf", lam: float = 1e-4) -> "torch.Tensor":
-    """Compute L2 weight penalty (excludes bias and LayerNorm params)."""
-    reg = torch.tensor(0.0)
-    for name, param in model.named_parameters():
-        if "weight" in name and "norm" not in name:
-            reg = reg + param.pow(2).sum()
-    return lam * reg
-
-
-# ===========================================================================
-# Training procedure
-# ===========================================================================
-
-def train_neural_leaf(
-    name: str,
-    features: np.ndarray,
-    returns: np.ndarray,
-    window_size: int = 10,
-    n_epochs: int = 100,
-    lr: float = 1e-3,
-    hidden_dim: int = _HIDDEN_DIM,
-    val_fraction: float = 0.2,
-    l2_lambda: float = 1e-4,
-    batch_size: int = 2048,
-    patience: int = 15,
-    device: Optional["torch.device"] = None,
-    verbose: bool = False,
-) -> Optional["NeuralLeaf"]:
-    """Train a NeuralLeaf to maximise cross-sectional IC with next-period returns.
-
-    The leaf receives a rolling window of F features per (asset, time) pair
-    and learns to output a signal that is cross-sectionally correlated with
-    next-period returns.  Training uses time-based train/validation splits
-    (no look-ahead: validation set = later time steps).
-
-    Parameters
-    ----------
-    name : str
-        Human-readable name for the leaf (e.g. ``"NeuralMomentum"``).
-    features : np.ndarray, shape (M, T, F)
-        Market feature tensor.  F must match the ``_DEFAULT_FEATURES`` list
-        or be explicitly sized for the model.
-    returns : np.ndarray, shape (M, T)
-        Forward returns aligned to the same (M, T) grid.
-    window_size : int
-        Number of look-back bars for the rolling window.
-    n_epochs : int
-        Maximum training epochs.
-    lr : float
-        Adam learning rate.
-    hidden_dim : int
-        Width of the hidden layer.
-    val_fraction : float
-        Fraction of time steps reserved for validation (tail of the series).
-    l2_lambda : float
-        L2 regularisation coefficient.
-    batch_size : int
-        Mini-batch size over the flattened (asset, time) dimension.
-    patience : int
-        Early stopping patience (epochs without val IC improvement).
-    device : torch.device, optional
-        Computation device.  Defaults to CPU.
-    verbose : bool
-        Whether to log training progress at DEBUG level.
-
-    Returns
-    -------
-    NeuralLeaf or None
-        The trained leaf, or None if torch is unavailable or training fails.
-    """
-    if not _TORCH_AVAILABLE:
-        logger.warning("train_neural_leaf: PyTorch unavailable, returning None.")
-        return None
-
-    device = device or torch.device("cpu")
-    M, T, F = features.shape
-
-    if T <= window_size:
-        logger.warning(
-            "train_neural_leaf(%s): T=%d <= window=%d, cannot train.", name, T, window_size
-        )
-        return None
-
-    # ------------------------------------------------------------------
-    # Build full flat dataset: (M * n_windows, window * F) and target (M * n_windows,)
-    # ------------------------------------------------------------------
-    n_windows = T - window_size + 1
-    X_all = _build_windows_np(features, window_size)  # (M, n_windows, W*F)
-    X_flat = X_all.reshape(M * n_windows, window_size * F).astype(np.float32)
-
-    # Target: forward return at the LAST time step of each window (t = W-1 + k)
-    # features[:, k : k+W, :] -> return at time k + W - 1
-    # We align the return index to the last step in the window.
-    ret_aligned = returns[:, window_size - 1 :]  # (M, n_windows)
-    y_flat = ret_aligned.reshape(M * n_windows).astype(np.float32)
-
-    # ------------------------------------------------------------------
-    # Remove NaN rows (both in X and y)
-    # ------------------------------------------------------------------
-    valid_mask = (~np.isnan(X_flat).any(axis=1)) & (~np.isnan(y_flat))
-    X_flat = X_flat[valid_mask]
-    y_flat = y_flat[valid_mask]
-
-    N = X_flat.shape[0]
-    if N < 100:
-        logger.warning(
-            "train_neural_leaf(%s): only %d valid samples after NaN removal.", name, N
-        )
-        return None
-
-    # ------------------------------------------------------------------
-    # Temporal train / val split: preserve time ordering.
-    # The valid_mask does not preserve temporal ordering in general, so
-    # we use a simple head/tail split on the original time dimension.
-    # ------------------------------------------------------------------
-    # We rebuild from scratch with explicit temporal indexing to ensure
-    # the val set is always strictly later in time.
-
-    T_val_start = int(T * (1.0 - val_fraction))
-    T_val_start = max(T_val_start, window_size)  # ensure at least one val window
-
-    # Train windows: windows whose last time index < T_val_start
-    # Last time index of window k = window_size - 1 + k  (0-indexed over n_windows)
-    # => k < T_val_start - window_size + 1
-    k_split = T_val_start - window_size + 1  # exclusive upper bound for train
-    k_split = max(1, min(k_split, n_windows - 1))
-
-    X_train_raw = _build_windows_np(features[:, :T_val_start, :], window_size)
-    X_train_raw = X_train_raw.reshape(-1, window_size * F).astype(np.float32)
-    y_train_raw = returns[:, window_size - 1 : T_val_start].reshape(-1).astype(np.float32)
-
-    X_val_raw = _build_windows_np(features[:, T_val_start - window_size + 1 :, :], window_size)
-    X_val_raw = X_val_raw.reshape(-1, window_size * F).astype(np.float32)
-    y_val_raw = returns[:, T_val_start:].reshape(-1).astype(np.float32)
-
-    def _clean(X: np.ndarray, y: np.ndarray):
-        mask = (~np.isnan(X).any(axis=1)) & (~np.isnan(y))
-        return X[mask], y[mask]
-
-    X_train, y_train = _clean(X_train_raw, y_train_raw)
-    X_val, y_val = _clean(X_val_raw, y_val_raw)
-
-    if X_train.shape[0] < 50:
-        logger.warning(
-            "train_neural_leaf(%s): too few training samples (%d).", name, X_train.shape[0]
-        )
-        return None
-
-    # ------------------------------------------------------------------
-    # Model, optimiser, scheduler
-    # ------------------------------------------------------------------
-    leaf = NeuralLeaf(
-        window_size=window_size,
-        n_features=F,
-        hidden_dim=hidden_dim,
-        name=name,
-    ).to(device)
-
-    optimizer = optim.Adam(leaf.parameters(), lr=lr)
-    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs, eta_min=lr * 0.01)
-
-    X_train_t = torch.from_numpy(X_train).to(device)
-    y_train_t = torch.from_numpy(y_train).to(device)
-    X_val_t = torch.from_numpy(X_val).to(device)
-    y_val_t = torch.from_numpy(y_val).to(device)
-
-    N_train = X_train_t.shape[0]
-
-    best_val_ic: float = -np.inf
-    best_state: Optional[Dict[str, Any]] = None
-    no_improve: int = 0
-
-    # ------------------------------------------------------------------
-    # Training loop
-    # ------------------------------------------------------------------
-    for epoch in range(n_epochs):
-        leaf.train()
-        # Shuffle each epoch
-        perm = torch.randperm(N_train, device=device)
-        X_shuf = X_train_t[perm]
-        y_shuf = y_train_t[perm]
-
-        epoch_loss = 0.0
-        n_batches = 0
-        for start in range(0, N_train, batch_size):
-            xb = X_shuf[start : start + batch_size]
-            yb = y_shuf[start : start + batch_size]
-            if xb.shape[0] < 4:
-                continue  # skip tiny last batch
-
-            optimizer.zero_grad()
-            pred = leaf(xb)
-            ic_loss = _pearson_ic_loss(pred, yb)
-            reg = _l2_regularisation(leaf, l2_lambda)
-            loss = ic_loss + reg
-            loss.backward()
-            torch.nn.utils.clip_grad_norm_(leaf.parameters(), max_norm=1.0)
-            optimizer.step()
-            epoch_loss += loss.item()
-            n_batches += 1
-
-        scheduler.step()
-
-        # ------------------------------------------------------------------
-        # Validation IC (no gradient)
-        # ------------------------------------------------------------------
-        leaf.eval()
-        with torch.no_grad():
-            val_pred = leaf(X_val_t)
-            val_ic = -_pearson_ic_loss(val_pred, y_val_t).item()  # positive = good
-
-        if val_ic > best_val_ic + 1e-5:
-            best_val_ic = val_ic
-            best_state = {k: v.clone() for k, v in leaf.state_dict().items()}
-            no_improve = 0
-        else:
-            no_improve += 1
-
-        if verbose:
-            avg_loss = epoch_loss / max(n_batches, 1)
-            logger.debug(
-                "Epoch %d/%d  train_loss=%.5f  val_IC=%.4f  best_val_IC=%.4f",
-                epoch + 1,
-                n_epochs,
-                avg_loss,
-                val_ic,
-                best_val_ic,
-            )
-
-        if no_improve >= patience:
-            logger.info(
-                "train_neural_leaf(%s): early stopping at epoch %d (val_IC=%.4f).",
-                name,
-                epoch + 1,
-                best_val_ic,
-            )
-            break
-
-    # Restore best weights
-    if best_state is not None:
-        leaf.load_state_dict(best_state)
-
-    logger.info(
-        "Trained NeuralLeaf '%s': params=%d, best_val_IC=%.4f",
-        name,
-        leaf.param_count(),
-        best_val_ic,
-    )
-    leaf.eval()
-    return leaf
-
-
-# ===========================================================================
-# Symbolic Distillation
-# ===========================================================================
-
-@dataclass
-class DistillationResult:
-    """Result of distilling a neural leaf to a symbolic approximation.
-
-    Attributes
-    ----------
-    formula : str
-        The best-matching symbolic formula string (DSL notation).
-    correlation : float
-        Pearson correlation between the neural leaf output and the
-        best symbolic approximation (over all valid positions).
-    rank_correlation : float
-        Spearman rank correlation (more relevant for factor quality).
-    candidate_scores : dict
-        Full mapping of formula -> correlation for all candidates tried.
-    """
-
-    formula: str
-    correlation: float
-    rank_correlation: float
-    candidate_scores: Dict[str, float] = field(default_factory=dict)
-
-    def __str__(self) -> str:
-        return (
-            f"DistillationResult(formula={self.formula!r}, "
-            f"r={self.correlation:.4f}, rho={self.rank_correlation:.4f})"
-        )
-
-
-def _spearman_corr(a: np.ndarray, b: np.ndarray) -> float:
-    """Spearman rank correlation between two flat arrays, ignoring NaN."""
-    from scipy.stats import spearmanr as _spearman  # local import to keep scipy optional
-
-    mask = ~(np.isnan(a) | np.isnan(b))
-    if mask.sum() < 10:
-        return 0.0
-    rho, _ = _spearman(a[mask], b[mask])
-    return float(rho)
-
-
-def _pearson_corr(a: np.ndarray, b: np.ndarray) -> float:
-    """Pearson correlation between two flat arrays, ignoring NaN."""
-    mask = ~(np.isnan(a) | np.isnan(b))
-    if mask.sum() < 10:
-        return 0.0
-    am, bm = a[mask], b[mask]
-    num = np.mean((am - am.mean()) * (bm - bm.mean()))
-    denom = am.std() * bm.std()
-    if denom < 1e-10:
-        return 0.0
-    return float(num / denom)
-
-
-def _evaluate_symbolic_candidate(formula_fn, data: Dict[str, np.ndarray]) -> Optional[np.ndarray]:
-    """Safely evaluate a symbolic candidate, returning None on failure."""
-    try:
-        result = formula_fn(data)
-        if not isinstance(result, np.ndarray):
-            return None
-        if result.shape != next(iter(data.values())).shape:
-            return None
-        return result
-    except Exception as exc:  # noqa: BLE001
-        logger.debug("Symbolic candidate failed: %s", exc)
-        return None
-
-
-def _build_symbolic_candidates(data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
-    """Generate all symbolic candidate outputs from the hand-coded operator library.
-
-    Returns a dict mapping formula string -> (M, T) array.
-    """
-    # Import operators lazily to avoid circular imports
-    from factorminer.core.expression_tree import _ema, _wma, _rolling_apply  # type: ignore[attr-defined]
-    from factorminer.core.expression_tree import _ts_rank, _ts_mean, _ts_std  # type: ignore[attr-defined]
-
-    candidates: Dict[str, np.ndarray] = {}
-
-    close = data.get("$close")
-    volume = data.get("$volume")
-    returns = data.get("$returns")
-    high = data.get("$high")
-    low = data.get("$low")
-    amt = data.get("$amt")
-    vwap = data.get("$vwap")
-
-    def _safe_add(name: str, arr: Optional[np.ndarray]) -> None:
-        if arr is not None and isinstance(arr, np.ndarray):
-            candidates[name] = arr
-
-    # EMA variants
-    if close is not None:
-        for w in (3, 5, 10, 20):
-            _safe_add(f"EMA($close, {w})", _ema(close, w))
-        # Delta (momentum)
-        for w in (1, 3, 5, 10):
-            M, T = close.shape
-            out = np.full_like(close, np.nan, dtype=np.float64)
-            if w < T:
-                out[:, w:] = close[:, w:] - close[:, :-w]
-            _safe_add(f"Delta($close, {w})", out)
-        # Rolling return
-        for w in (1, 3, 5, 10):
-            M, T = close.shape
-            out = np.full_like(close, np.nan, dtype=np.float64)
-            if w < T:
-                prev = close[:, :-w]
-                ok = np.abs(prev) > 1e-10
-                out[: , w:][ok] = close[:, w:][ok] / prev[ok] - 1.0
-            _safe_add(f"Return($close, {w})", out)
-        # TsRank
-        for w in (5, 10, 20):
-            _safe_add(f"TsRank($close, {w})", _rolling_apply(close, w, _ts_rank))
-        # Rolling std
-        for w in (5, 10, 20):
-            _safe_add(f"Std($close, {w})", _rolling_apply(close, w, _ts_std))
-        # Rolling mean
-        for w in (5, 10, 20):
-            _safe_add(f"Mean($close, {w})", _rolling_apply(close, w, _ts_mean))
-
-    if volume is not None:
-        for w in (5, 10, 20):
-            _safe_add(f"TsRank($volume, {w})", _rolling_apply(volume, w, _ts_rank))
-            _safe_add(f"EMA($volume, {w})", _ema(volume, w))
-        for w in (1, 3, 5):
-            M, T = volume.shape
-            out = np.full_like(volume, np.nan, dtype=np.float64)
-            if w < T:
-                out[:, w:] = volume[:, w:] - volume[:, :-w]
-            _safe_add(f"Delta($volume, {w})", out)
-
-    if returns is not None:
-        for w in (5, 10, 20):
-            _safe_add(f"Std($returns, {w})", _rolling_apply(returns, w, _ts_std))
-            _safe_add(f"Mean($returns, {w})", _rolling_apply(returns, w, _ts_mean))
-            _safe_add(f"TsRank($returns, {w})", _rolling_apply(returns, w, _ts_rank))
-
-    # VWAP-close spread (price-quality signal)
-    if close is not None and vwap is not None:
-        spread = close - vwap
-        _safe_add("Sub($close, $vwap)", spread)
-        for w in (5, 10):
-            _safe_add(f"EMA(Sub($close,$vwap),{w})", _ema(spread, w))
-
-    # High-low range (volatility proxy)
-    if high is not None and low is not None:
-        hl_range = high - low
-        _safe_add("Sub($high, $low)", hl_range)
-        for w in (5, 10, 20):
-            _safe_add(f"Mean(Sub($high,$low),{w})", _rolling_apply(hl_range, w, _ts_mean))
-
-    return candidates
-
-
-def distill_to_symbolic(
-    leaf: "NeuralLeaf",
-    data: Dict[str, np.ndarray],
-    feature_order: Optional[List[str]] = None,
-) -> DistillationResult:
-    """Find the symbolic formula that best approximates the neural leaf.
-
-    Evaluates the leaf on *data*, then computes the Pearson and Spearman
-    correlation between the leaf output and every formula in a curated
-    candidate set.  The candidate with the highest absolute Pearson
-    correlation is chosen as the distillation target.
-
-    Parameters
-    ----------
-    leaf : NeuralLeaf
-        A trained neural leaf.
-    data : dict[str, np.ndarray]
-        Market data dict mapping feature name -> (M, T) array.
-    feature_order : list of str, optional
-        Order of features in the (M, T, F) stack passed to the leaf.
-        Defaults to ``_DEFAULT_FEATURES``.
-
-    Returns
-    -------
-    DistillationResult
-    """
-    feature_order = feature_order or _DEFAULT_FEATURES
-
-    # Build feature tensor (M, T, F)
-    ref_arr = next(iter(data.values()))
-    M, T = ref_arr.shape
-    F = len(feature_order)
-    features_3d = np.stack(
-        [data.get(f, np.full((M, T), np.nan)) for f in feature_order],
-        axis=-1,
-    )  # (M, T, F)
-
-    # Evaluate neural leaf -> (M, T)
-    leaf_output = leaf.evaluate(features_3d)
-
-    # Flatten for correlation computation
-    leaf_flat = leaf_output.ravel()
-
-    # Build symbolic candidates
-    candidates = _build_symbolic_candidates(data)
-
-    scores: Dict[str, float] = {}
-    for formula, arr in candidates.items():
-        r = _pearson_corr(leaf_flat, arr.ravel())
-        scores[formula] = abs(r)  # rank by |r|
-
-    if not scores:
-        logger.warning("distill_to_symbolic: no symbolic candidates available.")
-        return DistillationResult(
-            formula="NeuralLeaf(no_candidates)",
-            correlation=0.0,
-            rank_correlation=0.0,
-            candidate_scores={},
-        )
-
-    best_formula = max(scores, key=lambda k: scores[k])
-    best_arr = candidates[best_formula]
-    best_r = _pearson_corr(leaf_flat, best_arr.ravel())
-
-    try:
-        best_rho = _spearman_corr(leaf_flat, best_arr.ravel())
-    except ImportError:
-        best_rho = 0.0
-        logger.debug("distill_to_symbolic: scipy not available, Spearman correlation skipped.")
-
-    logger.info(
-        "Distillation: best formula='%s', Pearson r=%.4f, Spearman rho=%.4f",
-        best_formula,
-        best_r,
-        best_rho,
-    )
-
-    return DistillationResult(
-        formula=best_formula,
-        correlation=best_r,
-        rank_correlation=best_rho,
-        candidate_scores={k: v for k, v in sorted(scores.items(), key=lambda x: -x[1])},
-    )
-
-
-# ===========================================================================
-# Expression Tree Integration
-# ===========================================================================
-
-class NeuralLeafNode:
-    """A node that wraps a NeuralLeaf for use inside expression trees.
-
-    Implements the same interface as ``factorminer.core.expression_tree.Node``
-    so it can be dropped into any tree position that expects a (M, T) output.
-
-    Crucially, this node does NOT inherit from ``Node`` to avoid coupling to
-    the abstract base class, but it exposes the same public methods so that
-    ExpressionTree machinery works without modification.
-
-    Parameters
-    ----------
-    leaf : NeuralLeaf
-        The trained (or untrained) neural leaf.
-    feature_order : list of str, optional
-        Feature channels fed to the leaf, in order.
-        Defaults to ``_DEFAULT_FEATURES``.
-    distilled_formula : str, optional
-        If set, ``to_string()`` returns this formula instead of the neural
-        leaf name.  Used after distillation for interpretable serialisation.
-    """
-
-    def __init__(
-        self,
-        leaf: "NeuralLeaf",
-        feature_order: Optional[List[str]] = None,
-        distilled_formula: Optional[str] = None,
-    ) -> None:
-        self._leaf = leaf
-        self._feature_order = feature_order or _DEFAULT_FEATURES
-        self._distilled_formula = distilled_formula
-
-    # ------------------------------------------------------------------
-    # Node interface
-    # ------------------------------------------------------------------
-
-    def evaluate(self, data: Dict[str, np.ndarray]) -> np.ndarray:
-        """Compute the leaf signal on market data.
-
-        Parameters
-        ----------
-        data : dict[str, np.ndarray]
-            Maps feature names to (M, T) arrays.
-
-        Returns
-        -------
-        np.ndarray, shape (M, T)
-        """
-        ref = next(iter(data.values()))
-        M, T = ref.shape
-        F = len(self._feature_order)
-        features_3d = np.stack(
-            [data.get(f, np.full((M, T), np.nan)) for f in self._feature_order],
-            axis=-1,
-        )
-        return self._leaf.evaluate(features_3d)
-
-    def to_string(self) -> str:
-        """DSL serialisation.  Returns distilled formula when available."""
-        if self._distilled_formula:
-            return self._distilled_formula
-        return f"NeuralLeaf({self._leaf.name})"
-
-    def depth(self) -> int:
-        return 1
-
-    def size(self) -> int:
-        return 1
-
-    def clone(self) -> "NeuralLeafNode":
-        return NeuralLeafNode(
-            leaf=self._leaf,  # shared reference — leaf weights are shared
-            feature_order=list(self._feature_order),
-            distilled_formula=self._distilled_formula,
-        )
-
-    def leaf_features(self) -> List[str]:
-        return sorted(self._feature_order)
-
-    def __repr__(self) -> str:
-        return self.to_string()
-
-    # ------------------------------------------------------------------
-    # Extra helpers
-    # ------------------------------------------------------------------
-
-    @property
-    def neural_leaf(self) -> "NeuralLeaf":
-        return self._leaf
-
-    def set_distilled_formula(self, formula: str) -> None:
-        """Pin the distilled formula used by ``to_string()``."""
-        self._distilled_formula = formula
-
-
-# ===========================================================================
-# SymbolicShell — presents a neural leaf as a typed operator
-# ===========================================================================
-
-class SymbolicShell:
-    """Wraps a NeuralLeaf as a callable operator compatible with the DSL.
-
-    After distillation, the internal NeuralLeafNode can be replaced with
-    its symbolic approximation by calling ``replace_with_symbolic()``.
-
-    Parameters
-    ----------
-    name : str
-        Operator name used in the registry and DSL strings.
-    leaf_node : NeuralLeafNode
-        The node wrapping the trained leaf.
-
-    Usage
-    -----
-    ::
-
-        shell = SymbolicShell("NeuralMomentum", leaf_node)
-        signal = shell(data)                    # (M, T) array
-        distilled = shell.distill(data)         # DistillationResult
-        shell.replace_with_symbolic(distilled.formula)
-        print(shell.formula_string)             # "EMA($close, 10)"
-    """
-
-    def __init__(self, name: str, leaf_node: NeuralLeafNode) -> None:
-        self.name = name
-        self._node = leaf_node
-        self._is_distilled = False
-
-    def __call__(self, data: Dict[str, np.ndarray]) -> np.ndarray:
-        """Evaluate the operator on market data."""
-        return self._node.evaluate(data)
-
-    @property
-    def formula_string(self) -> str:
-        """Current DSL formula (neural or distilled)."""
-        return self._node.to_string()
-
-    @property
-    def is_distilled(self) -> bool:
-        return self._is_distilled
-
-    def distill(
-        self,
-        data: Dict[str, np.ndarray],
-        feature_order: Optional[List[str]] = None,
-    ) -> DistillationResult:
-        """Run distillation and return the result without modifying state."""
-        return distill_to_symbolic(
-            self._node.neural_leaf,
-            data,
-            feature_order=feature_order,
-        )
-
-    def replace_with_symbolic(self, formula: str) -> None:
-        """Pin a distilled symbolic formula to this shell.
-
-        After calling this, ``formula_string`` and ``to_string()`` return
-        *formula* instead of the neural leaf name.
-
-        Parameters
-        ----------
-        formula : str
-            Symbolic formula string (DSL notation).
-        """
-        self._node.set_distilled_formula(formula)
-        self._is_distilled = True
-        logger.info("SymbolicShell '%s' replaced with symbolic formula: %s", self.name, formula)
-
-    def __repr__(self) -> str:
-        state = "distilled" if self._is_distilled else "neural"
-        return f"SymbolicShell({self.name!r}, {state}, formula={self.formula_string!r})"
-
-
-# ===========================================================================
-# NeuralLeafRegistry
-# ===========================================================================
-
-class NeuralLeafRegistry:
-    """Registry of named, trained NeuralLeaf models.
-
-    Provides named storage, persistence, and lookup of NeuralLeaf instances.
-    Trained weights are persisted via ``torch.save`` / ``torch.load``.
-
-    Parameters
-    ----------
-    storage_dir : str, optional
-        Directory where weights are saved.  Defaults to the system temp dir.
-
-    Example
-    -------
-    ::
-
-        registry = NeuralLeafRegistry(storage_dir="/tmp/neural_leaves")
-        leaf = train_neural_leaf("NeuralMomentum", features, returns)
-        registry.register("NeuralMomentum", leaf)
-        registry.save("NeuralMomentum")
-
-        # Later:
-        registry.load("NeuralMomentum")
-        leaf = registry.get("NeuralMomentum")
-    """
-
-    def __init__(self, storage_dir: Optional[str] = None) -> None:
-        import tempfile
-
-        self._storage_dir = storage_dir or os.path.join(tempfile.gettempdir(), "neural_leaves")
-        os.makedirs(self._storage_dir, exist_ok=True)
-        self._leaves: Dict[str, NeuralLeaf] = {}
-
-    # ------------------------------------------------------------------
-    # CRUD
-    # ------------------------------------------------------------------
-
-    def register(self, name: str, leaf: "NeuralLeaf") -> None:
-        """Register a trained leaf under *name*."""
-        self._leaves[name] = leaf
-        logger.info("NeuralLeafRegistry: registered '%s'.", name)
-
-    def get(self, name: str) -> Optional["NeuralLeaf"]:
-        """Return the leaf registered under *name*, or None."""
-        return self._leaves.get(name)
-
-    def remove(self, name: str) -> None:
-        """Remove a leaf from the in-memory registry."""
-        self._leaves.pop(name, None)
-
-    def available(self) -> List[str]:
-        """Return sorted list of registered leaf names."""
-        return sorted(self._leaves.keys())
-
-    # ------------------------------------------------------------------
-    # Persistence
-    # ------------------------------------------------------------------
-
-    def _path(self, name: str) -> str:
-        safe_name = name.replace("/", "_").replace("\\", "_")
-        return os.path.join(self._storage_dir, f"{safe_name}.pt")
-
-    def save(self, name: str) -> str:
-        """Save a registered leaf's weights to disk.
-
-        Returns
-        -------
-        str
-            Path where the file was saved.
-
-        Raises
-        ------
-        KeyError
-            If *name* is not registered.
-        RuntimeError
-            If PyTorch is unavailable.
-        """
-        if not _TORCH_AVAILABLE:
-            raise RuntimeError("PyTorch not available; cannot save NeuralLeaf.")
-        leaf = self._leaves.get(name)
-        if leaf is None:
-            raise KeyError(f"NeuralLeafRegistry: no leaf named '{name}'.")
-        path = self._path(name)
-        torch.save(
-            {
-                "name": leaf.name,
-                "window_size": leaf.window_size,
-                "n_features": leaf.n_features,
-                "hidden_dim": leaf.hidden_dim,
-                "state_dict": leaf.state_dict(),
-            },
-            path,
-        )
-        logger.info("Saved NeuralLeaf '%s' to %s", name, path)
-        return path
-
-    def load(self, name: str, path: Optional[str] = None) -> "NeuralLeaf":
-        """Load a leaf from disk and register it.
-
-        Parameters
-        ----------
-        name : str
-            Registry name to assign (may differ from the file's embedded name).
-        path : str, optional
-            Explicit file path.  If omitted, uses the default storage path.
-
-        Returns
-        -------
-        NeuralLeaf
-        """
-        if not _TORCH_AVAILABLE:
-            raise RuntimeError("PyTorch not available; cannot load NeuralLeaf.")
-        file_path = path or self._path(name)
-        if not os.path.exists(file_path):
-            raise FileNotFoundError(f"NeuralLeaf weights not found at '{file_path}'.")
-        ckpt = torch.load(file_path, map_location="cpu", weights_only=True)
-        leaf = NeuralLeaf(
-            window_size=ckpt["window_size"],
-            n_features=ckpt["n_features"],
-            hidden_dim=ckpt["hidden_dim"],
-            name=ckpt["name"],
-        )
-        leaf.load_state_dict(ckpt["state_dict"])
-        leaf.eval()
-        self._leaves[name] = leaf
-        logger.info("Loaded NeuralLeaf '%s' from %s", name, file_path)
-        return leaf
-
-    def save_all(self) -> Dict[str, str]:
-        """Save all registered leaves.  Returns name -> path mapping."""
-        return {name: self.save(name) for name in self._leaves}
-
-    def load_all(self) -> List[str]:
-        """Load all .pt files from the storage directory.  Returns loaded names."""
-        loaded = []
-        for fname in os.listdir(self._storage_dir):
-            if fname.endswith(".pt"):
-                name = fname[:-3]
-                try:
-                    self.load(name)
-                    loaded.append(name)
-                except Exception as exc:  # noqa: BLE001
-                    logger.warning("Failed to load '%s': %s", name, exc)
-        return loaded
-
-
-# ===========================================================================
-# NeuralOperatorIntegration — high-level orchestration
-# ===========================================================================
-
-@dataclass
-class NeuralLeafConfig:
-    """Configuration for a single named neural leaf.
-
-    Attributes
-    ----------
-    name : str
-        Registry name (e.g. ``"NeuralMomentum"``).
-    window_size : int
-        Rolling window size.
-    n_epochs : int
-        Training epochs.
-    lr : float
-        Adam learning rate.
-    hidden_dim : int
-        Hidden layer width.
-    description : str
-        Human-readable description.
-    """
-
-    name: str
-    window_size: int = 10
-    n_epochs: int = 100
-    lr: float = 1e-3
-    hidden_dim: int = _HIDDEN_DIM
-    description: str = ""
-
-
-class NeuralOperatorIntegration:
-    """Orchestrates training, distillation, and persistence of neural leaves.
-
-    This is the main entry point for integrating neural leaves into a
-    HelixFactor workflow.
-
-    Parameters
-    ----------
-    registry : NeuralLeafRegistry, optional
-        Shared registry.  A new one is created if not provided.
-    feature_order : list of str, optional
-        Feature channels expected in the (M, T, F) input tensor.
-
-    Example
-    -------
-    ::
-
-        integration = NeuralOperatorIntegration()
-        configs = [
-            NeuralLeafConfig("NeuralMomentum", window_size=10),
-            NeuralLeafConfig("NeuralReversal", window_size=5),
-            NeuralLeafConfig("NeuralVolume",   window_size=10),
-        ]
-        integration.train_all_leaves(features_3d, returns, configs)
-        distilled = integration.distill_all(data_dict)
-        integration.save("/tmp/my_leaves")
-    """
-
-    def __init__(
-        self,
-        registry: Optional[NeuralLeafRegistry] = None,
-        feature_order: Optional[List[str]] = None,
-    ) -> None:
-        self._registry = registry or NeuralLeafRegistry()
-        self._feature_order = feature_order or _DEFAULT_FEATURES
-        self._distillation_results: Dict[str, DistillationResult] = {}
-
-    # ------------------------------------------------------------------
-    # Training
-    # ------------------------------------------------------------------
-
-    def train_all_leaves(
-        self,
-        features: np.ndarray,
-        returns: np.ndarray,
-        leaf_configs: List[NeuralLeafConfig],
-        device: Optional["torch.device"] = None,
-        verbose: bool = False,
-    ) -> None:
-        """Train all listed neural leaves and register them.
-
-        Parameters
-        ----------
-        features : np.ndarray, shape (M, T, F)
-            Market feature tensor in the order given by ``feature_order``.
-        returns : np.ndarray, shape (M, T)
-            Forward returns for training targets.
-        leaf_configs : list of NeuralLeafConfig
-            One entry per leaf to train.
-        device : torch.device, optional
-        verbose : bool
-            Pass through to training loop for debug logging.
-        """
-        for cfg in leaf_configs:
-            logger.info("Training NeuralLeaf '%s'…", cfg.name)
-            leaf = train_neural_leaf(
-                name=cfg.name,
-                features=features,
-                returns=returns,
-                window_size=cfg.window_size,
-                n_epochs=cfg.n_epochs,
-                lr=cfg.lr,
-                hidden_dim=cfg.hidden_dim,
-                device=device,
-                verbose=verbose,
-            )
-            if leaf is not None:
-                self._registry.register(cfg.name, leaf)
-            else:
-                logger.warning("Training failed for '%s', skipping.", cfg.name)
-
-    # ------------------------------------------------------------------
-    # Distillation
-    # ------------------------------------------------------------------
-
-    def distill_all(
-        self,
-        data: Dict[str, np.ndarray],
-    ) -> Dict[str, str]:
-        """Distill all registered leaves and return name -> best formula.
-
-        Parameters
-        ----------
-        data : dict[str, np.ndarray]
-            Market data dict (same format used for expression tree evaluation).
-
-        Returns
-        -------
-        dict
-            Maps leaf name to its best symbolic approximation formula string.
-        """
-        results: Dict[str, str] = {}
-        for name in self._registry.available():
-            leaf = self._registry.get(name)
-            if leaf is None:
-                continue
-            distilled = distill_to_symbolic(
-                leaf, data, feature_order=self._feature_order
-            )
-            self._distillation_results[name] = distilled
-            results[name] = distilled.formula
-            logger.info(
-                "Distilled '%s' -> '%s' (r=%.4f, rho=%.4f)",
-                name,
-                distilled.formula,
-                distilled.correlation,
-                distilled.rank_correlation,
-            )
-        return results
-
-    # ------------------------------------------------------------------
-    # Registry accessors
-    # ------------------------------------------------------------------
-
-    def get_available_leaves(self) -> List[str]:
-        """Return names of all registered leaves."""
-        return self._registry.available()
-
-    def get_leaf(self, name: str) -> Optional["NeuralLeaf"]:
-        """Return the NeuralLeaf registered under *name*, or None."""
-        return self._registry.get(name)
-
-    def get_distillation_result(self, name: str) -> Optional[DistillationResult]:
-        """Return the stored DistillationResult for *name*, or None."""
-        return self._distillation_results.get(name)
-
-    def as_node(self, name: str) -> Optional[NeuralLeafNode]:
-        """Return a NeuralLeafNode ready for use in an expression tree.
-
-        If distillation has been run, the formula string is automatically set
-        on the returned node.
-
-        Parameters
-        ----------
-        name : str
-
-        Returns
-        -------
-        NeuralLeafNode or None
-        """
-        leaf = self._registry.get(name)
-        if leaf is None:
-            return None
-        distilled_formula = None
-        if name in self._distillation_results:
-            distilled_formula = self._distillation_results[name].formula
-        return NeuralLeafNode(
-            leaf=leaf,
-            feature_order=self._feature_order,
-            distilled_formula=distilled_formula,
-        )
-
-    def as_shell(self, name: str) -> Optional[SymbolicShell]:
-        """Return a SymbolicShell for *name*, or None if unknown."""
-        node = self.as_node(name)
-        if node is None:
-            return None
-        shell = SymbolicShell(name=name, leaf_node=node)
-        if name in self._distillation_results:
-            shell.replace_with_symbolic(self._distillation_results[name].formula)
-        return shell
-
-    # ------------------------------------------------------------------
-    # Persistence
-    # ------------------------------------------------------------------
-
-    def save(self, path: str) -> None:
-        """Save all registered leaves to *path* (directory).
-
-        Parameters
-        ----------
-        path : str
-            Target directory.  Will be created if it does not exist.
-        """
-        os.makedirs(path, exist_ok=True)
-        old_dir = self._registry._storage_dir
-        self._registry._storage_dir = path
-        self._registry.save_all()
-        self._registry._storage_dir = old_dir
-        logger.info("Saved %d neural leaves to %s", len(self._registry.available()), path)
-
-    def load(self, path: str) -> None:
-        """Load all .pt files from *path* into the registry.
-
-        Parameters
-        ----------
-        path : str
-            Directory containing .pt weight files.
-        """
-        if not os.path.isdir(path):
-            raise FileNotFoundError(f"NeuralOperatorIntegration.load: '{path}' is not a directory.")
-        old_dir = self._registry._storage_dir
-        self._registry._storage_dir = path
-        loaded = self._registry.load_all()
-        self._registry._storage_dir = old_dir
-        logger.info("Loaded %d neural leaves from %s", len(loaded), path)
-
-
-# ===========================================================================
-# Registry hook — exposes neural leaves to the operator registry
-# ===========================================================================
-
-# Global singleton, populated lazily when neural leaves are trained/loaded.
-_GLOBAL_REGISTRY: Optional[NeuralLeafRegistry] = None
-
-
-def get_global_neural_registry() -> NeuralLeafRegistry:
-    """Return (and lazily create) the global NeuralLeafRegistry."""
-    global _GLOBAL_REGISTRY
-    if _GLOBAL_REGISTRY is None:
-        _GLOBAL_REGISTRY = NeuralLeafRegistry()
-    return _GLOBAL_REGISTRY
-
-
-def register_neural_leaves_in_operator_registry() -> None:
-    """Expose registered neural leaves to the main operator OPERATOR_REGISTRY.
-
-    This function should be called AFTER leaves have been trained / loaded.
-    Each leaf is added to the registry with:
-      - A synthetic OperatorSpec (category AUTO_INVENTED, arity 0 — the leaf
-        takes the full data dict rather than individual array inputs).
-      - A numpy_fn that calls ``leaf.evaluate(features_3d)`` after assembling
-        the feature tensor from the data dict.
-      - No PyTorch fn (the leaf already uses PyTorch internally).
-
-    This allows the broader HelixFactor system to treat neural leaves as
-    first-class operators that can appear in search spaces and fitness
-    evaluation loops.
-    """
-    try:
-        from factorminer.operators.registry import OPERATOR_REGISTRY  # type: ignore[attr-defined]
-        from factorminer.core.types import OperatorSpec, OperatorType, SignatureType
-    except ImportError:
-        logger.debug("register_neural_leaves_in_operator_registry: operator registry not available.")
-        return
-
-    registry = get_global_neural_registry()
-    for name in registry.available():
-        if name in OPERATOR_REGISTRY:
-            continue  # already registered
-
-        leaf = registry.get(name)
-        if leaf is None:
-            continue
-
-        feature_order = _DEFAULT_FEATURES
-
-        # Capture leaf in closure
-        def _make_np_fn(captured_leaf, captured_order):
-            def _np_fn(data: Dict[str, np.ndarray]) -> np.ndarray:
-                ref = next(iter(data.values()))
-                M, T = ref.shape
-                F = len(captured_order)
-                features_3d = np.stack(
-                    [data.get(f, np.full((M, T), np.nan)) for f in captured_order],
-                    axis=-1,
-                )
-                return captured_leaf.evaluate(features_3d)
-
-            return _np_fn
-
-        np_fn = _make_np_fn(leaf, feature_order)
-
-        spec = OperatorSpec(
-            name=name,
-            arity=0,  # special: takes data dict, not individual arrays
-            category=OperatorType.AUTO_INVENTED,
-            signature=SignatureType.TIME_SERIES_TO_TIME_SERIES,
-            description=f"NeuralLeaf: {name}",
-        )
-        OPERATOR_REGISTRY[name] = (spec, np_fn, None)
-        logger.info("Registered neural leaf '%s' in OPERATOR_REGISTRY.", name)
-
-
-# ===========================================================================
-# Convenience: build standard leaves from mock data
-# ===========================================================================
-
-def build_default_neural_leaves(
-    num_assets: int = 20,
-    num_periods: int = 500,
-    window_size: int = 10,
-    n_epochs: int = 50,
-    seed: int = 42,
-    verbose: bool = False,
-) -> NeuralOperatorIntegration:
-    """Train the three standard neural leaves on synthetic mock data.
-
-    Intended for quick experimentation and testing.  Uses
-    ``factorminer.data.mock_data.generate_mock_data`` internally.
-
-    The three leaves are:
-    - ``NeuralMomentum``: captures price trend and momentum patterns.
-    - ``NeuralReversal``: captures short-term mean-reversion signals.
-    - ``NeuralVolume``: captures volume-price interaction signals.
-
-    Parameters
-    ----------
-    num_assets : int
-    num_periods : int
-    window_size : int
-    n_epochs : int
-    seed : int
-    verbose : bool
-
-    Returns
-    -------
-    NeuralOperatorIntegration
-        Fully initialised integration with trained leaves.
-    """
-    from factorminer.data.mock_data import MockConfig, generate_mock_data
-
-    config = MockConfig(
-        num_assets=num_assets,
-        num_periods=num_periods,
-        seed=seed,
-        plant_alpha=True,
-        alpha_strength=0.03,
-    )
-    df = generate_mock_data(config)
-
-    # Pivot to (M, T) arrays
-    df_sorted = df.sort_values(["asset_id", "datetime"])
-    assets = sorted(df_sorted["asset_id"].unique())
-    M = len(assets)
-    T = df_sorted.groupby("asset_id").size().min()
-
-    def _pivot(col: str) -> np.ndarray:
-        return np.array(
-            [df_sorted[df_sorted["asset_id"] == a][col].values[:T] for a in assets],
-            dtype=np.float64,
-        )
-
-    close = _pivot("close")
-    high = _pivot("high")
-    low = _pivot("low")
-    open_ = _pivot("open")
-    volume = _pivot("volume")
-    amount = _pivot("amount")
-    # Derive returns and vwap
-    ret = np.full_like(close, np.nan)
-    ret[:, 1:] = close[:, 1:] / np.where(close[:, :-1] > 1e-10, close[:, :-1], np.nan) - 1.0
-    vwap = (high + low + close) / 3.0
-
-    data_dict: Dict[str, np.ndarray] = {
-        "$open": open_,
-        "$high": high,
-        "$low": low,
-        "$close": close,
-        "$volume": volume,
-        "$amt": amount,
-        "$vwap": vwap,
-        "$returns": ret,
-    }
-
-    # Stack features in the canonical order
-    features_3d = np.stack(
-        [data_dict[f] for f in _DEFAULT_FEATURES],
-        axis=-1,
-    )  # (M, T, F)
-
-    # Forward returns: shift by 1
-    fwd_returns = np.full_like(close, np.nan)
-    fwd_returns[:, :-1] = ret[:, 1:]
-
-    configs = [
-        NeuralLeafConfig(
-            "NeuralMomentum",
-            window_size=window_size,
-            n_epochs=n_epochs,
-            description="Learns price-momentum patterns from OHLCV windows",
-        ),
-        NeuralLeafConfig(
-            "NeuralReversal",
-            window_size=max(5, window_size // 2),
-            n_epochs=n_epochs,
-            description="Learns short-term mean-reversion signals",
-        ),
-        NeuralLeafConfig(
-            "NeuralVolume",
-            window_size=window_size,
-            n_epochs=n_epochs,
-            description="Learns volume-price interaction patterns",
-        ),
-    ]
-
-    integration = NeuralOperatorIntegration(feature_order=_DEFAULT_FEATURES)
-    integration.train_all_leaves(
-        features=features_3d,
-        returns=fwd_returns,
-        leaf_configs=configs,
-        verbose=verbose,
-    )
-
-    # Distill to symbolic
-    integration.distill_all(data_dict)
-
-    return integration
-
-
-# ===========================================================================
-# Public API
-# ===========================================================================
-
-__all__ = [
-    # Core classes
-    "NeuralLeaf",
-    "NeuralLeafNode",
-    "NeuralLeafRegistry",
-    "SymbolicShell",
-    # Training
-    "train_neural_leaf",
-    "NeuralLeafConfig",
-    # Distillation
-    "distill_to_symbolic",
-    "DistillationResult",
-    # Orchestration
-    "NeuralOperatorIntegration",
-    # Registry integration
-    "get_global_neural_registry",
-    "register_neural_leaves_in_operator_registry",
-    # Convenience
-    "build_default_neural_leaves",
-    # Constants
-    "_DEFAULT_FEATURES",
-    "_TORCH_AVAILABLE",
-]
diff --git a/src/factorminer/factorminer/operators/registry.py b/src/factorminer/factorminer/operators/registry.py
deleted file mode 100644
index ca9cff7..0000000
--- a/src/factorminer/factorminer/operators/registry.py
+++ /dev/null
@@ -1,142 +0,0 @@
-"""Central operator registry mapping names to implementations and specs.
-
-Combines the ``OperatorSpec`` definitions from ``core.types`` with the concrete
-NumPy / PyTorch function implementations from each category module.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-
-from src.factorminer.factorminer.core.types import OPERATOR_REGISTRY as SPEC_REGISTRY
-from src.factorminer.factorminer.core.types import OperatorSpec, OperatorType
-
-from src.factorminer.factorminer.operators.arithmetic import ARITHMETIC_OPS
-from src.factorminer.factorminer.operators.statistical import STATISTICAL_OPS
-from src.factorminer.factorminer.operators.timeseries import TIMESERIES_OPS
-from src.factorminer.factorminer.operators.crosssectional import CROSSSECTIONAL_OPS
-from src.factorminer.factorminer.operators.smoothing import SMOOTHING_OPS
-from src.factorminer.factorminer.operators.regression import REGRESSION_OPS
-from src.factorminer.factorminer.operators.logical import LOGICAL_OPS
-
-try:
-    import torch
-
-    _TORCH = True
-except ImportError:
-    torch = None  # type: ignore[assignment]
-    _TORCH = False
-
-# ---------------------------------------------------------------------------
-# Build unified registry: name -> (OperatorSpec, np_fn, torch_fn)
-# ---------------------------------------------------------------------------
-
-_ALL_IMPL_TABLES: List[Dict[str, Tuple[Callable, Callable]]] = [
-    ARITHMETIC_OPS,
-    STATISTICAL_OPS,
-    TIMESERIES_OPS,
-    CROSSSECTIONAL_OPS,
-    SMOOTHING_OPS,
-    REGRESSION_OPS,
-    LOGICAL_OPS,
-]
-
-# Merge implementation tables
-_IMPL: Dict[str, Tuple[Callable, Callable]] = {}
-for table in _ALL_IMPL_TABLES:
-    _IMPL.update(table)
-
-# The full registry: name -> (spec, numpy_fn, torch_fn)
-OPERATOR_REGISTRY: Dict[str, Tuple[OperatorSpec, Callable, Optional[Callable]]] = {}
-
-for name, spec in SPEC_REGISTRY.items():
-    if name in _IMPL:
-        np_fn, torch_fn = _IMPL[name]
-        OPERATOR_REGISTRY[name] = (spec, np_fn, torch_fn)
-    else:
-        # Spec exists but no implementation yet -- register with None fns
-        OPERATOR_REGISTRY[name] = (spec, None, None)  # type: ignore[assignment]
-
-
-# ---------------------------------------------------------------------------
-# Public API
-# ---------------------------------------------------------------------------
-
-def get_operator(name: str) -> OperatorSpec:
-    """Look up an operator spec by name."""
-    if name not in OPERATOR_REGISTRY:
-        raise KeyError(
-            f"Unknown operator '{name}'. "
-            f"Available: {sorted(OPERATOR_REGISTRY.keys())}"
-        )
-    return OPERATOR_REGISTRY[name][0]
-
-
-def get_impl(name: str, backend: str = "numpy") -> Callable:
-    """Return the implementation function for a given operator and backend."""
-    if name not in OPERATOR_REGISTRY:
-        raise KeyError(f"Unknown operator '{name}'")
-    spec, np_fn, torch_fn = OPERATOR_REGISTRY[name]
-    if backend == "torch" or backend == "gpu":
-        if torch_fn is None:
-            raise NotImplementedError(f"No PyTorch implementation for '{name}'")
-        return torch_fn
-    if np_fn is None:
-        raise NotImplementedError(f"No NumPy implementation for '{name}'")
-    return np_fn
-
-
-def execute_operator(
-    name: str,
-    *inputs: Any,
-    params: Optional[Dict[str, Any]] = None,
-    backend: str = "numpy",
-) -> Union[np.ndarray, "torch.Tensor"]:
-    """Execute an operator by name.
-
-    Parameters
-    ----------
-    name : str
-        Operator name (e.g. ``"Add"``, ``"Mean"``).
-    *inputs : array-like
-        Positional data inputs (1, 2, or 3 depending on arity).
-    params : dict, optional
-        Extra keyword parameters (e.g. ``{"window": 20}``).
-    backend : str
-        ``"numpy"`` or ``"torch"`` / ``"gpu"``.
-
-    Returns
-    -------
-    np.ndarray or torch.Tensor
-    """
-    fn = get_impl(name, backend)
-    kw = params or {}
-    return fn(*inputs, **kw)
-
-
-def list_operators(grouped: bool = True) -> Union[List[str], Dict[str, List[str]]]:
-    """List all registered operator names.
-
-    Parameters
-    ----------
-    grouped : bool
-        If True, return a dict mapping category name -> list of op names.
-        If False, return a flat sorted list.
-    """
-    if not grouped:
-        return sorted(OPERATOR_REGISTRY.keys())
-
-    groups: Dict[str, List[str]] = {}
-    for name, (spec, _, _) in OPERATOR_REGISTRY.items():
-        cat = spec.category.name
-        groups.setdefault(cat, []).append(name)
-    for cat in groups:
-        groups[cat].sort()
-    return groups
-
-
-def implemented_operators() -> List[str]:
-    """Return names of operators that have at least a NumPy implementation."""
-    return sorted(name for name, (_, np_fn, _) in OPERATOR_REGISTRY.items() if np_fn is not None)
diff --git a/src/factorminer/factorminer/operators/regression.py b/src/factorminer/factorminer/operators/regression.py
deleted file mode 100644
index a0e8dfc..0000000
--- a/src/factorminer/factorminer/operators/regression.py
+++ /dev/null
@@ -1,167 +0,0 @@
-"""Rolling linear-regression operators.
-
-Each function regresses x against a simple time index [0, 1, ..., window-1]
-within a rolling window along axis=1.  Input/output shape: ``(M, T)``.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-try:
-    import torch
-except ImportError:
-    torch = None  # type: ignore[assignment]
-
-
-# ===========================================================================
-# NumPy implementations
-# ===========================================================================
-
-def _linreg_components_np(x: np.ndarray, window: int):
-    """Compute slope, intercept, and fitted values for rolling OLS vs time index."""
-    window = int(window)
-    M, T = x.shape
-
-    from factorminer.operators.statistical import _rolling_np, _pad_front
-
-    w = _rolling_np(x, window)
-    if w is None:
-        nan = np.full_like(x, np.nan)
-        return nan, nan, nan, nan
-
-    t_idx = np.arange(window, dtype=np.float64)  # (window,)
-    t_mean = t_idx.mean()
-    t_var = ((t_idx - t_mean) ** 2).sum()
-
-    x_mean = np.nanmean(w, axis=2, keepdims=True)  # (M, T-w+1, 1)
-    # covariance of x with t_idx
-    cov_xt = np.nansum((w - x_mean) * (t_idx - t_mean), axis=2)  # (M, T-w+1)
-
-    slope = cov_xt / t_var  # (M, T-w+1)
-    intercept = x_mean.squeeze(2) - slope * t_mean
-
-    # Fitted value at the last time step in window (t = window - 1)
-    fitted = slope * (window - 1) + intercept
-
-    # Residual at last time step
-    residual = w[:, :, -1] - fitted
-
-    # R-squared
-    ss_res_all = w - (slope[:, :, np.newaxis] * t_idx + intercept[:, :, np.newaxis])
-    ss_res = np.nansum(ss_res_all ** 2, axis=2)
-    ss_tot = np.nansum((w - x_mean) ** 2, axis=2)
-    with np.errstate(invalid="ignore", divide="ignore"):
-        r2 = np.where(ss_tot > 1e-10, 1.0 - ss_res / ss_tot, np.nan)
-
-    slope = _pad_front(slope, window, T)
-    intercept = _pad_front(intercept, window, T)
-    fitted = _pad_front(fitted, window, T)
-    residual = _pad_front(residual, window, T)
-    r2 = _pad_front(r2, window, T)
-
-    return slope, intercept, fitted, residual, r2
-
-
-def ts_linreg_np(x: np.ndarray, window: int = 20) -> np.ndarray:
-    """Rolling linear-regression fitted value."""
-    _, _, fitted, _, _ = _linreg_components_np(x, window)
-    return fitted
-
-
-def ts_linreg_slope_np(x: np.ndarray, window: int = 20) -> np.ndarray:
-    """Rolling linear-regression slope."""
-    slope, _, _, _, _ = _linreg_components_np(x, window)
-    return slope
-
-
-def ts_linreg_intercept_np(x: np.ndarray, window: int = 20) -> np.ndarray:
-    """Rolling linear-regression intercept."""
-    _, intercept, _, _, _ = _linreg_components_np(x, window)
-    return intercept
-
-
-def ts_linreg_resid_np(x: np.ndarray, window: int = 20) -> np.ndarray:
-    """Rolling linear-regression residual at the last time step."""
-    _, _, _, residual, _ = _linreg_components_np(x, window)
-    return residual
-
-
-# ===========================================================================
-# PyTorch implementations
-# ===========================================================================
-
-def _linreg_components_torch(x: "torch.Tensor", window: int):
-    """Vectorized rolling OLS on GPU."""
-    window = int(window)
-    M, T = x.shape
-
-    from factorminer.operators.statistical import _unfold_torch, _pad_front_torch
-
-    w = _unfold_torch(x, window)  # (M, T-w+1, window)
-
-    t_idx = torch.arange(window, dtype=x.dtype, device=x.device)
-    t_mean = t_idx.mean()
-    t_var = ((t_idx - t_mean) ** 2).sum()
-
-    x_mean = w.nanmean(dim=2, keepdim=True)
-    # Handle NaN: replace with 0 for summation
-    w_filled = w.nan_to_num(0.0)
-    not_nan = ~torch.isnan(w)
-    n = not_nan.sum(dim=2, keepdim=True).float()
-
-    # Recompute mean with nan handling
-    cov_xt = ((w_filled - x_mean.nan_to_num(0.0)) * (t_idx - t_mean) * not_nan).sum(dim=2)
-
-    slope = cov_xt / t_var
-    intercept = x_mean.squeeze(2) - slope * t_mean
-
-    fitted = slope * (window - 1) + intercept
-    residual = w[:, :, -1] - fitted
-
-    # R-squared
-    fitted_all = slope.unsqueeze(2) * t_idx + intercept.unsqueeze(2)
-    ss_res = ((w_filled - fitted_all) ** 2 * not_nan).sum(dim=2)
-    ss_tot = ((w_filled - x_mean.nan_to_num(0.0)) ** 2 * not_nan).sum(dim=2)
-    r2 = torch.where(ss_tot > 1e-10, 1.0 - ss_res / ss_tot,
-                     torch.tensor(float("nan"), device=x.device))
-
-    slope = _pad_front_torch(slope, window, T)
-    intercept = _pad_front_torch(intercept, window, T)
-    fitted = _pad_front_torch(fitted, window, T)
-    residual = _pad_front_torch(residual, window, T)
-    r2 = _pad_front_torch(r2, window, T)
-
-    return slope, intercept, fitted, residual, r2
-
-
-def ts_linreg_torch(x: "torch.Tensor", window: int = 20) -> "torch.Tensor":
-    _, _, fitted, _, _ = _linreg_components_torch(x, window)
-    return fitted
-
-
-def ts_linreg_slope_torch(x: "torch.Tensor", window: int = 20) -> "torch.Tensor":
-    slope, _, _, _, _ = _linreg_components_torch(x, window)
-    return slope
-
-
-def ts_linreg_intercept_torch(x: "torch.Tensor", window: int = 20) -> "torch.Tensor":
-    _, intercept, _, _, _ = _linreg_components_torch(x, window)
-    return intercept
-
-
-def ts_linreg_resid_torch(x: "torch.Tensor", window: int = 20) -> "torch.Tensor":
-    _, _, _, residual, _ = _linreg_components_torch(x, window)
-    return residual
-
-
-# ===========================================================================
-# Registration table
-# ===========================================================================
-
-REGRESSION_OPS = {
-    "TsLinReg": (ts_linreg_np, ts_linreg_torch),
-    "TsLinRegSlope": (ts_linreg_slope_np, ts_linreg_slope_torch),
-    "TsLinRegIntercept": (ts_linreg_intercept_np, ts_linreg_intercept_torch),
-    "TsLinRegResid": (ts_linreg_resid_np, ts_linreg_resid_torch),
-}
diff --git a/src/factorminer/factorminer/operators/smoothing.py b/src/factorminer/factorminer/operators/smoothing.py
deleted file mode 100644
index 2e990e3..0000000
--- a/src/factorminer/factorminer/operators/smoothing.py
+++ /dev/null
@@ -1,173 +0,0 @@
-"""Moving average / smoothing operators.
-
-Input shape: ``(M, T)`` -> output shape ``(M, T)``.
-All operate along the time axis (axis=1) per asset row.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-try:
-    import torch
-except ImportError:
-    torch = None  # type: ignore[assignment]
-
-
-# ===========================================================================
-# NumPy implementations
-# ===========================================================================
-
-def sma_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    """Simple moving average (identical to Mean)."""
-    window = int(window)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    # Cumsum trick for O(1) per element
-    cs = np.nancumsum(x, axis=1)
-    out[:, window - 1:] = cs[:, window - 1:]
-    if window > 1:
-        out[:, window - 1:] -= np.concatenate(
-            [np.zeros((M, 1), dtype=np.float64), cs[:, :-window]], axis=1
-        )[:, :T - window + 1]  # fix: just subtract shifted cumsum
-        out[:, window - 1:] = (cs[:, window - 1:] - np.concatenate(
-            [np.zeros((M, 1), dtype=np.float64), cs[:, :-1]], axis=1
-        )[:, :T - window + 1])
-    out[:, window - 1:] /= window
-    out[:, :window - 1] = np.nan
-    return out
-
-
-def ema_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    """Exponential moving average with span = window."""
-    window = int(window)
-    alpha = 2.0 / (window + 1.0)
-    M, T = x.shape
-    out = np.copy(x).astype(np.float64)
-    for t in range(1, T):
-        prev = out[:, t - 1]
-        curr = x[:, t]
-        both_valid = ~np.isnan(prev) & ~np.isnan(curr)
-        only_prev = ~np.isnan(prev) & np.isnan(curr)
-        out[both_valid, t] = alpha * curr[both_valid] + (1 - alpha) * prev[both_valid]
-        out[only_prev, t] = prev[only_prev]
-    return out
-
-
-def dema_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    """Double EMA: 2 * EMA(x) - EMA(EMA(x))."""
-    e1 = ema_np(x, window)
-    e2 = ema_np(e1, window)
-    return 2.0 * e1 - e2
-
-
-def kama_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    """Kaufman Adaptive Moving Average."""
-    window = int(window)
-    fast_sc = 2.0 / (2.0 + 1.0)
-    slow_sc = 2.0 / (30.0 + 1.0)
-    M, T = x.shape
-    out = np.copy(x).astype(np.float64)
-
-    for t in range(window, T):
-        direction = np.abs(x[:, t] - x[:, t - window])
-        volatility = np.nansum(np.abs(np.diff(x[:, t - window:t + 1], axis=1)), axis=1)
-        with np.errstate(invalid="ignore", divide="ignore"):
-            er = np.where(volatility > 1e-10, direction / volatility, 0.0)
-        sc = (er * (fast_sc - slow_sc) + slow_sc) ** 2
-        prev = out[:, t - 1]
-        curr = x[:, t]
-        valid = ~np.isnan(prev) & ~np.isnan(curr)
-        out[valid, t] = prev[valid] + sc[valid] * (curr[valid] - prev[valid])
-    return out
-
-
-def hma_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    """Hull Moving Average: WMA(2*WMA(x, w/2) - WMA(x, w), sqrt(w))."""
-    window = int(window)
-    from factorminer.operators.timeseries import wma_np
-    half = max(int(window / 2), 1)
-    sqrt_w = max(int(np.sqrt(window)), 1)
-    w1 = wma_np(x, half)
-    w2 = wma_np(x, window)
-    diff = 2.0 * w1 - w2
-    return wma_np(diff, sqrt_w)
-
-
-# ===========================================================================
-# PyTorch implementations
-# ===========================================================================
-
-def sma_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    """Simple moving average using conv1d for GPU efficiency."""
-    window = int(window)
-    M, T = x.shape
-    # Use unfold-based approach
-    from factorminer.operators.statistical import _unfold_torch, _pad_front_torch
-    w = _unfold_torch(x, window)
-    result = w.nanmean(dim=2)
-    return _pad_front_torch(result, window, T)
-
-
-def ema_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    """EMA -- sequential by nature, but batch across assets."""
-    window = int(window)
-    alpha = 2.0 / (window + 1.0)
-    M, T = x.shape
-    out = x.clone()
-    for t in range(1, T):
-        prev = out[:, t - 1]
-        curr = x[:, t]
-        both = ~torch.isnan(prev) & ~torch.isnan(curr)
-        only_prev = ~torch.isnan(prev) & torch.isnan(curr)
-        out[both, t] = alpha * curr[both] + (1 - alpha) * prev[both]
-        out[only_prev, t] = prev[only_prev]
-    return out
-
-
-def dema_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    e1 = ema_torch(x, window)
-    e2 = ema_torch(e1, window)
-    return 2.0 * e1 - e2
-
-
-def kama_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    fast_sc = 2.0 / (2.0 + 1.0)
-    slow_sc = 2.0 / (30.0 + 1.0)
-    M, T = x.shape
-    out = x.clone()
-    for t in range(window, T):
-        direction = (x[:, t] - x[:, t - window]).abs()
-        vol = x[:, t - window:t + 1].diff(dim=1).abs().nansum(dim=1)
-        er = torch.where(vol > 1e-10, direction / vol, torch.zeros_like(direction))
-        sc = (er * (fast_sc - slow_sc) + slow_sc) ** 2
-        prev = out[:, t - 1]
-        curr = x[:, t]
-        valid = ~torch.isnan(prev) & ~torch.isnan(curr)
-        out[valid, t] = prev[valid] + sc[valid] * (curr[valid] - prev[valid])
-    return out
-
-
-def hma_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    from factorminer.operators.timeseries import wma_torch
-    half = max(int(window / 2), 1)
-    sqrt_w = max(int(window ** 0.5), 1)
-    w1 = wma_torch(x, half)
-    w2 = wma_torch(x, window)
-    diff = 2.0 * w1 - w2
-    return wma_torch(diff, sqrt_w)
-
-
-# ===========================================================================
-# Registration table
-# ===========================================================================
-
-SMOOTHING_OPS = {
-    "EMA": (ema_np, ema_torch),
-    "DEMA": (dema_np, dema_torch),
-    "SMA": (sma_np, sma_torch),
-    "KAMA": (kama_np, kama_torch),
-    "HMA": (hma_np, hma_torch),
-}
diff --git a/src/factorminer/factorminer/operators/statistical.py b/src/factorminer/factorminer/operators/statistical.py
deleted file mode 100644
index d903889..0000000
--- a/src/factorminer/factorminer/operators/statistical.py
+++ /dev/null
@@ -1,452 +0,0 @@
-"""Rolling-window statistical operators.
-
-Each function operates along the **time** axis (axis=1) independently for
-every asset row.  Input shape: ``(M, T)`` -> output shape ``(M, T)``.
-The first ``(window - 1)`` values in each row are set to ``NaN``.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-try:
-    import torch
-    import torch.nn.functional as F
-except ImportError:
-    torch = None  # type: ignore[assignment]
-    F = None  # type: ignore[assignment]
-
-
-# ===========================================================================
-# Helpers
-# ===========================================================================
-
-def _rolling_np(x: np.ndarray, window: int):
-    """Yield views of shape (M, T-w+1, w) using stride tricks."""
-    M, T = x.shape
-    if T < window:
-        return None
-    strides = (x.strides[0], x.strides[1], x.strides[1])
-    shape = (M, T - window + 1, window)
-    return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)
-
-
-def _pad_front(result: np.ndarray, window: int, total_T: int) -> np.ndarray:
-    """Pad front of time axis with NaN to restore original length."""
-    M = result.shape[0]
-    pad_len = total_T - result.shape[1]
-    if pad_len > 0:
-        pad = np.full((M, pad_len), np.nan, dtype=result.dtype)
-        return np.concatenate([pad, result], axis=1)
-    return result
-
-
-def _unfold_torch(x: "torch.Tensor", window: int) -> "torch.Tensor":
-    """Unfold last dimension to get sliding windows: (M, T) -> (M, T-w+1, w)."""
-    return x.unfold(dimension=1, size=window, step=1)
-
-
-def _pad_front_torch(result: "torch.Tensor", window: int, total_T: int) -> "torch.Tensor":
-    M = result.shape[0]
-    pad_len = total_T - result.shape[1]
-    if pad_len > 0:
-        pad = torch.full((M, pad_len), float("nan"), device=result.device, dtype=result.dtype)
-        return torch.cat([pad, result], dim=1)
-    return result
-
-
-# ===========================================================================
-# NumPy implementations
-# ===========================================================================
-
-def mean_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanmean(w, axis=2)
-    return _pad_front(result, window, T)
-
-
-def std_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanstd(w, axis=2, ddof=1)
-    return _pad_front(result, window, T)
-
-
-def var_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanvar(w, axis=2, ddof=1)
-    return _pad_front(result, window, T)
-
-
-def skew_np(x: np.ndarray, window: int = 20) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    m = np.nanmean(w, axis=2, keepdims=True)
-    d = w - m
-    n = np.sum(~np.isnan(w), axis=2, keepdims=True).astype(np.float64)
-    m2 = np.nanmean(d ** 2, axis=2, keepdims=True)
-    m3 = np.nanmean(d ** 3, axis=2, keepdims=True)
-    with np.errstate(invalid="ignore", divide="ignore"):
-        sk = m3 / np.power(m2, 1.5)
-    result = sk.squeeze(2)
-    return _pad_front(result, window, T)
-
-
-def kurt_np(x: np.ndarray, window: int = 20) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    m = np.nanmean(w, axis=2, keepdims=True)
-    d = w - m
-    m2 = np.nanmean(d ** 2, axis=2, keepdims=True)
-    m4 = np.nanmean(d ** 4, axis=2, keepdims=True)
-    with np.errstate(invalid="ignore", divide="ignore"):
-        kt = m4 / np.power(m2, 2.0) - 3.0
-    result = kt.squeeze(2)
-    return _pad_front(result, window, T)
-
-
-def median_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanmedian(w, axis=2)
-    return _pad_front(result, window, T)
-
-
-def sum_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nansum(w, axis=2)
-    # If all NaN in a window, nansum returns 0; fix that
-    all_nan = np.all(np.isnan(w), axis=2)
-    result[all_nan] = np.nan
-    return _pad_front(result, window, T)
-
-
-def prod_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanprod(w, axis=2)
-    all_nan = np.all(np.isnan(w), axis=2)
-    result[all_nan] = np.nan
-    return _pad_front(result, window, T)
-
-
-def ts_max_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanmax(w, axis=2)
-    return _pad_front(result, window, T)
-
-
-def ts_min_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanmin(w, axis=2)
-    return _pad_front(result, window, T)
-
-
-def ts_argmax_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanargmax(w, axis=2).astype(np.float64)
-    return _pad_front(result, window, T)
-
-
-def ts_argmin_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanargmin(w, axis=2).astype(np.float64)
-    return _pad_front(result, window, T)
-
-
-def ts_rank_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    """Rolling percentile rank of the latest value within its window."""
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    latest = w[:, :, -1:]  # (M, T-w+1, 1)
-    count_less = np.nansum(w < latest, axis=2).astype(np.float64)
-    count_valid = np.sum(~np.isnan(w), axis=2).astype(np.float64)
-    with np.errstate(invalid="ignore", divide="ignore"):
-        result = count_less / (count_valid - 1.0)
-    result[count_valid <= 1] = np.nan
-    return _pad_front(result, window, T)
-
-
-def quantile_np(x: np.ndarray, window: int = 10, q: float = 0.5) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.nanquantile(w, q, axis=2)
-    return _pad_front(result, window, T)
-
-
-def count_nan_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.sum(np.isnan(w), axis=2).astype(np.float64)
-    return _pad_front(result, window, T)
-
-
-def count_not_nan_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    window = int(window)
-    M, T = x.shape
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    result = np.sum(~np.isnan(w), axis=2).astype(np.float64)
-    return _pad_front(result, window, T)
-
-
-# ===========================================================================
-# PyTorch (GPU) implementations
-# ===========================================================================
-
-def mean_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)  # (M, T-w+1, w)
-    result = w.nanmean(dim=2)
-    return _pad_front_torch(result, window, T)
-
-
-def std_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    m = w.nanmean(dim=2, keepdim=True)
-    d = w - m
-    not_nan = ~torch.isnan(w)
-    d = d.nan_to_num(0.0)
-    n = not_nan.sum(dim=2, keepdim=True).float()
-    var = (d ** 2).sum(dim=2, keepdim=True) / (n - 1).clamp(min=1)
-    result = var.sqrt().squeeze(2)
-    result[n.squeeze(2) < 2] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def var_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    m = w.nanmean(dim=2, keepdim=True)
-    d = w - m
-    not_nan = ~torch.isnan(w)
-    d = d.nan_to_num(0.0)
-    n = not_nan.sum(dim=2, keepdim=True).float()
-    result = ((d ** 2).sum(dim=2, keepdim=True) / (n - 1).clamp(min=1)).squeeze(2)
-    result[n.squeeze(2) < 2] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def skew_torch(x: "torch.Tensor", window: int = 20) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    m = w.nanmean(dim=2, keepdim=True)
-    d = (w - m).nan_to_num(0.0)
-    not_nan = ~torch.isnan(w)
-    n = not_nan.sum(dim=2, keepdim=True).float()
-    m2 = (d ** 2).sum(dim=2, keepdim=True) / n.clamp(min=1)
-    m3 = (d ** 3).sum(dim=2, keepdim=True) / n.clamp(min=1)
-    result = (m3 / m2.pow(1.5)).squeeze(2)
-    result[n.squeeze(2) < 3] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def kurt_torch(x: "torch.Tensor", window: int = 20) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    m = w.nanmean(dim=2, keepdim=True)
-    d = (w - m).nan_to_num(0.0)
-    not_nan = ~torch.isnan(w)
-    n = not_nan.sum(dim=2, keepdim=True).float()
-    m2 = (d ** 2).sum(dim=2, keepdim=True) / n.clamp(min=1)
-    m4 = (d ** 4).sum(dim=2, keepdim=True) / n.clamp(min=1)
-    result = (m4 / m2.pow(2.0) - 3.0).squeeze(2)
-    result[n.squeeze(2) < 4] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def median_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    result = w.nanmedian(dim=2).values
-    return _pad_front_torch(result, window, T)
-
-
-def sum_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    result = w.nansum(dim=2)
-    all_nan = torch.isnan(w).all(dim=2)
-    result[all_nan] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def prod_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    filled = w.nan_to_num(1.0)
-    result = filled.prod(dim=2)
-    all_nan = torch.isnan(w).all(dim=2)
-    result[all_nan] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def ts_max_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    filled = w.nan_to_num(float("-inf"))
-    result = filled.max(dim=2).values
-    all_nan = torch.isnan(w).all(dim=2)
-    result[all_nan] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def ts_min_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    filled = w.nan_to_num(float("inf"))
-    result = filled.min(dim=2).values
-    all_nan = torch.isnan(w).all(dim=2)
-    result[all_nan] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def ts_argmax_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    filled = w.nan_to_num(float("-inf"))
-    result = filled.argmax(dim=2).float()
-    return _pad_front_torch(result, window, T)
-
-
-def ts_argmin_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    filled = w.nan_to_num(float("inf"))
-    result = filled.argmin(dim=2).float()
-    return _pad_front_torch(result, window, T)
-
-
-def ts_rank_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    """Rolling percentile rank -- key GPU acceleration target (17x speedup)."""
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)  # (M, T-w+1, w)
-    latest = w[:, :, -1:]  # (M, T-w+1, 1)
-    not_nan = ~torch.isnan(w)
-    # Count values strictly less than latest (NaN-safe)
-    less = ((w < latest) & not_nan).sum(dim=2).float()
-    count_valid = not_nan.sum(dim=2).float()
-    result = less / (count_valid - 1).clamp(min=1)
-    result[count_valid <= 1] = float("nan")
-    return _pad_front_torch(result, window, T)
-
-
-def quantile_torch(x: "torch.Tensor", window: int = 10, q: float = 0.5) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    result = w.nanmedian(dim=2).values  # approximation; true quantile below
-    # Use sorting for proper quantile
-    sorted_w, _ = w.sort(dim=2)
-    n = (~torch.isnan(w)).sum(dim=2).float()
-    idx = ((n - 1) * q).long().clamp(min=0)
-    # Gather the quantile value
-    result = sorted_w.gather(2, idx.unsqueeze(2)).squeeze(2)
-    return _pad_front_torch(result, window, T)
-
-
-def count_nan_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    result = torch.isnan(w).sum(dim=2).float()
-    return _pad_front_torch(result, window, T)
-
-
-def count_not_nan_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    w = _unfold_torch(x, window)
-    result = (~torch.isnan(w)).sum(dim=2).float()
-    return _pad_front_torch(result, window, T)
-
-
-# ===========================================================================
-# Registration table
-# ===========================================================================
-
-STATISTICAL_OPS = {
-    "Mean": (mean_np, mean_torch),
-    "Std": (std_np, std_torch),
-    "Var": (var_np, var_torch),
-    "Skew": (skew_np, skew_torch),
-    "Kurt": (kurt_np, kurt_torch),
-    "Median": (median_np, median_torch),
-    "Sum": (sum_np, sum_torch),
-    "Prod": (prod_np, prod_torch),
-    "TsMax": (ts_max_np, ts_max_torch),
-    "TsMin": (ts_min_np, ts_min_torch),
-    "TsArgMax": (ts_argmax_np, ts_argmax_torch),
-    "TsArgMin": (ts_argmin_np, ts_argmin_torch),
-    "TsRank": (ts_rank_np, ts_rank_torch),
-    "Quantile": (quantile_np, quantile_torch),
-    "CountNaN": (count_nan_np, count_nan_torch),
-    "CountNotNaN": (count_not_nan_np, count_not_nan_torch),
-}
diff --git a/src/factorminer/factorminer/operators/timeseries.py b/src/factorminer/factorminer/operators/timeseries.py
deleted file mode 100644
index ced08c8..0000000
--- a/src/factorminer/factorminer/operators/timeseries.py
+++ /dev/null
@@ -1,395 +0,0 @@
-"""Time-series operators along the T axis for each asset row.
-
-Input shape: ``(M, T)`` -> output shape ``(M, T)``.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-try:
-    import torch
-except ImportError:
-    torch = None  # type: ignore[assignment]
-
-
-# ===========================================================================
-# NumPy implementations
-# ===========================================================================
-
-def delta_np(x: np.ndarray, window: int = 1) -> np.ndarray:
-    """x[t] - x[t - period]."""
-    window = int(window)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    if window < T:
-        out[:, window:] = x[:, window:] - x[:, :-window]
-    return out
-
-
-def delay_np(x: np.ndarray, window: int = 1) -> np.ndarray:
-    """x[t - period] (lag operator)."""
-    window = int(window)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    if window < T:
-        out[:, window:] = x[:, :-window]
-    return out
-
-
-def return_np(x: np.ndarray, window: int = 1) -> np.ndarray:
-    """x[t] / x[t-d] - 1."""
-    window = int(window)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    if window < T:
-        prev = x[:, :-window]
-        mask = np.abs(prev) > 1e-10
-        out_slice = np.full_like(prev, np.nan)
-        out_slice[mask] = x[:, window:][mask] / prev[mask] - 1.0
-        out[:, window:] = out_slice
-    return out
-
-
-def log_return_np(x: np.ndarray, window: int = 1) -> np.ndarray:
-    """log(x[t] / x[t-d])."""
-    window = int(window)
-    M, T = x.shape
-    out = np.full_like(x, np.nan, dtype=np.float64)
-    if window < T:
-        prev = x[:, :-window]
-        curr = x[:, window:]
-        with np.errstate(invalid="ignore", divide="ignore"):
-            ratio = np.where(np.abs(prev) > 1e-10, curr / prev, np.nan)
-            out[:, window:] = np.where(ratio > 0, np.log(ratio), np.nan)
-    return out
-
-
-def corr_np(x: np.ndarray, y: np.ndarray, window: int = 10) -> np.ndarray:
-    """Rolling Pearson correlation."""
-    window = int(window)
-    M, T = x.shape
-    if T < window:
-        return np.full_like(x, np.nan)
-
-    from factorminer.operators.statistical import _rolling_np, _pad_front
-
-    wx = _rolling_np(x, window)
-    wy = _rolling_np(y, window)
-    if wx is None or wy is None:
-        return np.full_like(x, np.nan)
-
-    mx = np.nanmean(wx, axis=2, keepdims=True)
-    my = np.nanmean(wy, axis=2, keepdims=True)
-    dx = wx - mx
-    dy = wy - my
-    with np.errstate(invalid="ignore", divide="ignore"):
-        cov = np.nanmean(dx * dy, axis=2)
-        sx = np.sqrt(np.nanmean(dx ** 2, axis=2))
-        sy = np.sqrt(np.nanmean(dy ** 2, axis=2))
-        result = np.where((sx > 1e-10) & (sy > 1e-10), cov / (sx * sy), np.nan)
-    return _pad_front(result, window, T)
-
-
-def cov_np(x: np.ndarray, y: np.ndarray, window: int = 10) -> np.ndarray:
-    """Rolling covariance."""
-    window = int(window)
-    M, T = x.shape
-    if T < window:
-        return np.full_like(x, np.nan)
-
-    from factorminer.operators.statistical import _rolling_np, _pad_front
-
-    wx = _rolling_np(x, window)
-    wy = _rolling_np(y, window)
-    if wx is None or wy is None:
-        return np.full_like(x, np.nan)
-
-    mx = np.nanmean(wx, axis=2, keepdims=True)
-    my = np.nanmean(wy, axis=2, keepdims=True)
-    result = np.nanmean((wx - mx) * (wy - my), axis=2)
-    return _pad_front(result, window, T)
-
-
-def beta_np(x: np.ndarray, y: np.ndarray, window: int = 10) -> np.ndarray:
-    """Rolling regression beta: slope of x regressed on y."""
-    window = int(window)
-    M, T = x.shape
-    if T < window:
-        return np.full_like(x, np.nan)
-
-    from factorminer.operators.statistical import _rolling_np, _pad_front
-
-    wx = _rolling_np(x, window)
-    wy = _rolling_np(y, window)
-    if wx is None or wy is None:
-        return np.full_like(x, np.nan)
-
-    my = np.nanmean(wy, axis=2, keepdims=True)
-    mx = np.nanmean(wx, axis=2, keepdims=True)
-    dy = wy - my
-    dx = wx - mx
-    with np.errstate(invalid="ignore", divide="ignore"):
-        var_y = np.nanmean(dy ** 2, axis=2)
-        cov_xy = np.nanmean(dx * dy, axis=2)
-        result = np.where(var_y > 1e-10, cov_xy / var_y, np.nan)
-    return _pad_front(result, window, T)
-
-
-def resid_np(x: np.ndarray, y: np.ndarray, window: int = 10) -> np.ndarray:
-    """Rolling regression residual: x - beta * y - alpha, evaluated at last point."""
-    window = int(window)
-    M, T = x.shape
-    if T < window:
-        return np.full_like(x, np.nan)
-
-    from factorminer.operators.statistical import _rolling_np, _pad_front
-
-    wx = _rolling_np(x, window)
-    wy = _rolling_np(y, window)
-    if wx is None or wy is None:
-        return np.full_like(x, np.nan)
-
-    mx = np.nanmean(wx, axis=2, keepdims=True)
-    my = np.nanmean(wy, axis=2, keepdims=True)
-    dx = wx - mx
-    dy = wy - my
-    with np.errstate(invalid="ignore", divide="ignore"):
-        var_y = np.nanmean(dy ** 2, axis=2, keepdims=True)
-        cov_xy = np.nanmean(dx * dy, axis=2, keepdims=True)
-        b = np.where(var_y > 1e-10, cov_xy / var_y, 0.0)
-        a = mx - b * my
-    # Residual at last time step in each window
-    result = (wx[:, :, -1:] - b * wy[:, :, -1:] - a).squeeze(2)
-    return _pad_front(result, window, T)
-
-
-def wma_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    """Linearly weighted moving average."""
-    window = int(window)
-    M, T = x.shape
-    from factorminer.operators.statistical import _rolling_np, _pad_front
-
-    w = _rolling_np(x, window)
-    if w is None:
-        return np.full_like(x, np.nan)
-    weights = np.arange(1, window + 1, dtype=np.float64)
-    weights = weights / weights.sum()
-    result = np.nansum(w * weights[np.newaxis, np.newaxis, :], axis=2)
-    return _pad_front(result, window, T)
-
-
-def decay_np(x: np.ndarray, window: int = 10) -> np.ndarray:
-    """Exponentially decaying sum (linearly decaying weighted average)."""
-    return wma_np(x, window)
-
-
-def cumsum_np(x: np.ndarray) -> np.ndarray:
-    return np.nancumsum(x, axis=1)
-
-
-def cumprod_np(x: np.ndarray) -> np.ndarray:
-    filled = np.where(np.isnan(x), 1.0, x)
-    return np.cumprod(filled, axis=1)
-
-
-def cummax_np(x: np.ndarray) -> np.ndarray:
-    out = np.copy(x)
-    for t in range(1, x.shape[1]):
-        out[:, t] = np.fmax(out[:, t - 1], x[:, t])
-    return out
-
-
-def cummin_np(x: np.ndarray) -> np.ndarray:
-    out = np.copy(x)
-    for t in range(1, x.shape[1]):
-        out[:, t] = np.fmin(out[:, t - 1], x[:, t])
-    return out
-
-
-# ===========================================================================
-# PyTorch implementations
-# ===========================================================================
-
-def delta_torch(x: "torch.Tensor", window: int = 1) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    out = torch.full_like(x, float("nan"))
-    if window < T:
-        out[:, window:] = x[:, window:] - x[:, :-window]
-    return out
-
-
-def delay_torch(x: "torch.Tensor", window: int = 1) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    out = torch.full_like(x, float("nan"))
-    if window < T:
-        out[:, window:] = x[:, :-window]
-    return out
-
-
-def return_torch(x: "torch.Tensor", window: int = 1) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    out = torch.full_like(x, float("nan"))
-    if window < T:
-        prev = x[:, :-window]
-        mask = prev.abs() > 1e-10
-        r = torch.full_like(prev, float("nan"))
-        r[mask] = x[:, window:][mask] / prev[mask] - 1.0
-        out[:, window:] = r
-    return out
-
-
-def log_return_torch(x: "torch.Tensor", window: int = 1) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    out = torch.full_like(x, float("nan"))
-    if window < T:
-        prev = x[:, :-window]
-        curr = x[:, window:]
-        mask = prev.abs() > 1e-10
-        ratio = torch.full_like(prev, float("nan"))
-        ratio[mask] = curr[mask] / prev[mask]
-        lr = torch.full_like(prev, float("nan"))
-        pos = ratio > 0
-        lr[pos] = ratio[pos].log()
-        out[:, window:] = lr
-    return out
-
-
-def corr_torch(x: "torch.Tensor", y: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    from factorminer.operators.statistical import _unfold_torch, _pad_front_torch
-
-    wx = _unfold_torch(x, window)
-    wy = _unfold_torch(y, window)
-    mx = wx.nanmean(dim=2, keepdim=True)
-    my = wy.nanmean(dim=2, keepdim=True)
-    dx = (wx - mx).nan_to_num(0.0)
-    dy = (wy - my).nan_to_num(0.0)
-    not_nan = ~(torch.isnan(wx) | torch.isnan(wy))
-    n = not_nan.sum(dim=2).float()
-    cov = (dx * dy * not_nan).sum(dim=2) / n.clamp(min=1)
-    sx = ((dx ** 2 * not_nan).sum(dim=2) / n.clamp(min=1)).sqrt()
-    sy = ((dy ** 2 * not_nan).sum(dim=2) / n.clamp(min=1)).sqrt()
-    result = torch.where((sx > 1e-10) & (sy > 1e-10), cov / (sx * sy),
-                         torch.tensor(float("nan"), device=x.device))
-    return _pad_front_torch(result, window, T)
-
-
-def cov_torch(x: "torch.Tensor", y: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    from factorminer.operators.statistical import _unfold_torch, _pad_front_torch
-
-    wx = _unfold_torch(x, window)
-    wy = _unfold_torch(y, window)
-    mx = wx.nanmean(dim=2, keepdim=True)
-    my = wy.nanmean(dim=2, keepdim=True)
-    dx = (wx - mx).nan_to_num(0.0)
-    dy = (wy - my).nan_to_num(0.0)
-    not_nan = ~(torch.isnan(wx) | torch.isnan(wy))
-    n = not_nan.sum(dim=2).float()
-    result = (dx * dy * not_nan).sum(dim=2) / n.clamp(min=1)
-    return _pad_front_torch(result, window, T)
-
-
-def beta_torch(x: "torch.Tensor", y: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    from factorminer.operators.statistical import _unfold_torch, _pad_front_torch
-
-    wx = _unfold_torch(x, window)
-    wy = _unfold_torch(y, window)
-    mx = wx.nanmean(dim=2, keepdim=True)
-    my = wy.nanmean(dim=2, keepdim=True)
-    dx = (wx - mx).nan_to_num(0.0)
-    dy = (wy - my).nan_to_num(0.0)
-    not_nan = ~(torch.isnan(wx) | torch.isnan(wy))
-    n = not_nan.sum(dim=2).float()
-    var_y = (dy ** 2 * not_nan).sum(dim=2) / n.clamp(min=1)
-    cov_xy = (dx * dy * not_nan).sum(dim=2) / n.clamp(min=1)
-    result = torch.where(var_y > 1e-10, cov_xy / var_y,
-                         torch.tensor(float("nan"), device=x.device))
-    return _pad_front_torch(result, window, T)
-
-
-def resid_torch(x: "torch.Tensor", y: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    from factorminer.operators.statistical import _unfold_torch, _pad_front_torch
-
-    wx = _unfold_torch(x, window)
-    wy = _unfold_torch(y, window)
-    mx = wx.nanmean(dim=2, keepdim=True)
-    my = wy.nanmean(dim=2, keepdim=True)
-    dx = (wx - mx).nan_to_num(0.0)
-    dy = (wy - my).nan_to_num(0.0)
-    not_nan = ~(torch.isnan(wx) | torch.isnan(wy))
-    n = not_nan.sum(dim=2, keepdim=True).float()
-    var_y = (dy ** 2 * not_nan).sum(dim=2, keepdim=True) / n.clamp(min=1)
-    cov_xy = (dx * dy * not_nan).sum(dim=2, keepdim=True) / n.clamp(min=1)
-    b = torch.where(var_y > 1e-10, cov_xy / var_y, torch.zeros_like(var_y))
-    a = mx - b * my
-    result = (wx[:, :, -1:] - b * wy[:, :, -1:] - a).squeeze(2)
-    return _pad_front_torch(result, window, T)
-
-
-def wma_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    window = int(window)
-    M, T = x.shape
-    from factorminer.operators.statistical import _unfold_torch, _pad_front_torch
-
-    w = _unfold_torch(x, window)
-    weights = torch.arange(1, window + 1, dtype=x.dtype, device=x.device).float()
-    weights = weights / weights.sum()
-    filled = w.nan_to_num(0.0)
-    result = (filled * weights.unsqueeze(0).unsqueeze(0)).sum(dim=2)
-    return _pad_front_torch(result, window, T)
-
-
-def decay_torch(x: "torch.Tensor", window: int = 10) -> "torch.Tensor":
-    return wma_torch(x, window)
-
-
-def cumsum_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x.nan_to_num(0.0).cumsum(dim=1)
-
-
-def cumprod_torch(x: "torch.Tensor") -> "torch.Tensor":
-    return x.nan_to_num(1.0).cumprod(dim=1)
-
-
-def cummax_torch(x: "torch.Tensor") -> "torch.Tensor":
-    filled = x.nan_to_num(float("-inf"))
-    return filled.cummax(dim=1).values
-
-
-def cummin_torch(x: "torch.Tensor") -> "torch.Tensor":
-    filled = x.nan_to_num(float("inf"))
-    return filled.cummin(dim=1).values
-
-
-# ===========================================================================
-# Registration table
-# ===========================================================================
-
-TIMESERIES_OPS = {
-    "Delta": (delta_np, delta_torch),
-    "Delay": (delay_np, delay_torch),
-    "Return": (return_np, return_torch),
-    "LogReturn": (log_return_np, log_return_torch),
-    "Corr": (corr_np, corr_torch),
-    "Cov": (cov_np, cov_torch),
-    "Beta": (beta_np, beta_torch),
-    "Resid": (resid_np, resid_torch),
-    "WMA": (wma_np, wma_torch),
-    "Decay": (decay_np, decay_torch),
-    "CumSum": (cumsum_np, cumsum_torch),
-    "CumProd": (cumprod_np, cumprod_torch),
-    "CumMax": (cummax_np, cummax_torch),
-    "CumMin": (cummin_np, cummin_torch),
-}
diff --git a/src/factorminer/factorminer/tests/__init__.py b/src/factorminer/factorminer/tests/__init__.py
deleted file mode 100644
index b66dd9f..0000000
--- a/src/factorminer/factorminer/tests/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Test suite for FactorMiner."""
diff --git a/src/factorminer/factorminer/tests/conftest.py b/src/factorminer/factorminer/tests/conftest.py
deleted file mode 100644
index 2a6791a..0000000
--- a/src/factorminer/factorminer/tests/conftest.py
+++ /dev/null
@@ -1,163 +0,0 @@
-"""Shared pytest fixtures for FactorMiner test suite."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.memory.experience_memory import ExperienceMemoryManager
-
-
-# ---------------------------------------------------------------------------
-# Mock data fixtures
-# ---------------------------------------------------------------------------
-
-@pytest.fixture
-def rng():
-    """Seeded random generator for reproducibility."""
-    return np.random.default_rng(42)
-
-
-@pytest.fixture
-def small_data(rng):
-    """Small (M=10, T=50) synthetic dataset dict mapping feature names to arrays."""
-    M, T = 10, 50
-    close = 100.0 + np.cumsum(rng.normal(0, 0.5, (M, T)), axis=1)
-    open_ = close + rng.normal(0, 0.1, (M, T))
-    high = np.maximum(close, open_) + np.abs(rng.normal(0, 0.2, (M, T)))
-    low = np.minimum(close, open_) - np.abs(rng.normal(0, 0.2, (M, T)))
-    low = np.maximum(low, 1.0)
-    volume = np.abs(rng.normal(1e6, 1e5, (M, T)))
-    vwap = (high + low + close) / 3
-    amt = volume * vwap
-    returns = np.zeros((M, T))
-    returns[:, 1:] = np.diff(close, axis=1) / close[:, :-1]
-
-    return {
-        "$open": open_,
-        "$high": high,
-        "$low": low,
-        "$close": close,
-        "$volume": volume,
-        "$amt": amt,
-        "$vwap": vwap,
-        "$returns": returns,
-    }
-
-
-@pytest.fixture
-def medium_data(rng):
-    """Medium (M=20, T=100) synthetic dataset for evaluation tests."""
-    M, T = 20, 100
-    close = 50.0 + np.cumsum(rng.normal(0, 0.3, (M, T)), axis=1)
-    open_ = close + rng.normal(0, 0.05, (M, T))
-    high = np.maximum(close, open_) + np.abs(rng.normal(0, 0.1, (M, T)))
-    low = np.minimum(close, open_) - np.abs(rng.normal(0, 0.1, (M, T)))
-    low = np.maximum(low, 1.0)
-    volume = np.abs(rng.normal(1e6, 1e5, (M, T)))
-    vwap = (high + low + close) / 3
-    amt = volume * vwap
-    returns = np.zeros((M, T))
-    returns[:, 1:] = np.diff(close, axis=1) / close[:, :-1]
-
-    return {
-        "$open": open_,
-        "$high": high,
-        "$low": low,
-        "$close": close,
-        "$volume": volume,
-        "$amt": amt,
-        "$vwap": vwap,
-        "$returns": returns,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Library fixtures
-# ---------------------------------------------------------------------------
-
-@pytest.fixture
-def mock_library(rng):
-    """Small FactorLibrary pre-loaded with 3 known factors."""
-    lib = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-
-    M, T = 20, 60
-    for i in range(3):
-        signals = rng.normal(0, 1, (M, T))
-        factor = Factor(
-            id=0,
-            name=f"test_factor_{i}",
-            formula=f"Neg($close)" if i == 0 else f"CsRank(Mean($close, {10 + i * 5}))",
-            category="test",
-            ic_mean=0.05 + i * 0.01,
-            icir=0.8 + i * 0.1,
-            ic_win_rate=0.55 + i * 0.05,
-            max_correlation=0.1 * i,
-            batch_number=1,
-            signals=signals,
-        )
-        lib.admit_factor(factor)
-
-    return lib
-
-
-# ---------------------------------------------------------------------------
-# Memory fixtures
-# ---------------------------------------------------------------------------
-
-@pytest.fixture
-def mock_memory():
-    """ExperienceMemoryManager with default patterns initialized."""
-    return ExperienceMemoryManager(
-        max_success_patterns=20,
-        max_failure_patterns=30,
-        max_insights=15,
-    )
-
-
-@pytest.fixture
-def sample_trajectory():
-    """Sample batch trajectory for memory update tests."""
-    return [
-        {
-            "formula": "CsRank(Corr($close, $volume, 20))",
-            "factor_id": "f001",
-            "ic": 0.08,
-            "icir": 1.2,
-            "max_correlation": 0.15,
-            "correlated_with": "",
-            "admitted": True,
-            "rejection_reason": "",
-        },
-        {
-            "formula": "Neg(Div(Sub($close, $vwap), $vwap))",
-            "factor_id": "f002",
-            "ic": 0.06,
-            "icir": 0.9,
-            "max_correlation": 0.65,
-            "correlated_with": "existing_factor_3",
-            "admitted": False,
-            "rejection_reason": "Max correlation 0.65 >= threshold 0.5",
-        },
-        {
-            "formula": "IfElse(Skew($close, 20), CsRank($returns), Neg($returns))",
-            "factor_id": "f003",
-            "ic": 0.10,
-            "icir": 1.5,
-            "max_correlation": 0.20,
-            "correlated_with": "",
-            "admitted": True,
-            "rejection_reason": "",
-        },
-        {
-            "formula": "CsZScore(Std($returns, 10))",
-            "factor_id": "f004",
-            "ic": 0.03,
-            "icir": 0.4,
-            "max_correlation": 0.70,
-            "correlated_with": "existing_factor_1",
-            "admitted": False,
-            "rejection_reason": "IC 0.03 below threshold 0.04",
-        },
-    ]
diff --git a/src/factorminer/factorminer/tests/test_auto_inventor.py b/src/factorminer/factorminer/tests/test_auto_inventor.py
deleted file mode 100644
index 76dead4..0000000
--- a/src/factorminer/factorminer/tests/test_auto_inventor.py
+++ /dev/null
@@ -1,130 +0,0 @@
-"""Tests for auto-operator invention (operators/auto_inventor.py)."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.operators.auto_inventor import (
-    OperatorInventor,
-    ProposedOperator,
-    _BLOCKED_TOKENS,
-)
-from src.factorminer.factorminer.operators.custom import CustomOperatorStore
-
-
-# -----------------------------------------------------------------------
-# _compile_safely: valid numpy code -> callable
-# -----------------------------------------------------------------------
-
-def test_compile_safely_valid_code():
-    """Valid numpy code defining compute() should return a callable."""
-    code = "def compute(x):\n    return np.nanmean(x, axis=1, keepdims=True) * np.ones_like(x)"
-
-    # Use OperatorInventor._compile_safely as a static-like test
-    data = np.random.default_rng(42).normal(0, 1, (10, 50))
-    inventor = OperatorInventor(
-        llm_provider=_mock_provider(),
-        data_tensor=data.reshape(10, 50, 1),
-        returns=data,
-    )
-    fn = inventor._compile_safely(code)
-    assert fn is not None
-    assert callable(fn)
-    result = fn(data)
-    assert isinstance(result, np.ndarray)
-
-
-# -----------------------------------------------------------------------
-# _compile_safely: os.system -> returns None (SECURITY)
-# -----------------------------------------------------------------------
-
-def test_compile_safely_blocks_os_system():
-    """Code containing os.system should be blocked."""
-    code = "import os\ndef compute(x):\n    os.system('echo hacked')\n    return x"
-    inventor = _make_inventor()
-    fn = inventor._compile_safely(code)
-    assert fn is None
-
-
-# -----------------------------------------------------------------------
-# _compile_safely: import os -> returns None (SECURITY)
-# -----------------------------------------------------------------------
-
-def test_compile_safely_blocks_import_os():
-    """Code with 'import ' token should be blocked."""
-    code = "import os\ndef compute(x):\n    return x"
-    inventor = _make_inventor()
-    fn = inventor._compile_safely(code)
-    assert fn is None
-
-
-def test_compile_safely_blocks_eval():
-    """Code with eval() should be blocked."""
-    code = "def compute(x):\n    return eval('x + 1')"
-    inventor = _make_inventor()
-    fn = inventor._compile_safely(code)
-    assert fn is None
-
-
-# -----------------------------------------------------------------------
-# CustomOperatorStore: register and list
-# -----------------------------------------------------------------------
-
-def test_custom_operator_store_register_and_list(tmp_path):
-    store = CustomOperatorStore(store_dir=str(tmp_path / "ops"))
-
-    from factorminer.core.types import OperatorSpec, OperatorType, SignatureType
-    spec = OperatorSpec(
-        name="TestOp",
-        arity=1,
-        category=OperatorType.AUTO_INVENTED,
-        signature=SignatureType.ELEMENT_WISE,
-        description="test operator",
-    )
-    from factorminer.operators.custom import CustomOperator
-    op = CustomOperator(
-        name="TestOp",
-        spec=spec,
-        numpy_code="def compute(x): return x * 2",
-        numpy_fn=lambda x: x * 2,
-        validation_ic=0.05,
-    )
-    store.register(op)
-    assert "TestOp" in store.list_operators()
-    assert store.get_operator("TestOp") is not None
-
-
-# -----------------------------------------------------------------------
-# ProposedOperator dataclass
-# -----------------------------------------------------------------------
-
-def test_proposed_operator_dataclass():
-    op = ProposedOperator(
-        name="TestOp",
-        arity=1,
-        description="A test operator",
-        numpy_code="def compute(x): return x",
-    )
-    assert op.name == "TestOp"
-    assert op.arity == 1
-    assert op.param_names == ()
-    assert op.based_on == []
-
-
-# -----------------------------------------------------------------------
-# Helpers
-# -----------------------------------------------------------------------
-
-def _mock_provider():
-    from factorminer.agent.llm_interface import MockProvider
-    return MockProvider()
-
-
-def _make_inventor():
-    data = np.random.default_rng(42).normal(0, 1, (10, 50))
-    return OperatorInventor(
-        llm_provider=_mock_provider(),
-        data_tensor=data.reshape(10, 50, 1),
-        returns=data,
-    )
diff --git a/src/factorminer/factorminer/tests/test_benchmark.py b/src/factorminer/factorminer/tests/test_benchmark.py
deleted file mode 100644
index f6cd1a8..0000000
--- a/src/factorminer/factorminer/tests/test_benchmark.py
+++ /dev/null
@@ -1,484 +0,0 @@
-"""Benchmark-runtime and CLI coverage."""
-
-from __future__ import annotations
-
-import json
-from types import SimpleNamespace
-
-from click.testing import CliRunner
-import numpy as np
-
-from src.factorminer.factorminer.benchmark.runtime import (
-    build_benchmark_library,
-    run_table1_benchmark,
-    select_frozen_top_k,
-)
-from src.factorminer.factorminer.benchmark.helix_benchmark import StatisticalComparisonTests, _json_safe
-from src.factorminer.factorminer.cli import main
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.core.library_io import save_library
-from src.factorminer.factorminer.core.session import MiningSession
-from src.factorminer.factorminer.evaluation.runtime import FactorEvaluationArtifact
-from src.factorminer.factorminer.utils.config import load_config
-from run_phase2_benchmark import (
-    _build_phase2_manifest,
-    _collect_runtime_manifest_refs,
-    _generate_markdown_report,
-    _write_markdown_table,
-)
-
-
-def _artifact(
-    factor_id: int,
-    formula: str,
-    train_ic: float,
-    train_icir: float,
-    signal_scale: float,
-) -> FactorEvaluationArtifact:
-    signal = np.array(
-        [
-            [1.0, 2.0, 3.0],
-            [2.0, 1.0, 0.0],
-            [0.5, 0.3, 0.1],
-        ],
-        dtype=np.float64,
-    ) * signal_scale
-    return FactorEvaluationArtifact(
-        factor_id=factor_id,
-        name=f"factor_{factor_id}",
-        formula=formula,
-        category="test",
-        parse_ok=True,
-        signals_full=signal,
-        split_signals={"train": signal, "test": signal, "full": signal},
-        split_stats={
-            "train": {
-                "ic_mean": train_ic,
-                "ic_abs_mean": abs(train_ic),
-                "icir": train_icir,
-                "ic_win_rate": 0.6,
-            },
-            "test": {
-                "ic_mean": train_ic / 2.0,
-                "ic_abs_mean": abs(train_ic / 2.0),
-                "icir": train_icir / 2.0,
-                "ic_win_rate": 0.5,
-            },
-            "full": {
-                "ic_mean": train_ic,
-                "ic_abs_mean": abs(train_ic),
-                "icir": train_icir,
-                "ic_win_rate": 0.6,
-            },
-        },
-    )
-
-
-def test_select_frozen_top_k_prefers_thresholded_admitted_then_fills():
-    cfg = load_config()
-    artifacts = [
-        _artifact(1, "Neg($close)", 0.07, 0.8, 1.0),
-        _artifact(2, "Neg($open)", 0.06, 0.7, 0.7),
-        _artifact(3, "Neg($high)", 0.049, 0.9, 0.2),
-    ]
-    library, _ = build_benchmark_library(artifacts, cfg, split_name="train")
-
-    frozen = select_frozen_top_k(
-        artifacts,
-        library,
-        top_k=3,
-        split_name="train",
-        min_ic=0.05,
-        min_icir=0.5,
-    )
-
-    assert [artifact.formula for artifact in frozen[:2]] == ["Neg($close)", "Neg($open)"]
-    assert frozen[2].formula == "Neg($high)"
-
-
-def test_build_benchmark_library_rejects_low_ic_candidates():
-    cfg = load_config()
-    artifacts = [
-        _artifact(1, "Neg($close)", 0.07, 0.8, 1.0),
-        _artifact(2, "Neg($open)", 0.01, 0.6, 0.9),
-    ]
-
-    library, stats = build_benchmark_library(artifacts, cfg, split_name="train")
-
-    assert library.size == 1
-    assert stats["threshold_rejections"] == 1
-    assert stats["admitted"] == 1
-
-
-def test_benchmark_table1_cli_invokes_runtime(monkeypatch, tmp_path):
-    captured = {}
-
-    def _fake_run(*args, **kwargs):
-        captured["called"] = True
-        return {
-            "factor_miner": {
-                "freeze_library_size": 12,
-                "frozen_top_k": [{"name": "f1"}],
-                "universes": {
-                    "CSI500": {
-                        "library": {"ic": 0.08, "icir": 0.9, "avg_abs_rho": 0.2}
-                    }
-                },
-            }
-        }
-
-    monkeypatch.setattr("src.factorminer.factorminer.benchmark.runtime.run_table1_benchmark", _fake_run)
-
-    runner = CliRunner()
-    result = runner.invoke(
-        main,
-        [
-            "--cpu",
-            "--output-dir",
-            str(tmp_path / "out"),
-            "benchmark",
-            "table1",
-            "--mock",
-        ],
-    )
-
-    assert result.exit_code == 0, result.output
-    assert captured.get("called") is True
-    assert "Benchmark Table 1" in result.output
-    assert "Baseline: factor_miner" in result.output
-    assert "CSI500: library IC=0.0800" in result.output
-
-
-def test_table1_manifest_includes_saved_library_provenance(monkeypatch, tmp_path):
-    saved_root = tmp_path / "saved"
-    library_base = saved_root / "factor_miner_library"
-
-    library = FactorLibrary()
-    factor = Factor(
-        id=0,
-        name="saved_factor",
-        formula="Neg($close)",
-        category="test",
-        ic_mean=0.07,
-        icir=0.8,
-        ic_win_rate=0.6,
-        max_correlation=0.1,
-        batch_number=1,
-        signals=np.array(
-            [
-                [1.0, 2.0, 3.0],
-                [0.5, 0.4, 0.3],
-                [0.2, 0.3, 0.4],
-            ],
-            dtype=np.float64,
-        ),
-    )
-    library.admit_factor(factor)
-    save_library(library, library_base)
-
-    session = MiningSession(
-        session_id="session-001",
-        output_dir=str(saved_root),
-        library_path=str(library_base),
-    )
-    session.record_iteration({"candidates": 3, "admitted": 1, "replaced": 0, "library_size": 1})
-    session.record_iteration({"candidates": 2, "admitted": 1, "replaced": 0, "library_size": 1})
-    session.finalize()
-    session.save(saved_root / "session.json")
-    with open(saved_root / "session_log.json", "w") as fp:
-        json.dump({"summary": session.get_summary(), "iterations": session.iterations}, fp)
-
-    cfg = load_config()
-    output_dir = tmp_path / "results"
-    artifact = _artifact(1, "Neg($close)", 0.07, 0.8, 1.0)
-
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime.load_benchmark_dataset",
-        lambda *args, **kwargs: (SimpleNamespace(), "freeze-hash"),
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime.evaluate_factors",
-        lambda *args, **kwargs: [artifact],
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime.evaluate_frozen_set",
-        lambda frozen, dataset, **kwargs: {
-            "factor_count": len(frozen),
-            "library": {"ic": 0.1, "icir": 1.0, "avg_abs_rho": 0.2},
-            "combinations": {},
-            "selections": {},
-        },
-    )
-
-    run_table1_benchmark(
-        cfg,
-        output_dir,
-        baseline_names=["factor_miner"],
-        factor_miner_library_path=str(library_base),
-    )
-
-    result_path = output_dir / "benchmark" / "table1" / "factor_miner.json"
-    manifest_path = output_dir / "benchmark" / "table1" / "factor_miner_manifest.json"
-    result = json.loads(result_path.read_text())
-    manifest = json.loads(manifest_path.read_text())
-
-    provenance = manifest["baseline_provenance"]["factor_miner"]
-    assert provenance["kind"] == "saved_library"
-    assert provenance["library_summary"]["factor_count"] == 1
-    assert provenance["session_summary"]["total_iterations"] == 2
-    assert provenance["source_files"]["library_json"]["path"].endswith("factor_miner_library.json")
-    assert provenance["source_files"]["signal_cache"]["path"].endswith("factor_miner_library_signals.npz")
-    assert manifest["artifact_paths"]["result"] == str(result_path)
-    assert manifest["artifact_paths"]["manifest"] == str(manifest_path)
-    assert result["provenance"]["kind"] == "saved_library"
-
-
-def test_phase2_manifest_references_runtime_manifest_and_sanitizes_stats(tmp_path):
-    runtime_root = tmp_path / "runtime"
-    manifest_path = runtime_root / "benchmark" / "table1" / "factor_miner_manifest.json"
-    manifest_path.parent.mkdir(parents=True, exist_ok=True)
-    manifest_path.write_text(
-        json.dumps(
-            {
-                "benchmark_name": "table1",
-                "baseline": "factor_miner",
-                "mode": "paper",
-                "artifact_paths": {"result": "result.json", "manifest": str(manifest_path)},
-                "baseline_provenance": {
-                    "factor_miner": {
-                        "kind": "saved_library",
-                        "source": "factor_miner",
-                    }
-                },
-            }
-        )
-    )
-
-    refs = _collect_runtime_manifest_refs(runtime_root)
-    assert len(refs) == 1
-    assert refs[0]["path"] == str(manifest_path)
-    assert refs[0]["baseline_provenance"]["factor_miner"]["kind"] == "saved_library"
-
-    phase2_manifest = _build_phase2_manifest(
-        output_dir=tmp_path / "phase2",
-        methods=["ralph_loop", "helix_phase2"],
-        seed=7,
-        n_factors=40,
-        mock=True,
-        data_path=None,
-        full_ablation=False,
-        skip_ablation=True,
-        artifact_paths={"html_report": str(tmp_path / "phase2" / "benchmark_report.html")},
-        statistical_tests={
-            "diebold_mariano": {"dm_stat": np.nan, "p_value": np.inf},
-            "bootstrap_ci_95": {"lower": -np.inf, "upper": np.nan},
-        },
-        ablation_configs=["full"],
-        runtime_manifest_root=runtime_root,
-    )
-
-    assert phase2_manifest["runtime_manifest_refs"][0]["path"] == str(manifest_path)
-    assert phase2_manifest["statistical_tests"]["diebold_mariano"]["dm_stat"] is None
-    assert phase2_manifest["statistical_tests"]["diebold_mariano"]["p_value"] is None
-    assert phase2_manifest["statistical_tests"]["bootstrap_ci_95"]["lower"] is None
-    dumped = json.dumps(_json_safe(phase2_manifest), allow_nan=False)
-    assert "NaN" not in dumped
-
-
-def test_diebold_mariano_handles_identical_series_without_nan_direction():
-    tests = StatisticalComparisonTests(seed=42)
-    series = np.array([0.05, 0.05, 0.05, 0.05, 0.05], dtype=np.float64)
-
-    result = tests.diebold_mariano_test(series, series.copy())
-
-    assert result.direction == "no_difference"
-    assert result.p_value == 1.0
-    assert np.isfinite(result.dm_statistic)
-
-
-def test_json_safe_removes_non_finite_values():
-    payload = {
-        "finite": 1.5,
-        "nan": float("nan"),
-        "nested": [np.float64(np.inf), {"value": -np.inf}],
-    }
-
-    cleaned = _json_safe(payload)
-
-    assert cleaned == {"finite": 1.5, "nan": None, "nested": [None, {"value": None}]}
-    dumped = json.dumps(cleaned, allow_nan=False)
-    assert "NaN" not in dumped
-
-
-def test_markdown_artifacts_use_expected_paths(tmp_path):
-    table_stub = SimpleNamespace(
-        to_markdown=lambda **kwargs: "| a | b |\n|---|---|\n| 1 | 2 |\n"
-    )
-    bench_result = SimpleNamespace(
-        factor_library_metrics=table_stub,
-        combination_metrics=table_stub,
-        selection_metrics=table_stub,
-        speed_metrics=table_stub,
-        statistical_tests={"diebold_mariano": {"dm_stat": 0.0, "p_value": 1.0}},
-        to_markdown_table=lambda: "| a | b |\n|---|---|\n| 1 | 2 |\n",
-    )
-
-    table_path = _write_markdown_table(bench_result, tmp_path)
-    report_path = _generate_markdown_report(bench_result, None, tmp_path)
-
-    assert table_path.endswith("benchmark_report.md")
-    assert report_path.endswith("benchmark_report_full.md")
-    assert (tmp_path / "benchmark_report.md").exists()
-    assert (tmp_path / "benchmark_report_full.md").exists()
-
-
-def _runtime_dataset_stub():
-    data_tensor = np.ones((2, 6, 8), dtype=np.float64)
-    returns = np.array(
-        [
-            [0.01, 0.02, 0.01, 0.03, 0.02, 0.01],
-            [0.02, 0.01, 0.03, 0.02, 0.01, 0.02],
-        ],
-        dtype=np.float64,
-    )
-    splits = {
-        "train": SimpleNamespace(
-            indices=np.array([0, 1, 2]),
-            returns=returns[:, :3],
-            timestamps=np.arange(3),
-        ),
-        "test": SimpleNamespace(
-            indices=np.array([3, 4, 5]),
-            returns=returns[:, 3:],
-            timestamps=np.arange(3, 6),
-        ),
-        "full": SimpleNamespace(
-            indices=np.arange(6),
-            returns=returns,
-            timestamps=np.arange(6),
-        ),
-    }
-
-    return SimpleNamespace(
-        data_tensor=data_tensor,
-        returns=returns,
-        data_dict={
-            "$open": data_tensor[:, :, 0],
-            "$high": data_tensor[:, :, 1],
-            "$low": data_tensor[:, :, 2],
-            "$close": data_tensor[:, :, 3],
-            "$volume": data_tensor[:, :, 4],
-            "$amt": data_tensor[:, :, 5],
-            "$vwap": data_tensor[:, :, 6],
-            "$returns": data_tensor[:, :, 7],
-        },
-        target_panels={"paper": returns},
-        target_specs={"paper": SimpleNamespace(holding_bars=1)},
-        get_split=lambda name: splits[name],
-    )
-
-
-def _single_factor_library():
-    library = FactorLibrary()
-    library.admit_factor(
-        Factor(
-            id=0,
-            name="runtime_factor",
-            formula="Neg($close)",
-            category="test",
-            ic_mean=0.08,
-            icir=0.9,
-            ic_win_rate=0.6,
-            max_correlation=0.0,
-            batch_number=1,
-            signals=np.ones((2, 3), dtype=np.float64),
-        )
-    )
-    return library
-
-
-def test_table1_runtime_methods_instantiate_live_loops(monkeypatch, tmp_path):
-    cfg = load_config()
-    calls = []
-
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime.load_benchmark_dataset",
-        lambda *args, **kwargs: (_runtime_dataset_stub(), "dataset-hash"),
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime._get_baseline_entries",
-        lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("catalog fallback used")),
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime.evaluate_factors",
-        lambda *args, **kwargs: [_artifact(1, "Neg($close)", 0.08, 0.9, 1.0)],
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime.evaluate_frozen_set",
-        lambda frozen, dataset, **kwargs: {
-            "factor_count": len(frozen),
-            "library": {"ic": 0.1, "icir": 1.0, "avg_abs_rho": 0.2},
-            "combinations": {
-                "equal_weight": {"ic": 0.12, "icir": 1.1, "turnover": 0.3},
-                "ic_weighted": {"ic": 0.13, "icir": 1.2, "turnover": 0.25},
-            },
-            "selections": {"lasso": {"ic": 0.09, "icir": 0.8}},
-        },
-    )
-
-    def _fake_ralph_run(self, *args, **kwargs):
-        calls.append("ralph")
-        return _single_factor_library()
-
-    def _fake_helix_run(self, *args, **kwargs):
-        calls.append("helix")
-        return _single_factor_library()
-
-    monkeypatch.setattr("src.factorminer.factorminer.core.ralph_loop.RalphLoop.run", _fake_ralph_run)
-    monkeypatch.setattr("src.factorminer.factorminer.core.helix_loop.HelixLoop.run", _fake_helix_run)
-
-    payload = run_table1_benchmark(
-        cfg,
-        tmp_path,
-        mock=True,
-        baseline_names=["ralph_loop", "helix_phase2"],
-        use_runtime_loops=True,
-    )
-
-    assert calls == ["ralph", "helix"]
-    assert payload["ralph_loop"]["provenance"]["kind"] == "runtime_loop"
-    assert payload["helix_phase2"]["provenance"]["kind"] == "runtime_loop"
-
-
-def test_table1_runtime_methods_fail_loudly_without_catalog_fallback(monkeypatch, tmp_path):
-    cfg = load_config()
-    fallback_called = {"value": False}
-
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime.load_benchmark_dataset",
-        lambda *args, **kwargs: (_runtime_dataset_stub(), "dataset-hash"),
-    )
-
-    def _forbidden_catalog(*args, **kwargs):
-        fallback_called["value"] = True
-        raise AssertionError("catalog fallback used")
-
-    monkeypatch.setattr("src.factorminer.factorminer.benchmark.runtime._get_baseline_entries", _forbidden_catalog)
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.benchmark.runtime._run_runtime_mining_loop",
-        lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("runtime loop failed")),
-    )
-
-    try:
-        run_table1_benchmark(
-            cfg,
-            tmp_path,
-            mock=True,
-            baseline_names=["ralph_loop"],
-            use_runtime_loops=True,
-        )
-        assert False, "expected runtime loop failure"
-    except RuntimeError as exc:
-        assert "runtime loop failed" in str(exc)
-
-    assert fallback_called["value"] is False
diff --git a/src/factorminer/factorminer/tests/test_canonicalizer.py b/src/factorminer/factorminer/tests/test_canonicalizer.py
deleted file mode 100644
index d680b2b..0000000
--- a/src/factorminer/factorminer/tests/test_canonicalizer.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""Tests for the SymPy-based formula canonicalizer (core/canonicalizer.py)."""
-
-from __future__ import annotations
-
-import time
-
-import pytest
-
-from src.factorminer.factorminer.core.canonicalizer import FormulaCanonicalizer
-from src.factorminer.factorminer.core.parser import parse
-
-
-@pytest.fixture
-def canon():
-    return FormulaCanonicalizer()
-
-
-# -----------------------------------------------------------------------
-# Double negation: Neg(Neg($close)) == $close
-# -----------------------------------------------------------------------
-
-def test_double_negation(canon):
-    tree_a = parse("Neg(Neg($close))")
-    tree_b = parse("$close")
-    assert canon.is_duplicate(tree_a, tree_b)
-
-
-# -----------------------------------------------------------------------
-# Commutativity: Add($close, $open) == Add($open, $close)
-# -----------------------------------------------------------------------
-
-def test_commutativity_add(canon):
-    tree_a = parse("Add($close, $open)")
-    tree_b = parse("Add($open, $close)")
-    assert canon.is_duplicate(tree_a, tree_b)
-
-
-# -----------------------------------------------------------------------
-# Non-algebraic preserved: CsRank(Neg($close)) != Neg(CsRank($close))
-# -----------------------------------------------------------------------
-
-def test_non_algebraic_not_simplified(canon):
-    tree_a = parse("CsRank(Neg($close))")
-    tree_b = parse("Neg(CsRank($close))")
-    assert not canon.is_duplicate(tree_a, tree_b)
-
-
-# -----------------------------------------------------------------------
-# is_duplicate method
-# -----------------------------------------------------------------------
-
-def test_is_duplicate_same_formula(canon):
-    tree = parse("CsRank($close)")
-    assert canon.is_duplicate(tree, tree)
-
-
-def test_is_duplicate_different_formulas(canon):
-    tree_a = parse("CsRank($close)")
-    tree_b = parse("CsRank($volume)")
-    assert not canon.is_duplicate(tree_a, tree_b)
-
-
-# -----------------------------------------------------------------------
-# Cache: second call should be faster (or at least not slower)
-# -----------------------------------------------------------------------
-
-def test_cache_works(canon):
-    tree = parse("Add(Mul($close, $open), Neg($volume))")
-
-    # First call populates cache
-    h1 = canon.canonicalize(tree)
-
-    # Second call should hit cache and return same hash
-    h2 = canon.canonicalize(tree)
-    assert h1 == h2
-
-    # Verify cache is populated
-    key = tree.to_string()
-    assert key in canon._cache
diff --git a/src/factorminer/factorminer/tests/test_capacity.py b/src/factorminer/factorminer/tests/test_capacity.py
deleted file mode 100644
index 0d5fd3a..0000000
--- a/src/factorminer/factorminer/tests/test_capacity.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""Tests for capacity-aware backtesting (evaluation/capacity.py)."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.evaluation.capacity import (
-    CapacityConfig,
-    CapacityEstimator,
-    MarketImpactModel,
-    NetCostResult,
-)
-
-
-@pytest.fixture
-def rng():
-    return np.random.default_rng(42)
-
-
-@pytest.fixture
-def market_data(rng):
-    """Synthetic returns and volume for capacity tests."""
-    M, T = 20, 100
-    returns = rng.normal(0, 0.01, (M, T))
-    volume = np.abs(rng.normal(1e6, 1e5, (M, T)))
-    signals = rng.normal(0, 1, (M, T))
-    return returns, volume, signals
-
-
-# -----------------------------------------------------------------------
-# MarketImpactModel: higher capital -> higher impact_bps
-# -----------------------------------------------------------------------
-
-def test_impact_increases_with_capital(rng):
-    """Higher capital should result in higher average impact."""
-    M, T = 20, 100
-    signals = rng.normal(0, 1, (M, T))
-    # Use very high volume so low capital stays below participation limit
-    volume = np.abs(rng.normal(1e9, 1e8, (M, T)))
-    model = MarketImpactModel()
-
-    low_cap = model.estimate_impact(signals, volume, capital=1e6)
-    high_cap = model.estimate_impact(signals, volume, capital=1e9)
-
-    assert high_cap.avg_impact_bps > low_cap.avg_impact_bps
-
-
-def test_impact_result_shape(market_data):
-    """Impact arrays should match T dimension."""
-    returns, volume, signals = market_data
-    T = signals.shape[1]
-    model = MarketImpactModel()
-    result = model.estimate_impact(signals, volume, capital=1e8)
-
-    assert result.impact_bps.shape == (T,)
-    assert result.participation_rate.shape == (T,)
-    assert result.avg_impact_bps >= 0
-
-
-# -----------------------------------------------------------------------
-# CapacityEstimator: low capital -> net_icir ~ gross_icir
-# -----------------------------------------------------------------------
-
-def test_low_capital_minimal_degradation(market_data):
-    """At very low capital, net ICIR should be close to gross ICIR."""
-    returns, volume, signals = market_data
-    estimator = CapacityEstimator(
-        returns=returns,
-        volume=volume,
-        config=CapacityConfig(base_capital_usd=1e4),
-    )
-    result = estimator.net_cost_evaluation("test", signals, capital=1e4)
-    assert isinstance(result, NetCostResult)
-    # At very low capital, impact is tiny, so net ~ gross
-    diff = abs(result.gross_icir - result.net_icir)
-    assert diff < abs(result.gross_icir) + 0.5  # generous tolerance
-
-
-# -----------------------------------------------------------------------
-# CapacityEstimator: high capital -> significant IC degradation
-# -----------------------------------------------------------------------
-
-def test_high_capital_degrades_ic(market_data):
-    """At very high capital, the net ICIR should be meaningfully lower."""
-    returns, volume, signals = market_data
-    config = CapacityConfig(
-        capacity_levels=[1e4, 1e6, 1e8, 1e10],
-    )
-    estimator = CapacityEstimator(
-        returns=returns,
-        volume=volume,
-        config=config,
-    )
-    cap_est = estimator.estimate("test", signals)
-    # The capacity curve should show increasing degradation
-    degradations = list(cap_est.capacity_curve.values())
-    assert degradations[-1] >= degradations[0]
-
-
-# -----------------------------------------------------------------------
-# Edge case: zero volume
-# -----------------------------------------------------------------------
-
-def test_zero_volume_handling(rng):
-    """Zero volume should be handled gracefully (participation_limit used)."""
-    M, T = 10, 50
-    returns = rng.normal(0, 0.01, (M, T))
-    volume = np.zeros((M, T))  # all zero volume
-    signals = rng.normal(0, 1, (M, T))
-
-    model = MarketImpactModel()
-    result = model.estimate_impact(signals, volume, capital=1e8)
-
-    # Should not crash; participation rate should be capped at limit
-    assert not np.any(np.isnan(result.impact_bps))
-    cfg = CapacityConfig()
-    assert np.allclose(result.participation_rate, cfg.participation_limit)
diff --git a/src/factorminer/factorminer/tests/test_causal.py b/src/factorminer/factorminer/tests/test_causal.py
deleted file mode 100644
index 0f03278..0000000
--- a/src/factorminer/factorminer/tests/test_causal.py
+++ /dev/null
@@ -1,147 +0,0 @@
-"""Tests for the causal validation layer (evaluation/causal.py)."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.evaluation.causal import CausalConfig, CausalTestResult, CausalValidator
-
-
-@pytest.fixture
-def rng():
-    return np.random.default_rng(42)
-
-
-# -----------------------------------------------------------------------
-# CausalConfig defaults
-# -----------------------------------------------------------------------
-
-def test_causal_config_defaults():
-    cfg = CausalConfig()
-    assert cfg.enabled is True
-    assert cfg.granger_max_lag == 5
-    assert cfg.granger_significance == 0.05
-    assert cfg.n_interventions == 3
-    assert cfg.robustness_threshold == 0.4
-
-
-# -----------------------------------------------------------------------
-# CausalTestResult dataclass
-# -----------------------------------------------------------------------
-
-def test_causal_test_result_fields():
-    r = CausalTestResult(
-        factor_name="test",
-        granger_p_value=0.01,
-        granger_f_stat=5.0,
-        granger_passes=True,
-        intervention_ic_ratio=0.8,
-        intervention_passes=True,
-        robustness_score=0.7,
-        passes=True,
-    )
-    assert r.factor_name == "test"
-    assert r.passes is True
-    assert isinstance(r.details, dict)
-
-
-# -----------------------------------------------------------------------
-# Granger test: planted causal signal should pass
-# -----------------------------------------------------------------------
-
-def test_granger_causal_signal_passes(rng):
-    """A signal that IS lag-1 predictive of returns should produce low p."""
-    M, T = 20, 200
-    noise = rng.normal(0, 0.01, (M, T))
-    signal = rng.normal(0, 1, (M, T))
-    # Returns are a lagged copy of the signal + small noise
-    returns = np.zeros((M, T))
-    returns[:, 1:] = signal[:, :-1] * 0.5 + noise[:, 1:]
-
-    validator = CausalValidator(
-        returns=returns,
-        data_tensor=None,
-        library_signals={},
-        config=CausalConfig(granger_max_lag=3, seed=42),
-    )
-    result = validator.validate("planted_signal", signal)
-    # The Granger test should detect causality (low p-value)
-    assert result.granger_p_value < 0.10 or result.granger_passes
-
-
-# -----------------------------------------------------------------------
-# Granger test: random noise should fail (high p-value)
-# -----------------------------------------------------------------------
-
-def test_granger_random_noise_high_pvalue(rng):
-    """Pure noise signal should have high p-value."""
-    M, T = 20, 200
-    signal = rng.normal(0, 1, (M, T))
-    returns = rng.normal(0, 0.01, (M, T))
-
-    validator = CausalValidator(
-        returns=returns,
-        data_tensor=None,
-        library_signals={},
-        config=CausalConfig(granger_max_lag=3, seed=42),
-    )
-    result = validator.validate("noise_signal", signal)
-    # High p-value expected (not necessarily >0.05 due to random chance,
-    # but the test is about the API working correctly)
-    assert isinstance(result.granger_p_value, float)
-    assert 0.0 <= result.granger_p_value <= 1.0
-
-
-# -----------------------------------------------------------------------
-# Intervention robustness: robust signal retains IC
-# -----------------------------------------------------------------------
-
-def test_intervention_robust_signal(rng):
-    """A signal strongly correlated with returns should be robust."""
-    M, T = 20, 100
-    returns = rng.normal(0, 0.01, (M, T))
-    # Signal is nearly identical to returns -> high IC, robust
-    signal = returns * 10 + rng.normal(0, 0.001, (M, T))
-
-    validator = CausalValidator(
-        returns=returns,
-        data_tensor=None,
-        library_signals={},
-        config=CausalConfig(seed=42),
-    )
-    result = validator.validate("robust_factor", signal)
-    assert result.intervention_ic_ratio > 0.0
-    assert isinstance(result.intervention_passes, bool)
-    assert isinstance(result.robustness_score, float)
-
-
-def test_validate_excludes_candidate_from_control_library(rng):
-    """A factor under test should not be used as its own Granger control."""
-    M, T = 8, 40
-    signal = rng.normal(0, 1, (M, T))
-    returns = rng.normal(0, 0.01, (M, T))
-    control = rng.normal(0, 1, (M, T))
-
-    validator = CausalValidator(
-        returns=returns,
-        data_tensor=None,
-        library_signals={
-            "candidate_factor": signal.copy(),
-            "control_factor": control,
-        },
-        config=CausalConfig(seed=42),
-    )
-
-    captured: dict[str, np.ndarray] = {}
-
-    def _capture_controls(signals_arg, returns_arg, library_signals_arg):
-        captured.update(library_signals_arg)
-        return 1.0, 0.0, True
-
-    validator._granger_test = _capture_controls  # type: ignore[method-assign]
-    result = validator.validate("candidate_factor", signal)
-
-    assert result.granger_passes is True
-    assert "candidate_factor" not in captured
-    assert "control_factor" in captured
diff --git a/src/factorminer/factorminer/tests/test_cli_analysis.py b/src/factorminer/factorminer/tests/test_cli_analysis.py
deleted file mode 100644
index 33c2c5c..0000000
--- a/src/factorminer/factorminer/tests/test_cli_analysis.py
+++ /dev/null
@@ -1,312 +0,0 @@
-"""Focused CLI analysis tests for evaluate, combine, and visualize."""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-from click.testing import CliRunner
-import numpy as np
-
-from src.factorminer.factorminer.cli import main
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.core.library_io import save_library
-from src.factorminer.factorminer.evaluation.runtime import DatasetSplit, FactorEvaluationArtifact
-
-
-@dataclass
-class _FakeDataset:
-    """Small runtime dataset stub sufficient for analysis CLI commands."""
-
-    asset_ids: np.ndarray
-    timestamps: np.ndarray
-    splits: dict[str, DatasetSplit]
-
-    def get_split(self, name: str) -> DatasetSplit:
-        return self.splits[name]
-
-
-def _make_stats(
-    ic_mean: float,
-    ic_abs_mean: float,
-    icir: float,
-    ic_win_rate: float,
-    turnover: float,
-) -> dict:
-    return {
-        "ic_mean": ic_mean,
-        "ic_abs_mean": ic_abs_mean,
-        "icir": icir,
-        "ic_win_rate": ic_win_rate,
-        "turnover": turnover,
-        "ic_series": np.array([ic_mean, -ic_mean / 2.0, ic_mean / 3.0], dtype=np.float64),
-        "Q1": -0.02,
-        "Q2": -0.01,
-        "Q3": 0.0,
-        "Q4": 0.01,
-        "Q5": 0.02,
-        "long_short": 0.04,
-        "monotonicity": 1.0,
-    }
-
-
-def _make_artifact(
-    factor_id: int,
-    name: str,
-    train_abs_ic: float,
-    test_abs_ic: float,
-) -> FactorEvaluationArtifact:
-    train_signal = np.full((2, 3), float(factor_id), dtype=np.float64)
-    test_signal = np.full((2, 3), float(factor_id) * 10.0, dtype=np.float64)
-    full_signal = np.concatenate([train_signal, test_signal], axis=1)
-
-    return FactorEvaluationArtifact(
-        factor_id=factor_id,
-        name=name,
-        formula="Neg($close)",
-        category="test",
-        parse_ok=True,
-        signals_full=full_signal,
-        split_signals={
-            "train": train_signal,
-            "test": test_signal,
-            "full": full_signal,
-        },
-        split_stats={
-            "train": _make_stats(0.05 * factor_id, train_abs_ic, 1.0 + factor_id, 0.6, 0.1),
-            "test": _make_stats(-0.04 * factor_id, test_abs_ic, 0.8 + factor_id, 0.4, 0.2),
-            "full": _make_stats(0.01 * factor_id, max(train_abs_ic, test_abs_ic), 0.9, 0.5, 0.15),
-        },
-    )
-
-
-def _make_dataset() -> _FakeDataset:
-    timestamps = np.array(
-        [
-            np.datetime64("2025-01-01"),
-            np.datetime64("2025-01-02"),
-            np.datetime64("2025-01-03"),
-            np.datetime64("2025-01-04"),
-            np.datetime64("2025-01-05"),
-            np.datetime64("2025-01-06"),
-        ]
-    )
-    returns = np.zeros((2, 3), dtype=np.float64)
-    return _FakeDataset(
-        asset_ids=np.array(["A", "B"]),
-        timestamps=timestamps,
-        splits={
-            "train": DatasetSplit(
-                name="train",
-                indices=np.array([0, 1, 2]),
-                timestamps=timestamps[:3],
-                returns=returns,
-            ),
-            "test": DatasetSplit(
-                name="test",
-                indices=np.array([3, 4, 5]),
-                timestamps=timestamps[3:],
-                returns=returns,
-            ),
-            "full": DatasetSplit(
-                name="full",
-                indices=np.array([0, 1, 2, 3, 4, 5]),
-                timestamps=timestamps,
-                returns=np.zeros((2, 6), dtype=np.float64),
-            ),
-        },
-    )
-
-
-def _save_test_library(tmp_path) -> str:
-    library = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-    library.admit_factor(
-        Factor(
-            id=0,
-            name="factor_one",
-            formula="Neg($close)",
-            category="test",
-            ic_mean=9.99,
-            icir=8.88,
-            ic_win_rate=0.99,
-            max_correlation=0.0,
-            batch_number=1,
-        )
-    )
-    library.admit_factor(
-        Factor(
-            id=0,
-            name="factor_two",
-            formula="Neg($open)",
-            category="test",
-            ic_mean=7.77,
-            icir=6.66,
-            ic_win_rate=0.95,
-            max_correlation=0.0,
-            batch_number=1,
-        )
-    )
-    base_path = tmp_path / "factor_library"
-    save_library(library, base_path, save_signals=False)
-    return str(base_path.with_suffix(".json"))
-
-
-def test_evaluate_recomputes_and_selects_top_k_by_train_split(tmp_path, monkeypatch):
-    """`evaluate --period both` should use recomputed train metrics for top-k."""
-    library_path = _save_test_library(tmp_path)
-    dataset = _make_dataset()
-    artifacts = [
-        _make_artifact(1, "factor_one", train_abs_ic=0.20, test_abs_ic=0.90),
-        _make_artifact(2, "factor_two", train_abs_ic=0.70, test_abs_ic=0.10),
-    ]
-
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.cli._load_runtime_dataset_for_analysis",
-        lambda cfg, data_path, mock: dataset,
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.cli._recompute_analysis_artifacts",
-        lambda library, dataset_arg, signal_failure_policy: artifacts,
-    )
-
-    runner = CliRunner()
-    result = runner.invoke(
-        main,
-        [
-            "--cpu",
-            "--output-dir",
-            str(tmp_path / "out"),
-            "evaluate",
-            library_path,
-            "--mock",
-            "--period",
-            "both",
-            "--top-k",
-            "1",
-        ],
-    )
-
-    assert result.exit_code == 0, result.output
-    assert "Evaluating top 1 factors by train |IC| for train/test comparison" in result.output
-    assert "factor_two" in result.output
-    assert "factor_one" not in result.output
-    assert "0.7000" in result.output
-    assert "9.9900" not in result.output
-    assert "Decay summary (train -> test)" in result.output
-
-
-def test_combine_uses_fit_split_for_factor_preselection(tmp_path, monkeypatch):
-    """`combine` should pre-select factors by fit split rather than eval split."""
-    library_path = _save_test_library(tmp_path)
-    dataset = _make_dataset()
-    artifacts = [
-        _make_artifact(1, "factor_one", train_abs_ic=0.20, test_abs_ic=0.90),
-        _make_artifact(2, "factor_two", train_abs_ic=0.70, test_abs_ic=0.10),
-    ]
-    captured_factor_ids: list[int] = []
-
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.cli._load_runtime_dataset_for_analysis",
-        lambda cfg, data_path, mock: dataset,
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.cli._recompute_analysis_artifacts",
-        lambda library, dataset_arg, signal_failure_policy: artifacts,
-    )
-
-    def _capture_equal_weight(self, factor_signals):
-        captured_factor_ids.extend(sorted(factor_signals.keys()))
-        return next(iter(factor_signals.values()))
-
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.evaluation.combination.FactorCombiner.equal_weight",
-        _capture_equal_weight,
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.evaluation.portfolio.PortfolioBacktester.quintile_backtest",
-        lambda self, combined_signal, returns, transaction_cost_bps=0: {
-            "ic_mean": 0.12,
-            "icir": 1.23,
-            "ls_return": 0.04,
-            "monotonicity": 1.0,
-            "avg_turnover": 0.10,
-        },
-    )
-
-    runner = CliRunner()
-    result = runner.invoke(
-        main,
-        [
-            "--cpu",
-            "--output-dir",
-            str(tmp_path / "out"),
-            "combine",
-            library_path,
-            "--mock",
-            "--fit-period",
-            "train",
-            "--eval-period",
-            "test",
-            "--method",
-            "equal-weight",
-            "--top-k",
-            "1",
-        ],
-    )
-
-    assert result.exit_code == 0, result.output
-    assert "Pre-selected top 1 factors by train |IC|" in result.output
-    assert "Fit split:  train" in result.output
-    assert "Eval split: test" in result.output
-    assert captured_factor_ids == [2]
-
-
-def test_visualize_defaults_factor_specific_plots_to_split_top_factor(tmp_path, monkeypatch):
-    """`visualize` should default factor-specific plots to the split top factor."""
-    library_path = _save_test_library(tmp_path)
-    dataset = _make_dataset()
-    artifacts = [
-        _make_artifact(1, "factor_one", train_abs_ic=0.80, test_abs_ic=0.20),
-        _make_artifact(2, "factor_two", train_abs_ic=0.30, test_abs_ic=0.90),
-    ]
-    ic_paths: list[str] = []
-    quintile_paths: list[str] = []
-
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.cli._load_runtime_dataset_for_analysis",
-        lambda cfg, data_path, mock: dataset,
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.cli._recompute_analysis_artifacts",
-        lambda library, dataset_arg, signal_failure_policy: artifacts,
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.utils.visualization.plot_ic_timeseries",
-        lambda ic_series, dates, rolling_window=21, title="", save_path=None: ic_paths.append(save_path),
-    )
-    monkeypatch.setattr(
-        "src.factorminer.factorminer.utils.visualization.plot_quintile_returns",
-        lambda quintile_returns, title="", save_path=None: quintile_paths.append(save_path),
-    )
-
-    runner = CliRunner()
-    result = runner.invoke(
-        main,
-        [
-            "--cpu",
-            "--output-dir",
-            str(tmp_path / "viz"),
-            "visualize",
-            library_path,
-            "--mock",
-            "--period",
-            "test",
-            "--ic-timeseries",
-            "--quintile",
-        ],
-    )
-
-    assert result.exit_code == 0, result.output
-    assert "Defaulted to factor #2 factor_two for factor-specific plots." in result.output
-    assert ic_paths and all("factor_2" in path for path in ic_paths)
-    assert quintile_paths and all("factor_2" in path for path in quintile_paths)
-    assert not any("factor_1" in path for path in ic_paths + quintile_paths)
diff --git a/src/factorminer/factorminer/tests/test_cli_helix.py b/src/factorminer/factorminer/tests/test_cli_helix.py
deleted file mode 100644
index 761a1dc..0000000
--- a/src/factorminer/factorminer/tests/test_cli_helix.py
+++ /dev/null
@@ -1,142 +0,0 @@
-"""CLI tests for the Helix command."""
-
-from __future__ import annotations
-
-import json
-
-from click.testing import CliRunner
-import numpy as np
-import pandas as pd
-
-from src.factorminer.factorminer.cli import _build_core_mining_config, _prepare_data_arrays, main
-from src.factorminer.factorminer.utils.config import load_config
-
-
-def test_helix_cli_runs_with_mock_data(tmp_path):
-    """The helix command should execute end-to-end and save a library."""
-    output_dir = tmp_path / "helix-output"
-    runner = CliRunner()
-
-    result = runner.invoke(
-        main,
-        [
-            "--cpu",
-            "--output-dir",
-            str(output_dir),
-            "helix",
-            "--mock",
-            "-n",
-            "1",
-            "-b",
-            "5",
-            "-t",
-            "3",
-        ],
-    )
-
-    assert result.exit_code == 0, result.output
-    assert "Starting Helix Loop..." in result.output
-    assert "Helix mining complete!" in result.output
-
-    library_path = output_dir / "factor_library.json"
-    assert library_path.exists()
-
-    payload = json.loads(library_path.read_text())
-    assert "factors" in payload
-
-
-def test_helix_cli_reports_enabled_features(tmp_path):
-    """Explicit feature flags should be reflected in the CLI output."""
-    output_dir = tmp_path / "helix-flags"
-    runner = CliRunner()
-
-    result = runner.invoke(
-        main,
-        [
-            "--cpu",
-            "--output-dir",
-            str(output_dir),
-            "helix",
-            "--mock",
-            "--debate",
-            "--canonicalize",
-            "-n",
-            "1",
-            "-b",
-            "4",
-            "-t",
-            "2",
-        ],
-    )
-
-    assert result.exit_code == 0, result.output
-    assert "Active Phase 2 features: debate, canonicalization" in result.output
-
-
-def test_prepare_data_arrays_builds_full_factor_feature_surface():
-    """The CLI tensor builder should expose the paper's canonical features."""
-    df = pd.DataFrame(
-        [
-            {
-                "datetime": "2025-01-01 09:30:00",
-                "asset_id": "A",
-                "open": 10.0,
-                "high": 11.0,
-                "low": 9.0,
-                "close": 10.0,
-                "volume": 2.0,
-                "amount": 20.0,
-            },
-            {
-                "datetime": "2025-01-01 09:40:00",
-                "asset_id": "A",
-                "open": 10.0,
-                "high": 12.0,
-                "low": 9.5,
-                "close": 11.0,
-                "volume": 2.0,
-                "amount": 22.0,
-            },
-            {
-                "datetime": "2025-01-01 09:30:00",
-                "asset_id": "B",
-                "open": 20.0,
-                "high": 21.0,
-                "low": 19.0,
-                "close": 20.0,
-                "volume": 4.0,
-                "amount": 80.0,
-            },
-            {
-                "datetime": "2025-01-01 09:40:00",
-                "asset_id": "B",
-                "open": 20.0,
-                "high": 22.0,
-                "low": 19.5,
-                "close": 18.0,
-                "volume": 4.0,
-                "amount": 72.0,
-            },
-        ]
-    )
-    df["datetime"] = pd.to_datetime(df["datetime"])
-
-    data_tensor, forward_returns = _prepare_data_arrays(df)
-
-    assert data_tensor.shape == (2, 2, 8)
-    np.testing.assert_allclose(data_tensor[:, :, 6], np.array([[10.0, 11.0], [20.0, 18.0]]))
-    assert np.isnan(data_tensor[0, 0, 7])
-    np.testing.assert_allclose(data_tensor[:, 1, 7], np.array([0.1, -0.1]))
-    assert np.isnan(forward_returns[0, 1])
-    np.testing.assert_allclose(forward_returns[:, 0], np.array([0.1, -0.1]))
-
-
-def test_mock_mining_config_uses_synthetic_signal_failures(tmp_path):
-    """Mock mining flows should bypass strict benchmark recomputation defaults."""
-    cfg = load_config()
-
-    normal_config = _build_core_mining_config(cfg, tmp_path / "normal", mock=False)
-    mock_config = _build_core_mining_config(cfg, tmp_path / "mock", mock=True)
-
-    assert normal_config.signal_failure_policy == "reject"
-    assert mock_config.signal_failure_policy == "synthetic"
diff --git a/src/factorminer/factorminer/tests/test_combination.py b/src/factorminer/factorminer/tests/test_combination.py
deleted file mode 100644
index ebf2d8f..0000000
--- a/src/factorminer/factorminer/tests/test_combination.py
+++ /dev/null
@@ -1,531 +0,0 @@
-"""Tests for factor combination and selection strategies."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.evaluation.combination import FactorCombiner
-from src.factorminer.factorminer.evaluation.selection import FactorSelector
-
-
-# ---------------------------------------------------------------------------
-# Fixtures
-# ---------------------------------------------------------------------------
-
-@pytest.fixture
-def combiner():
-    return FactorCombiner()
-
-
-@pytest.fixture
-def rng():
-    return np.random.default_rng(42)
-
-
-@pytest.fixture
-def simple_signals(rng):
-    """Three factor signals of shape (T=50, N=20)."""
-    T, N = 50, 20
-    return {
-        1: rng.normal(0, 1, (T, N)),
-        2: rng.normal(0, 1, (T, N)),
-        3: rng.normal(0, 1, (T, N)),
-    }
-
-
-@pytest.fixture
-def identical_signals(rng):
-    """Two identical factor signals."""
-    T, N = 30, 10
-    sig = rng.normal(0, 1, (T, N))
-    return {1: sig.copy(), 2: sig.copy()}
-
-
-# ---------------------------------------------------------------------------
-# Equal weight
-# ---------------------------------------------------------------------------
-
-class TestEqualWeight:
-    """Test equal-weight combination."""
-
-    def test_output_shape(self, combiner, simple_signals):
-        result = combiner.equal_weight(simple_signals)
-        T, N = next(iter(simple_signals.values())).shape
-        assert result.shape == (T, N)
-
-    def test_single_factor_is_zscore(self, combiner, rng):
-        T, N = 30, 10
-        sig = rng.normal(5, 2, (T, N))
-        signals = {1: sig}
-        result = combiner.equal_weight(signals)
-        # Should be z-scored: mean ~0 per row
-        row_means = np.nanmean(result, axis=1)
-        np.testing.assert_array_almost_equal(row_means, np.zeros(T), decimal=10)
-
-    def test_two_identical_factors(self, combiner, identical_signals):
-        result = combiner.equal_weight(identical_signals)
-        # Average of two identical z-scored signals = same z-scored signal
-        single = combiner.equal_weight({1: identical_signals[1]})
-        np.testing.assert_array_almost_equal(result, single, decimal=10)
-
-    def test_empty_raises(self, combiner):
-        with pytest.raises(ValueError, match="not be empty"):
-            combiner.equal_weight({})
-
-    def test_result_is_average(self, combiner, rng):
-        """EW of multiple factors should be the average of their z-scores."""
-        T, N = 20, 10
-        s1 = np.ones((T, N))  # Constant -> z-score = 0
-        s2 = np.tile(np.arange(N, dtype=np.float64), (T, 1))  # Variable
-        signals = {1: s1, 2: s2}
-        result = combiner.equal_weight(signals)
-        # s1 z-score = 0 everywhere (constant), s2 z-score is not 0
-        # Average should be s2_zscore / 2
-        s2_zscore = combiner._cross_sectional_standardize(s2)
-        # s1_zscore is 0 (constant cross-section, std=0 -> std=1 fallback)
-        s1_zscore = combiner._cross_sectional_standardize(s1)
-        expected = np.nanmean(np.stack([s1_zscore, s2_zscore]), axis=0)
-        np.testing.assert_array_almost_equal(result, expected)
-
-
-# ---------------------------------------------------------------------------
-# IC-weighted
-# ---------------------------------------------------------------------------
-
-class TestICWeighted:
-    """Test IC-weighted combination."""
-
-    def test_output_shape(self, combiner, simple_signals):
-        ic_values = {1: 0.05, 2: 0.08, 3: 0.03}
-        result = combiner.ic_weighted(simple_signals, ic_values)
-        T, N = next(iter(simple_signals.values())).shape
-        assert result.shape == (T, N)
-
-    def test_higher_ic_gets_more_weight(self, combiner, rng):
-        T, N = 30, 15
-        s1 = rng.normal(0, 1, (T, N))
-        s2 = rng.normal(0, 1, (T, N))
-        signals = {1: s1, 2: s2}
-
-        # Give all weight to factor 1
-        ic_values_1 = {1: 1.0, 2: 0.0001}
-        result_1 = combiner.ic_weighted(signals, ic_values_1)
-
-        # Give all weight to factor 2
-        ic_values_2 = {1: 0.0001, 2: 1.0}
-        result_2 = combiner.ic_weighted(signals, ic_values_2)
-
-        # Results should be different (weighted differently)
-        assert not np.allclose(result_1, result_2)
-
-    def test_fallback_to_ew_with_nonpositive_ic(self, combiner, simple_signals):
-        ic_values = {1: -0.01, 2: 0.0, 3: -0.05}
-        result = combiner.ic_weighted(simple_signals, ic_values)
-        # Should fall back to equal weight
-        ew_result = combiner.equal_weight(simple_signals)
-        np.testing.assert_array_almost_equal(result, ew_result)
-
-    def test_empty_raises(self, combiner):
-        with pytest.raises(ValueError, match="not be empty"):
-            combiner.ic_weighted({}, {})
-
-
-# ---------------------------------------------------------------------------
-# Orthogonal
-# ---------------------------------------------------------------------------
-
-class TestOrthogonal:
-    """Test orthogonal (Gram-Schmidt) combination."""
-
-    def test_output_shape(self, combiner, simple_signals):
-        result = combiner.orthogonal(simple_signals)
-        T, N = next(iter(simple_signals.values())).shape
-        assert result.shape == (T, N)
-
-    def test_single_factor(self, combiner, rng):
-        T, N = 20, 10
-        sig = rng.normal(0, 1, (T, N))
-        result = combiner.orthogonal({1: sig})
-        # Single factor orthogonalized = z-scored version
-        zscore = combiner._cross_sectional_standardize(sig)
-        np.testing.assert_array_almost_equal(result, zscore)
-
-    def test_orthogonal_different_from_ew(self, combiner, simple_signals):
-        ew = combiner.equal_weight(simple_signals)
-        ortho = combiner.orthogonal(simple_signals)
-        # They should generally differ (unless signals are already orthogonal)
-        # Check that the operation at least runs without error
-        assert ortho.shape == ew.shape
-
-    def test_empty_raises(self, combiner):
-        with pytest.raises(ValueError, match="not be empty"):
-            combiner.orthogonal({})
-
-
-# ---------------------------------------------------------------------------
-# Cross-sectional standardization helper
-# ---------------------------------------------------------------------------
-
-class TestCrossSectionalStandardize:
-    """Test the internal _cross_sectional_standardize method."""
-
-    def test_zero_mean_per_row(self, combiner, rng):
-        T, N = 20, 15
-        signals = rng.normal(5.0, 2.0, (T, N))
-        result = combiner._cross_sectional_standardize(signals)
-        row_means = np.nanmean(result, axis=1)
-        np.testing.assert_array_almost_equal(row_means, np.zeros(T), decimal=10)
-
-    def test_unit_std_per_row(self, combiner, rng):
-        T, N = 20, 30
-        signals = rng.normal(10.0, 3.0, (T, N))
-        result = combiner._cross_sectional_standardize(signals)
-        row_stds = np.nanstd(result, axis=1)
-        np.testing.assert_array_almost_equal(row_stds, np.ones(T), decimal=5)
-
-    def test_constant_row_handled(self, combiner):
-        signals = np.ones((5, 10))
-        result = combiner._cross_sectional_standardize(signals)
-        # Constant row: std=0, should be 0 after standardization
-        np.testing.assert_array_almost_equal(result, np.zeros((5, 10)))
-
-    def test_nan_handling(self, combiner, rng):
-        T, N = 10, 10
-        signals = rng.normal(0, 1, (T, N))
-        signals[0, 0] = np.nan
-        result = combiner._cross_sectional_standardize(signals)
-        assert np.isnan(result[0, 0])
-
-
-# ---------------------------------------------------------------------------
-# Gram-Schmidt helper
-# ---------------------------------------------------------------------------
-
-class TestGramSchmidt:
-    """Test the Gram-Schmidt orthogonalization helper."""
-
-    def test_orthogonal_output(self, rng):
-        T, N = 20, 10
-        factors = [rng.normal(0, 1, (T, N)) for _ in range(3)]
-        ortho = FactorCombiner._gram_schmidt(factors)
-        assert len(ortho) == 3
-
-        # Check approximate orthogonality of flattened vectors
-        for i in range(len(ortho)):
-            for j in range(i + 1, len(ortho)):
-                vi = np.where(np.isnan(ortho[i]), 0, ortho[i]).ravel()
-                vj = np.where(np.isnan(ortho[j]), 0, ortho[j]).ravel()
-                denom = np.sqrt(np.dot(vi, vi) * np.dot(vj, vj))
-                if denom > 1e-10:
-                    cos_sim = abs(np.dot(vi, vj) / denom)
-                    assert cos_sim < 0.01, f"Factors {i} and {j} not orthogonal: cos={cos_sim}"
-
-    def test_single_factor(self, rng):
-        T, N = 10, 5
-        f = [rng.normal(0, 1, (T, N))]
-        ortho = FactorCombiner._gram_schmidt(f)
-        assert len(ortho) == 1
-        np.testing.assert_array_almost_equal(ortho[0], f[0])
-
-    def test_nan_preserved(self, rng):
-        T, N = 10, 5
-        f1 = rng.normal(0, 1, (T, N))
-        f2 = rng.normal(0, 1, (T, N))
-        f1[0, 0] = np.nan
-        ortho = FactorCombiner._gram_schmidt([f1, f2])
-        assert np.isnan(ortho[0][0, 0])
-
-
-# ===========================================================================
-# Factor Selection Tests
-# ===========================================================================
-
-# ---------------------------------------------------------------------------
-# Fixtures for selection tests
-# ---------------------------------------------------------------------------
-
-@pytest.fixture
-def selector():
-    return FactorSelector()
-
-
-@pytest.fixture
-def synthetic_factors(rng):
-    """Synthetic factor signals for selection tests.
-
-    Creates 5 factors of shape (T=80, N=30) where factor 0 is predictive
-    (correlated with returns) and the rest are noise.
-    """
-    T, N = 80, 30
-    returns = rng.normal(0, 0.02, (T, N))
-
-    signals = {}
-    # Factor 0: predictive (signal ~ returns + noise)
-    signals[0] = returns + rng.normal(0, 0.01, (T, N))
-    # Factors 1-4: pure noise
-    for i in range(1, 5):
-        signals[i] = rng.normal(0, 1, (T, N))
-
-    return signals, returns
-
-
-@pytest.fixture
-def uniform_factors(rng):
-    """5 factors that are all weakly predictive."""
-    T, N = 60, 25
-    returns = rng.normal(0, 0.02, (T, N))
-
-    signals = {}
-    for i in range(5):
-        signals[i] = returns * (0.5 + 0.1 * i) + rng.normal(0, 0.05, (T, N))
-
-    return signals, returns
-
-
-# ---------------------------------------------------------------------------
-# _prepare_panel helper tests
-# ---------------------------------------------------------------------------
-
-class TestPreparePanel:
-    """Test the _prepare_panel static helper."""
-
-    def test_empty_returns_empty(self, selector):
-        ids, X, y = selector._prepare_panel({}, np.empty((10, 5)))
-        assert ids == []
-        assert X.shape == (0, 0)
-        assert y.shape == (0,)
-
-    def test_output_shapes(self, selector, rng):
-        T, N = 20, 10
-        signals = {
-            1: rng.normal(0, 1, (T, N)),
-            2: rng.normal(0, 1, (T, N)),
-        }
-        returns = rng.normal(0, 1, (T, N))
-
-        ids, X, y = selector._prepare_panel(signals, returns)
-        assert ids == [1, 2]
-        # X should be (n_valid_samples, 2), y should be (n_valid_samples,)
-        assert X.shape[1] == 2
-        assert X.shape[0] == y.shape[0]
-        assert X.shape[0] <= T * N
-
-    def test_nan_rows_dropped(self, selector, rng):
-        T, N = 10, 5
-        signals = {1: np.ones((T, N))}
-        returns = np.ones((T, N))
-        # Inject NaN into one position
-        signals[1][0, 0] = np.nan
-
-        ids, X, y = selector._prepare_panel(signals, returns)
-        assert X.shape[0] == T * N - 1  # One row dropped
-
-    def test_ids_sorted(self, selector, rng):
-        T, N = 5, 3
-        signals = {
-            3: rng.normal(0, 1, (T, N)),
-            1: rng.normal(0, 1, (T, N)),
-            2: rng.normal(0, 1, (T, N)),
-        }
-        returns = rng.normal(0, 1, (T, N))
-        ids, _, _ = selector._prepare_panel(signals, returns)
-        assert ids == [1, 2, 3]
-
-
-# ---------------------------------------------------------------------------
-# _composite_icir helper tests
-# ---------------------------------------------------------------------------
-
-class TestCompositeICIR:
-    """Test the _composite_icir static helper."""
-
-    def test_empty_returns_zero(self, selector, rng):
-        T, N = 20, 10
-        signals = {1: rng.normal(0, 1, (T, N))}
-        returns = rng.normal(0, 1, (T, N))
-        assert selector._composite_icir(signals, [], returns) == 0.0
-
-    def test_single_factor(self, selector, rng):
-        T, N = 50, 20
-        returns = rng.normal(0, 0.02, (T, N))
-        signals = {1: returns + rng.normal(0, 0.01, (T, N))}
-        icir = selector._composite_icir(signals, [1], returns)
-        assert isinstance(icir, float)
-        # Predictive signal should have positive ICIR
-        assert icir > 0
-
-    def test_noise_factor_low_icir(self, selector, rng):
-        T, N = 50, 20
-        returns = rng.normal(0, 0.02, (T, N))
-        signals = {1: rng.normal(0, 1, (T, N))}
-        icir = selector._composite_icir(signals, [1], returns)
-        # Pure noise should have ICIR near zero (much lower than predictive)
-        assert abs(icir) < 2.0  # Loose bound, noise can have some correlation
-
-
-# ---------------------------------------------------------------------------
-# Lasso selection tests
-# ---------------------------------------------------------------------------
-
-class TestLassoSelection:
-    """Test L1-regularized Lasso factor selection."""
-
-    def test_empty_signals_returns_empty(self, selector):
-        result = selector.lasso_selection({}, np.empty((10, 5)))
-        assert result == []
-
-    def test_returns_list_of_tuples(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.lasso_selection(signals, returns, alpha=0.001)
-        assert isinstance(result, list)
-        for item in result:
-            assert isinstance(item, tuple)
-            assert len(item) == 2
-            fid, coef = item
-            assert isinstance(fid, int)
-            assert isinstance(coef, float)
-
-    def test_sorted_by_abs_coefficient(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.lasso_selection(signals, returns, alpha=0.001)
-        if len(result) >= 2:
-            abs_coefs = [abs(c) for _, c in result]
-            assert abs_coefs == sorted(abs_coefs, reverse=True)
-
-    def test_selects_predictive_factor(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.lasso_selection(signals, returns, alpha=0.0001)
-        if result:
-            selected_ids = [fid for fid, _ in result]
-            # Factor 0 is correlated with returns; it should be selected
-            assert 0 in selected_ids
-
-    def test_sparsity_with_high_alpha(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result_low = selector.lasso_selection(signals, returns, alpha=0.0001)
-        result_high = selector.lasso_selection(signals, returns, alpha=1.0)
-        # Higher alpha should select fewer (or equal) factors
-        assert len(result_high) <= len(result_low)
-
-    def test_auto_alpha_via_cv(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        # alpha=None triggers LassoCV
-        result = selector.lasso_selection(signals, returns, alpha=None)
-        assert isinstance(result, list)
-
-    def test_nonzero_coefficients_only(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.lasso_selection(signals, returns, alpha=0.001)
-        for _, coef in result:
-            assert abs(coef) > 1e-10
-
-
-# ---------------------------------------------------------------------------
-# Forward stepwise selection tests
-# ---------------------------------------------------------------------------
-
-class TestForwardStepwise:
-    """Test greedy forward stepwise factor selection."""
-
-    def test_empty_signals_returns_empty(self, selector):
-        result = selector.forward_stepwise({}, np.empty((10, 5)))
-        assert result == []
-
-    def test_returns_list_of_tuples(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.forward_stepwise(signals, returns, max_factors=3)
-        assert isinstance(result, list)
-        for item in result:
-            assert isinstance(item, tuple)
-            assert len(item) == 2
-
-    def test_respects_max_factors(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.forward_stepwise(signals, returns, max_factors=2)
-        assert len(result) <= 2
-
-    def test_selection_order(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.forward_stepwise(signals, returns, max_factors=5)
-        # Each entry should have positive delta (ICIR improvement)
-        for _, delta in result:
-            assert delta > 0
-
-    def test_no_duplicate_selections(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.forward_stepwise(signals, returns, max_factors=5)
-        selected_ids = [fid for fid, _ in result]
-        assert len(selected_ids) == len(set(selected_ids))
-
-    def test_predictive_factor_selected_first(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.forward_stepwise(signals, returns, max_factors=5)
-        if result:
-            # Factor 0 is highly predictive; should likely be selected first
-            first_id = result[0][0]
-            assert first_id == 0
-
-    def test_single_factor_available(self, selector, rng):
-        T, N = 40, 15
-        returns = rng.normal(0, 0.02, (T, N))
-        signals = {42: returns + rng.normal(0, 0.01, (T, N))}
-        result = selector.forward_stepwise(signals, returns, max_factors=5)
-        assert len(result) <= 1
-        if result:
-            assert result[0][0] == 42
-
-
-# ---------------------------------------------------------------------------
-# XGBoost selection tests
-# ---------------------------------------------------------------------------
-
-class TestXGBoostSelection:
-    """Test XGBoost importance-based factor selection."""
-
-    def test_empty_signals_returns_empty(self, selector):
-        result = selector.xgboost_selection({}, np.empty((10, 5)))
-        assert result == []
-
-    def test_returns_all_factors(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.xgboost_selection(signals, returns)
-        # XGBoost returns importance for all factors
-        assert len(result) == len(signals)
-
-    def test_returns_list_of_tuples(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.xgboost_selection(signals, returns)
-        for item in result:
-            assert isinstance(item, tuple)
-            assert len(item) == 2
-            fid, importance = item
-            assert isinstance(fid, int)
-            assert isinstance(importance, float)
-
-    def test_sorted_by_importance(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.xgboost_selection(signals, returns)
-        importances = [imp for _, imp in result]
-        assert importances == sorted(importances, reverse=True)
-
-    def test_importances_nonnegative(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.xgboost_selection(signals, returns)
-        for _, importance in result:
-            assert importance >= 0.0
-
-    def test_predictive_factor_has_high_importance(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.xgboost_selection(signals, returns)
-        if result:
-            # Factor 0 is predictive; it should have the highest importance
-            top_id = result[0][0]
-            assert top_id == 0
-
-    def test_importances_sum_to_one(self, selector, synthetic_factors):
-        signals, returns = synthetic_factors
-        result = selector.xgboost_selection(signals, returns)
-        total = sum(imp for _, imp in result)
-        # Gain importances from XGBoost should sum to ~1
-        assert total == pytest.approx(1.0, abs=0.05)
diff --git a/src/factorminer/factorminer/tests/test_data.py b/src/factorminer/factorminer/tests/test_data.py
deleted file mode 100644
index a294f41..0000000
--- a/src/factorminer/factorminer/tests/test_data.py
+++ /dev/null
@@ -1,258 +0,0 @@
-"""Tests for the data pipeline: mock data generation, preprocessing, tensor building."""
-
-from __future__ import annotations
-
-import numpy as np
-import pandas as pd
-import pytest
-
-from src.factorminer.factorminer.data.loader import load_market_data
-from src.factorminer.factorminer.data.mock_data import MockConfig, generate_mock_data, generate_with_halts
-
-
-# ---------------------------------------------------------------------------
-# Mock data generation
-# ---------------------------------------------------------------------------
-
-class TestMockDataGeneration:
-    """Test the synthetic market data generator."""
-
-    @pytest.fixture
-    def small_config(self):
-        return MockConfig(
-            num_assets=10,
-            num_periods=100,
-            frequency="1d",
-            seed=42,
-        )
-
-    @pytest.fixture
-    def small_df(self, small_config):
-        return generate_mock_data(small_config)
-
-    def test_returns_dataframe(self, small_df):
-        assert isinstance(small_df, pd.DataFrame)
-
-    def test_required_columns(self, small_df):
-        for col in ["datetime", "asset_id", "open", "high", "low", "close", "volume", "amount"]:
-            assert col in small_df.columns, f"Missing column: {col}"
-
-    def test_correct_shape(self, small_config, small_df):
-        expected_rows = small_config.num_assets * small_config.num_periods
-        assert len(small_df) == expected_rows
-
-    def test_unique_assets(self, small_config, small_df):
-        n_unique = small_df["asset_id"].nunique()
-        assert n_unique == small_config.num_assets
-
-    def test_periods_per_asset(self, small_config, small_df):
-        counts = small_df.groupby("asset_id").size()
-        assert (counts == small_config.num_periods).all()
-
-
-# ---------------------------------------------------------------------------
-# OHLC consistency
-# ---------------------------------------------------------------------------
-
-class TestOHLCConsistency:
-    """Test that generated data maintains OHLC invariants."""
-
-    @pytest.fixture
-    def df(self):
-        config = MockConfig(num_assets=20, num_periods=200, seed=123)
-        return generate_mock_data(config)
-
-    def test_low_le_high(self, df):
-        assert (df["low"] <= df["high"] + 1e-8).all(), "Found low > high"
-
-    def test_open_within_range(self, df):
-        assert (df["open"] >= df["low"] - 1e-8).all(), "Found open < low"
-        assert (df["open"] <= df["high"] + 1e-8).all(), "Found open > high"
-
-    def test_close_within_range(self, df):
-        assert (df["close"] >= df["low"] - 1e-8).all(), "Found close < low"
-        assert (df["close"] <= df["high"] + 1e-8).all(), "Found close > high"
-
-    def test_positive_prices(self, df):
-        for col in ["open", "high", "low", "close"]:
-            assert (df[col] > 0).all(), f"Found non-positive {col}"
-
-    def test_positive_volume(self, df):
-        assert (df["volume"] >= 0).all(), "Found negative volume"
-
-    def test_positive_amount(self, df):
-        assert (df["amount"] >= 0).all(), "Found negative amount"
-
-
-# ---------------------------------------------------------------------------
-# Trading halts
-# ---------------------------------------------------------------------------
-
-class TestHaltGeneration:
-    """Test synthetic data with trading halts."""
-
-    def test_generate_with_halts(self):
-        config = MockConfig(num_assets=10, num_periods=100, seed=42)
-        df = generate_with_halts(config, halt_fraction=0.05)
-        assert isinstance(df, pd.DataFrame)
-        # Should have some zero-volume bars
-        assert (df["volume"] == 0).any()
-
-    def test_halt_bars_have_flat_ohlc(self):
-        config = MockConfig(num_assets=10, num_periods=100, seed=42)
-        df = generate_with_halts(config, halt_fraction=0.05)
-        halted = df[df["volume"] == 0]
-        if len(halted) > 0:
-            # Open = High = Low = Close for halted bars
-            np.testing.assert_array_almost_equal(halted["open"], halted["close"])
-            np.testing.assert_array_almost_equal(halted["high"], halted["close"])
-            np.testing.assert_array_almost_equal(halted["low"], halted["close"])
-
-
-# ---------------------------------------------------------------------------
-# Different frequencies
-# ---------------------------------------------------------------------------
-
-class TestFrequencies:
-    """Test data generation at different frequencies."""
-
-    @pytest.mark.parametrize("freq", ["10min", "30min", "1h", "1d"])
-    def test_frequency(self, freq):
-        config = MockConfig(num_assets=5, num_periods=50, frequency=freq, seed=42)
-        df = generate_mock_data(config)
-        assert len(df) > 0
-        assert "datetime" in df.columns
-
-
-# ---------------------------------------------------------------------------
-# MockConfig defaults
-# ---------------------------------------------------------------------------
-
-class TestMockConfig:
-    """Test MockConfig defaults and overrides."""
-
-    def test_default_config(self):
-        config = MockConfig()
-        assert config.num_assets == 50
-        assert config.num_periods == 1000
-        assert config.frequency == "10min"
-        assert config.seed == 42
-
-    def test_config_with_universe(self):
-        config = MockConfig(num_assets=5, num_periods=20, universe="CSI300")
-        df = generate_mock_data(config)
-        assert "universe" in df.columns
-        assert (df["universe"] == "CSI300").all()
-
-    def test_config_no_planted_alpha(self):
-        config = MockConfig(num_assets=5, num_periods=20, plant_alpha=False)
-        df = generate_mock_data(config)
-        assert len(df) > 0
-
-
-# ---------------------------------------------------------------------------
-# Feature computation (basic checks with preprocessor if available)
-# ---------------------------------------------------------------------------
-
-class TestFeatureComputation:
-    """Test derived feature computation."""
-
-    def test_vwap_computable(self):
-        config = MockConfig(num_assets=5, num_periods=50, seed=42)
-        df = generate_mock_data(config)
-        # VWAP can be approximated from high, low, close
-        vwap = (df["high"] + df["low"] + df["close"]) / 3
-        assert len(vwap) == len(df)
-        assert (vwap > 0).all()
-
-    def test_returns_computable(self):
-        config = MockConfig(num_assets=5, num_periods=50, seed=42)
-        df = generate_mock_data(config)
-        # Returns per asset
-        df = df.sort_values(["asset_id", "datetime"])
-        df["returns"] = df.groupby("asset_id")["close"].pct_change()
-        # First bar per asset should be NaN
-        first_bar_per_asset = df.groupby("asset_id").head(1)
-        assert first_bar_per_asset["returns"].isna().all()
-        # Rest should be finite
-        rest = df.dropna(subset=["returns"])
-        assert np.isfinite(rest["returns"]).all()
-
-
-# ---------------------------------------------------------------------------
-# Tensor builder integration
-# ---------------------------------------------------------------------------
-
-class TestTensorBuilder:
-    """Test tensor construction from mock data (if modules available)."""
-
-    def test_build_pipeline_import(self):
-        """Verify we can import the tensor builder."""
-        from factorminer.data.tensor_builder import TensorConfig, build_tensor
-        config = TensorConfig()
-        assert config.backend == "numpy"
-        assert "close" in config.features
-
-    def test_temporal_split_import(self):
-        """Verify temporal_split is importable."""
-        from factorminer.data.tensor_builder import temporal_split
-        assert callable(temporal_split)
-
-
-# ---------------------------------------------------------------------------
-# Loader schema compatibility
-# ---------------------------------------------------------------------------
-
-class TestLoaderSchemaCompatibility:
-    """Test common market-data schema variants accepted by the loader."""
-
-    def test_accepts_common_column_aliases(self, tmp_path):
-        path = tmp_path / "alias_data.csv"
-        df = pd.DataFrame(
-            {
-                "timestamp": pd.to_datetime(
-                    ["2025-01-01 09:30:00", "2025-01-01 09:40:00"]
-                ),
-                "code": ["600519.SH", "600519.SH"],
-                "open": [10.0, 10.2],
-                "high": [10.3, 10.4],
-                "low": [9.9, 10.1],
-                "close": [10.1, 10.3],
-                "volume": [1000.0, 1200.0],
-                "amt": [10100.0, 12360.0],
-            }
-        )
-        df.to_csv(path, index=False)
-
-        loaded = load_market_data(path)
-
-        assert list(loaded.columns[:8]) == [
-            "datetime",
-            "asset_id",
-            "open",
-            "high",
-            "low",
-            "close",
-            "volume",
-            "amount",
-        ]
-        assert loaded.loc[0, "asset_id"] == "600519.SH"
-        assert loaded.loc[1, "amount"] == 12360.0
-
-    def test_missing_asset_id_still_raises_clear_error(self, tmp_path):
-        path = tmp_path / "missing_asset_id.csv"
-        df = pd.DataFrame(
-            {
-                "datetime": pd.to_datetime(["2025-01-01 09:30:00"]),
-                "open": [10.0],
-                "high": [10.3],
-                "low": [9.9],
-                "close": [10.1],
-                "volume": [1000.0],
-                "amount": [10100.0],
-            }
-        )
-        df.to_csv(path, index=False)
-
-        with pytest.raises(ValueError, match="missing required columns: \\['asset_id'\\]"):
-            load_market_data(path)
diff --git a/src/factorminer/factorminer/tests/test_debate.py b/src/factorminer/factorminer/tests/test_debate.py
deleted file mode 100644
index e6fee03..0000000
--- a/src/factorminer/factorminer/tests/test_debate.py
+++ /dev/null
@@ -1,229 +0,0 @@
-"""Tests for the multi-agent debate orchestrator (agent/debate.py)."""
-
-from __future__ import annotations
-
-import pytest
-
-from src.factorminer.factorminer.agent.critic import CriticAgent
-from src.factorminer.factorminer.agent.debate import DebateConfig, DebateGenerator
-from src.factorminer.factorminer.agent.llm_interface import MockProvider
-from src.factorminer.factorminer.agent.output_parser import CandidateFactor
-from src.factorminer.factorminer.agent.prompt_builder import PromptBuilder
-from src.factorminer.factorminer.agent.specialists import (
-    SpecialistConfig,
-    SpecialistPromptBuilder,
-)
-
-
-# -----------------------------------------------------------------------
-# SpecialistConfig and SpecialistPromptBuilder
-# -----------------------------------------------------------------------
-
-def test_specialist_config_creation():
-    cfg = SpecialistConfig(
-        name="test_spec",
-        domain="testing domain",
-        preferred_operators=["CsRank", "Neg"],
-        preferred_features=["$close"],
-        temperature=0.7,
-    )
-    assert cfg.name == "test_spec"
-    assert "CsRank" in cfg.preferred_operators
-
-
-def test_specialist_prompt_builder_inherits():
-    """SpecialistPromptBuilder should be a subclass of PromptBuilder."""
-    assert issubclass(SpecialistPromptBuilder, PromptBuilder)
-
-
-def test_specialist_prompt_builder_creates():
-    cfg = SpecialistConfig(
-        name="momentum",
-        domain="trend-following",
-        preferred_operators=["Delta"],
-        preferred_features=["$close"],
-        system_prompt_suffix="Focus on momentum.",
-    )
-    pb = SpecialistPromptBuilder(specialist_config=cfg)
-    assert "SPECIALIST DOMAIN DIRECTIVE" in pb.system_prompt
-    assert "Focus on momentum." in pb.system_prompt
-
-
-@pytest.fixture
-def helix_memory_signal():
-    return {
-        "prompt_text": (
-            "Prefer library-adjacent structures.\n"
-            "Avoid saturated price-only motifs."
-        ),
-        "complementary_patterns": [
-            "Combine TsRank momentum with liquidity normalization.",
-        ],
-        "conflict_warnings": [
-            "Price-volume reversal cluster is saturated.",
-        ],
-        "operator_cooccurrence": [
-            "TsRank + CsRank",
-        ],
-        "semantic_gaps": [
-            "VWAP-driven dispersion factors",
-        ],
-    }
-
-
-@pytest.fixture
-def prompt_library_state():
-    return {
-        "size": 12,
-        "target_size": 110,
-    }
-
-
-def _assert_helix_retrieval_sections(prompt: str) -> None:
-    assert "## HELIX RETRIEVAL SUMMARY" in prompt
-    assert "Prefer library-adjacent structures." in prompt
-    assert "Avoid saturated price-only motifs." in prompt
-    assert "## COMPLEMENTARY PATTERNS" in prompt
-    assert "Combine TsRank momentum with liquidity normalization." in prompt
-    assert "## SATURATION WARNINGS" in prompt
-    assert "Price-volume reversal cluster is saturated." in prompt
-    assert "## OPERATOR CO-OCCURRENCE PRIORS" in prompt
-    assert "TsRank + CsRank" in prompt
-    assert "## SEMANTIC GAPS" in prompt
-    assert "Underused but promising: VWAP-driven dispersion factors" in prompt
-
-
-def test_prompt_builder_renders_helix_retrieval_fields(
-    helix_memory_signal,
-    prompt_library_state,
-):
-    pb = PromptBuilder()
-
-    prompt = pb.build_user_prompt(
-        memory_signal=helix_memory_signal,
-        library_state=prompt_library_state,
-        batch_size=5,
-    )
-
-    _assert_helix_retrieval_sections(prompt)
-
-
-def test_specialist_prompt_builder_renders_helix_retrieval_fields(
-    helix_memory_signal,
-    prompt_library_state,
-):
-    cfg = SpecialistConfig(
-        name="momentum",
-        domain="trend-following",
-        preferred_operators=["Delta", "TsRank"],
-        preferred_features=["$close", "$returns"],
-        system_prompt_suffix="Focus on momentum.",
-    )
-    pb = SpecialistPromptBuilder(specialist_config=cfg)
-
-    prompt = pb.build_user_prompt(
-        memory_signal=helix_memory_signal,
-        library_state=prompt_library_state,
-        batch_size=5,
-    )
-
-    _assert_helix_retrieval_sections(prompt)
-    assert "## SPECIALIST FOCUS" in prompt
-    assert "trend-following specialist" in prompt
-
-
-# -----------------------------------------------------------------------
-# CriticAgent with MockProvider
-# -----------------------------------------------------------------------
-
-def test_critic_agent_with_mock():
-    """CriticAgent should produce scores when given proposals."""
-    provider = MockProvider()
-    critic = CriticAgent(llm_provider=provider)
-
-    candidates = [
-        CandidateFactor(name="f1", formula="Neg($close)", category="test"),
-        CandidateFactor(name="f2", formula="CsRank($volume)", category="test"),
-    ]
-    proposals = {"test_specialist": candidates}
-
-    scores = critic.review_candidates(
-        proposals=proposals,
-        library_state={"size": 0},
-        memory_signal={},
-    )
-    # Should return scores (fallback uniform if parsing fails)
-    assert len(scores) >= 2
-    assert all(hasattr(s, "final_score") for s in scores)
-
-
-# -----------------------------------------------------------------------
-# DebateGenerator.generate_batch returns List[CandidateFactor]
-# -----------------------------------------------------------------------
-
-def test_debate_generator_returns_candidates():
-    provider = MockProvider()
-    gen = DebateGenerator(
-        llm_provider=provider,
-        debate_config=DebateConfig(
-            enable_critic=False,
-            candidates_per_specialist=5,
-        ),
-    )
-    result = gen.generate_batch(batch_size=10)
-    assert isinstance(result, list)
-    # Should have some candidates (specialists produce them)
-    assert len(result) > 0
-    assert all(isinstance(c, CandidateFactor) for c in result)
-
-
-# -----------------------------------------------------------------------
-# DebateGenerator with critic produces non-empty results
-# -----------------------------------------------------------------------
-
-def test_debate_generator_with_critic():
-    provider = MockProvider()
-    gen = DebateGenerator(
-        llm_provider=provider,
-        debate_config=DebateConfig(
-            enable_critic=True,
-            candidates_per_specialist=5,
-            top_k_after_critic=10,
-        ),
-    )
-    result = gen.generate_batch(batch_size=10)
-    assert isinstance(result, list)
-    assert len(result) > 0
-
-
-def test_debate_generator_accepts_dict_recent_admissions():
-    provider = MockProvider()
-    gen = DebateGenerator(
-        llm_provider=provider,
-        debate_config=DebateConfig(
-            enable_critic=True,
-            candidates_per_specialist=2,
-            top_k_after_critic=6,
-        ),
-    )
-
-    result = gen.generate_batch(
-        batch_size=6,
-        library_state={
-            "recent_admissions": [
-                {
-                    "id": 7,
-                    "name": "volatilityminer_factor_2",
-                    "category": "VWAP",
-                },
-                {
-                    "id": 8,
-                    "name": "regimeminer_factor_2",
-                    "category": "Amount",
-                },
-            ]
-        },
-    )
-
-    assert isinstance(result, list)
-    assert len(result) > 0
diff --git a/src/factorminer/factorminer/tests/test_evaluation.py b/src/factorminer/factorminer/tests/test_evaluation.py
deleted file mode 100644
index 0030398..0000000
--- a/src/factorminer/factorminer/tests/test_evaluation.py
+++ /dev/null
@@ -1,287 +0,0 @@
-"""Tests for the evaluation metrics pipeline."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.evaluation.metrics import (
-    compute_factor_stats,
-    compute_ic,
-    compute_ic_mean,
-    compute_ic_win_rate,
-    compute_icir,
-    compute_pairwise_correlation,
-    compute_quintile_returns,
-    compute_turnover,
-)
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-@pytest.fixture
-def rng():
-    return np.random.default_rng(123)
-
-
-@pytest.fixture
-def perfect_signal(rng):
-    """Signal perfectly correlated with returns -> IC should be ~1.0."""
-    M, T = 50, 60
-    returns = rng.normal(0, 0.01, (M, T))
-    signals = returns.copy()  # Perfect correlation
-    return signals, returns
-
-
-@pytest.fixture
-def random_signal(rng):
-    """Random signal independent of returns -> IC should be ~0."""
-    M, T = 50, 80
-    returns = rng.normal(0, 0.01, (M, T))
-    signals = rng.normal(0, 1.0, (M, T))  # Independent
-    return signals, returns
-
-
-@pytest.fixture
-def known_quintile_signal(rng):
-    """Signal where high-signal assets have high returns."""
-    M, T = 100, 50
-    signals = np.tile(np.arange(M, dtype=np.float64).reshape(M, 1), (1, T))
-    # Returns correlated with signal rank
-    returns = signals * 0.001 + rng.normal(0, 0.001, (M, T))
-    return signals, returns
-
-
-# ---------------------------------------------------------------------------
-# IC computation
-# ---------------------------------------------------------------------------
-
-class TestIC:
-    """Test Information Coefficient computation."""
-
-    def test_perfect_signal_ic_near_one(self, perfect_signal):
-        signals, returns = perfect_signal
-        ic_series = compute_ic(signals, returns)
-        valid = ic_series[~np.isnan(ic_series)]
-        assert len(valid) > 0
-        # Perfect correlation should give IC close to 1.0
-        mean_ic = np.mean(valid)
-        assert mean_ic > 0.9, f"Expected IC > 0.9, got {mean_ic}"
-
-    def test_random_signal_ic_near_zero(self, random_signal):
-        signals, returns = random_signal
-        ic_series = compute_ic(signals, returns)
-        valid = ic_series[~np.isnan(ic_series)]
-        assert len(valid) > 0
-        # Random signal should give IC near 0
-        mean_ic = np.mean(np.abs(valid))
-        assert mean_ic < 0.2, f"Expected |IC| < 0.2, got {mean_ic}"
-
-    def test_ic_shape(self, perfect_signal):
-        signals, returns = perfect_signal
-        ic_series = compute_ic(signals, returns)
-        assert ic_series.shape == (signals.shape[1],)
-
-    def test_ic_with_nans(self, rng):
-        M, T = 30, 20
-        signals = rng.normal(0, 1, (M, T))
-        returns = rng.normal(0, 0.01, (M, T))
-        # Inject NaNs
-        signals[0, :] = np.nan
-        signals[:, 0] = np.nan
-        ic_series = compute_ic(signals, returns)
-        assert ic_series.shape == (T,)
-
-    def test_ic_too_few_assets_returns_nan(self):
-        # Only 3 assets (below threshold of 5)
-        signals = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
-        returns = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]], dtype=np.float64)
-        ic_series = compute_ic(signals, returns)
-        assert np.all(np.isnan(ic_series))
-
-
-# ---------------------------------------------------------------------------
-# ICIR computation
-# ---------------------------------------------------------------------------
-
-class TestICIR:
-    """Test ICIR = mean(IC) / std(IC)."""
-
-    def test_icir_positive_for_good_signal(self, rng):
-        # Use a signal that is correlated but not perfectly, so IC has variance
-        M, T = 50, 80
-        returns = rng.normal(0, 0.01, (M, T))
-        signals = returns + rng.normal(0, 0.005, (M, T))  # Noisy correlation
-        ic_series = compute_ic(signals, returns)
-        icir = compute_icir(ic_series)
-        assert icir > 0, f"Expected positive ICIR, got {icir}"
-
-    def test_icir_near_zero_for_random(self, random_signal):
-        signals, returns = random_signal
-        ic_series = compute_ic(signals, returns)
-        icir = compute_icir(ic_series)
-        # Random signal: ICIR should be small in magnitude
-        assert abs(icir) < 2.0, f"Expected small ICIR, got {icir}"
-
-    def test_icir_with_few_valid_points(self):
-        ic_series = np.array([np.nan, np.nan, 0.05])
-        icir = compute_icir(ic_series)
-        # Only 1 valid point -> returns 0.0
-        assert icir == 0.0
-
-    def test_icir_constant_ic_returns_zero(self):
-        ic_series = np.array([0.05, 0.05, 0.05, 0.05])
-        icir = compute_icir(ic_series)
-        # std = 0 -> returns 0.0
-        assert icir == 0.0
-
-
-# ---------------------------------------------------------------------------
-# IC-derived statistics
-# ---------------------------------------------------------------------------
-
-class TestICStats:
-    """Test IC mean and win rate."""
-
-    def test_ic_mean_absolute(self):
-        ic_series = np.array([0.1, -0.05, 0.08, -0.03, np.nan])
-        result = compute_ic_mean(ic_series)
-        expected = np.mean(np.abs([0.1, 0.05, 0.08, 0.03]))
-        np.testing.assert_almost_equal(result, expected)
-
-    def test_ic_win_rate(self):
-        ic_series = np.array([0.1, -0.05, 0.08, -0.03, 0.02, np.nan])
-        result = compute_ic_win_rate(ic_series)
-        # 3 positive out of 5 valid
-        np.testing.assert_almost_equal(result, 0.6)
-
-    def test_ic_mean_all_nan(self):
-        ic_series = np.array([np.nan, np.nan, np.nan])
-        assert compute_ic_mean(ic_series) == 0.0
-
-    def test_ic_win_rate_all_nan(self):
-        ic_series = np.array([np.nan, np.nan])
-        assert compute_ic_win_rate(ic_series) == 0.0
-
-
-# ---------------------------------------------------------------------------
-# Pairwise correlation
-# ---------------------------------------------------------------------------
-
-class TestPairwiseCorrelation:
-    """Test pairwise cross-sectional correlation."""
-
-    def test_identical_signals_correlation_one(self, rng):
-        M, T = 30, 40
-        signals = rng.normal(0, 1, (M, T))
-        corr = compute_pairwise_correlation(signals, signals)
-        assert corr > 0.95, f"Expected corr > 0.95 for identical, got {corr}"
-
-    def test_independent_signals_low_correlation(self, rng):
-        M, T = 50, 60
-        a = rng.normal(0, 1, (M, T))
-        b = rng.normal(0, 1, (M, T))
-        corr = compute_pairwise_correlation(a, b)
-        assert abs(corr) < 0.3, f"Expected low corr, got {corr}"
-
-    def test_negatively_correlated(self, rng):
-        M, T = 30, 40
-        a = rng.normal(0, 1, (M, T))
-        b = -a  # Perfectly negatively correlated
-        corr = compute_pairwise_correlation(a, b)
-        assert corr < -0.95, f"Expected corr < -0.95, got {corr}"
-
-    def test_correlation_with_nans(self, rng):
-        M, T = 30, 20
-        a = rng.normal(0, 1, (M, T))
-        b = rng.normal(0, 1, (M, T))
-        a[:5, :] = np.nan
-        corr = compute_pairwise_correlation(a, b)
-        # Should still produce a valid number
-        assert np.isfinite(corr)
-
-
-# ---------------------------------------------------------------------------
-# Quintile returns
-# ---------------------------------------------------------------------------
-
-class TestQuintileReturns:
-    """Test quintile return computation."""
-
-    def test_quintile_keys(self, known_quintile_signal):
-        signals, returns = known_quintile_signal
-        result = compute_quintile_returns(signals, returns)
-        assert "Q1" in result
-        assert "Q5" in result
-        assert "long_short" in result
-        assert "monotonicity" in result
-
-    def test_quintile_monotonic_for_known_signal(self, known_quintile_signal):
-        signals, returns = known_quintile_signal
-        result = compute_quintile_returns(signals, returns)
-        # With positively correlated signal, Q5 > Q1
-        assert result["long_short"] > 0, (
-            f"Expected positive long_short, got {result['long_short']}"
-        )
-        # Monotonicity should be positive
-        assert result["monotonicity"] > 0.5, (
-            f"Expected high monotonicity, got {result['monotonicity']}"
-        )
-
-    def test_quintile_returns_shape(self, rng):
-        M, T = 20, 30
-        signals = rng.normal(0, 1, (M, T))
-        returns = rng.normal(0, 0.01, (M, T))
-        result = compute_quintile_returns(signals, returns, n_quantiles=5)
-        # Should have Q1..Q5 plus long_short and monotonicity
-        assert len(result) == 7
-
-
-# ---------------------------------------------------------------------------
-# Turnover
-# ---------------------------------------------------------------------------
-
-class TestTurnover:
-    """Test portfolio turnover computation."""
-
-    def test_constant_signal_zero_turnover(self):
-        M, T = 20, 10
-        signals = np.tile(np.arange(M, dtype=np.float64).reshape(M, 1), (1, T))
-        turnover = compute_turnover(signals, top_fraction=0.2)
-        assert turnover == 0.0
-
-    def test_random_signal_positive_turnover(self, rng):
-        M, T = 30, 50
-        signals = rng.normal(0, 1, (M, T))
-        turnover = compute_turnover(signals, top_fraction=0.2)
-        assert 0 <= turnover <= 1.0
-
-
-# ---------------------------------------------------------------------------
-# Comprehensive factor stats
-# ---------------------------------------------------------------------------
-
-class TestFactorStats:
-    """Test the compute_factor_stats wrapper."""
-
-    def test_factor_stats_keys(self, rng):
-        M, T = 30, 40
-        signals = rng.normal(0, 1, (M, T))
-        returns = rng.normal(0, 0.01, (M, T))
-        stats = compute_factor_stats(signals, returns)
-        assert "ic_mean" in stats
-        assert "icir" in stats
-        assert "ic_win_rate" in stats
-        assert "Q1" in stats
-        assert "long_short" in stats
-        assert "turnover" in stats
-        assert "ic_series" in stats
-
-    def test_factor_stats_ic_series_shape(self, rng):
-        M, T = 20, 30
-        signals = rng.normal(0, 1, (M, T))
-        returns = rng.normal(0, 0.01, (M, T))
-        stats = compute_factor_stats(signals, returns)
-        assert stats["ic_series"].shape == (T,)
diff --git a/src/factorminer/factorminer/tests/test_expression_tree.py b/src/factorminer/factorminer/tests/test_expression_tree.py
deleted file mode 100644
index 1462bb7..0000000
--- a/src/factorminer/factorminer/tests/test_expression_tree.py
+++ /dev/null
@@ -1,307 +0,0 @@
-"""Tests for the expression tree and parser modules."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.core.parser import parse, try_parse, tokenize
-from src.factorminer.factorminer.core.expression_tree import (
-    ConstantNode,
-    ExpressionTree,
-    LeafNode,
-    OperatorNode,
-)
-from src.factorminer.factorminer.core.types import OPERATOR_REGISTRY, get_operator
-
-
-# ---------------------------------------------------------------------------
-# Parsing simple formulas
-# ---------------------------------------------------------------------------
-
-class TestParseSimple:
-    """Test parsing of basic single-operator formulas."""
-
-    def test_parse_neg_close(self):
-        tree = parse("Neg($close)")
-        assert tree.to_string() == "Neg($close)"
-
-    def test_parse_add_open_close(self):
-        tree = parse("Add($open, $close)")
-        assert tree.to_string() == "Add($open, $close)"
-
-    def test_parse_leaf_only(self):
-        tree = parse("$close")
-        assert tree.to_string() == "$close"
-        assert tree.depth() == 1
-        assert tree.size() == 1
-
-    def test_parse_constant(self):
-        tree = parse("0.0001")
-        assert tree.depth() == 1
-
-    def test_parse_div_with_two_features(self):
-        tree = parse("Div($high, $low)")
-        assert tree.to_string() == "Div($high, $low)"
-
-    def test_parse_sub(self):
-        tree = parse("Sub($close, $open)")
-        assert tree.to_string() == "Sub($close, $open)"
-
-    def test_parse_operator_with_window(self):
-        tree = parse("Mean($close, 20)")
-        assert tree.to_string() == "Mean($close, 20)"
-
-    def test_parse_ema_with_window(self):
-        tree = parse("EMA($close, 10)")
-        assert tree.to_string() == "EMA($close, 10)"
-
-
-# ---------------------------------------------------------------------------
-# Parsing complex nested formulas from the paper
-# ---------------------------------------------------------------------------
-
-class TestParseComplex:
-    """Test parsing of complex nested formulas (paper factors)."""
-
-    def test_factor_006(self):
-        """Neg(Div(Sub($close, $vwap), $vwap))"""
-        formula = "Neg(Div(Sub($close, $vwap), $vwap))"
-        tree = parse(formula)
-        assert tree.to_string() == formula
-
-    def test_factor_002(self):
-        """Neg(Div(Sub($close, EMA($close, 10)), EMA($close, 18)))"""
-        formula = "Neg(Div(Sub($close, EMA($close, 10)), EMA($close, 18)))"
-        tree = parse(formula)
-        assert tree.to_string() == formula
-
-    def test_factor_046_ifelse(self):
-        """Complex IfElse with Greater, Std, Mean, Neg, CsRank, Delta, Div, Sub, Add."""
-        formula = (
-            "IfElse(Greater(Std($returns, 12), Mean(Std($returns, 12), 48)), "
-            "Neg(CsRank(Delta($close, 3))), "
-            "Neg(CsRank(Div(Sub($close, $low), Add(Sub($high, $low), 0.0001)))))"
-        )
-        tree = parse(formula)
-        roundtrip = tree.to_string()
-        # Parse roundtrip should also succeed
-        tree2 = parse(roundtrip)
-        assert tree2.to_string() == roundtrip
-
-    def test_nested_csrank_corr(self):
-        formula = "CsRank(Corr($close, $volume, 20))"
-        tree = parse(formula)
-        assert tree.to_string() == formula
-
-    def test_deeply_nested(self):
-        formula = "CsRank(Neg(Div(Sub($close, Mean($close, 20)), Std($close, 20))))"
-        tree = parse(formula)
-        assert tree.to_string() == formula
-
-
-# ---------------------------------------------------------------------------
-# Roundtrip: parse -> to_string -> parse
-# ---------------------------------------------------------------------------
-
-class TestRoundtrip:
-    """Test that parse -> to_string -> parse produces identical trees."""
-
-    @pytest.mark.parametrize(
-        "formula",
-        [
-            "Neg($close)",
-            "Add($open, $close)",
-            "Div(Sub($close, $vwap), $vwap)",
-            "Mean($close, 20)",
-            "EMA($close, 10)",
-            "CsRank(Std($returns, 12))",
-            "IfElse(Greater($close, $open), $high, $low)",
-        ],
-    )
-    def test_roundtrip(self, formula):
-        tree1 = parse(formula)
-        s1 = tree1.to_string()
-        tree2 = parse(s1)
-        s2 = tree2.to_string()
-        assert s1 == s2
-
-
-# ---------------------------------------------------------------------------
-# Expression tree evaluation with mock data
-# ---------------------------------------------------------------------------
-
-class TestEvaluate:
-    """Test evaluate on known inputs."""
-
-    def test_neg_evaluate(self, small_data):
-        tree = parse("Neg($close)")
-        result = tree.evaluate(small_data)
-        np.testing.assert_array_almost_equal(result, -small_data["$close"])
-
-    def test_add_evaluate(self, small_data):
-        tree = parse("Add($open, $close)")
-        result = tree.evaluate(small_data)
-        expected = small_data["$open"] + small_data["$close"]
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_sub_evaluate(self, small_data):
-        tree = parse("Sub($close, $open)")
-        result = tree.evaluate(small_data)
-        expected = small_data["$close"] - small_data["$open"]
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_div_evaluate(self, small_data):
-        tree = parse("Div($high, $low)")
-        result = tree.evaluate(small_data)
-        assert result.shape == small_data["$high"].shape
-        # Should be positive since high > low
-        valid = ~np.isnan(result) & (result != 0)
-        assert np.all(result[valid] > 0)
-
-    def test_constant_in_formula(self, small_data):
-        tree = parse("Add($close, 0.0001)")
-        result = tree.evaluate(small_data)
-        # The constant becomes a ConstantNode, which is treated as a trailing
-        # parameter if arity is 2 and it becomes the second child.
-        assert result.shape == small_data["$close"].shape
-
-    def test_nested_evaluate_shape(self, small_data):
-        tree = parse("Neg(Div(Sub($close, $vwap), $vwap))")
-        result = tree.evaluate(small_data)
-        assert result.shape == small_data["$close"].shape
-
-
-# ---------------------------------------------------------------------------
-# Tree depth and size
-# ---------------------------------------------------------------------------
-
-class TestTreeStructure:
-    """Test depth() and size() computations."""
-
-    def test_leaf_depth(self):
-        tree = parse("$close")
-        assert tree.depth() == 1
-
-    def test_leaf_size(self):
-        tree = parse("$close")
-        assert tree.size() == 1
-
-    def test_unary_depth(self):
-        tree = parse("Neg($close)")
-        assert tree.depth() == 2
-
-    def test_unary_size(self):
-        tree = parse("Neg($close)")
-        assert tree.size() == 2
-
-    def test_binary_depth(self):
-        tree = parse("Add($open, $close)")
-        assert tree.depth() == 2
-
-    def test_binary_size(self):
-        tree = parse("Add($open, $close)")
-        assert tree.size() == 3  # Add + $open + $close
-
-    def test_nested_depth(self):
-        tree = parse("Neg(Div(Sub($close, $vwap), $vwap))")
-        assert tree.depth() == 4  # Neg -> Div -> Sub -> $close
-
-    def test_nested_size(self):
-        tree = parse("Neg(Div(Sub($close, $vwap), $vwap))")
-        assert tree.size() == 6  # Neg, Div, Sub, $close, $vwap, $vwap
-
-    def test_leaf_features(self):
-        tree = parse("Neg(Div(Sub($close, $vwap), $vwap))")
-        feats = tree.leaf_features()
-        assert feats == ["$close", "$vwap"]
-
-    def test_clone_preserves_structure(self):
-        tree = parse("Add($open, $close)")
-        cloned = tree.clone()
-        assert cloned.to_string() == tree.to_string()
-        assert cloned.depth() == tree.depth()
-        assert cloned.size() == tree.size()
-
-
-# ---------------------------------------------------------------------------
-# Error handling
-# ---------------------------------------------------------------------------
-
-class TestErrorHandling:
-    """Test that invalid inputs raise appropriate errors."""
-
-    def test_unknown_operator(self):
-        with pytest.raises(SyntaxError, match="Unknown operator"):
-            parse("FooBar($close)")
-
-    def test_unknown_feature(self):
-        with pytest.raises(SyntaxError, match="Unknown feature"):
-            parse("Neg($foobar)")
-
-    def test_wrong_arity_too_few(self):
-        with pytest.raises(SyntaxError, match="expects"):
-            parse("Add($close)")
-
-    def test_wrong_arity_too_many_nodes(self):
-        # Neg expects 1 expression arg; passing 2 should fail
-        with pytest.raises(SyntaxError):
-            parse("Neg($close, $open)")
-
-    def test_empty_string(self):
-        with pytest.raises((SyntaxError, IndexError)):
-            parse("")
-
-    def test_unmatched_paren(self):
-        with pytest.raises(SyntaxError):
-            parse("Neg($close")
-
-    def test_trailing_content(self):
-        with pytest.raises(SyntaxError, match="Unexpected trailing"):
-            parse("Neg($close) extra")
-
-    def test_try_parse_returns_none_on_failure(self):
-        assert try_parse("InvalidOp($close)") is None
-        assert try_parse("") is None
-
-    def test_try_parse_returns_tree_on_success(self):
-        result = try_parse("Neg($close)")
-        assert result is not None
-        assert result.to_string() == "Neg($close)"
-
-    def test_missing_feature_in_data(self, small_data):
-        tree = parse("Neg($close)")
-        data_missing = {k: v for k, v in small_data.items() if k != "$close"}
-        with pytest.raises(KeyError, match="\\$close"):
-            tree.evaluate(data_missing)
-
-
-# ---------------------------------------------------------------------------
-# Tokenizer
-# ---------------------------------------------------------------------------
-
-class TestTokenizer:
-    """Test the tokenizer separately."""
-
-    def test_simple_tokens(self):
-        tokens = tokenize("Neg($close)")
-        types = [t.type.name for t in tokens]
-        assert types == ["IDENT", "LPAREN", "FEATURE", "RPAREN", "EOF"]
-
-    def test_number_token(self):
-        tokens = tokenize("0.0001")
-        assert tokens[0].type.name == "NUMBER"
-        assert tokens[0].value == "0.0001"
-
-    def test_negative_number_token(self):
-        tokens = tokenize("Mean($close, -3)")
-        # -3 should be a number token after comma
-        num_tokens = [t for t in tokens if t.type.name == "NUMBER"]
-        assert len(num_tokens) == 1
-        assert num_tokens[0].value == "-3"
-
-    def test_whitespace_handling(self):
-        tokens = tokenize("  Add(  $open ,  $close  ) ")
-        ident_tokens = [t for t in tokens if t.type.name == "IDENT"]
-        assert len(ident_tokens) == 1
-        assert ident_tokens[0].value == "Add"
diff --git a/src/factorminer/factorminer/tests/test_helix_loop.py b/src/factorminer/factorminer/tests/test_helix_loop.py
deleted file mode 100644
index 54ffe10..0000000
--- a/src/factorminer/factorminer/tests/test_helix_loop.py
+++ /dev/null
@@ -1,251 +0,0 @@
-"""Tests for the Helix Loop (core/helix_loop.py)."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-try:
-    from factorminer.core.helix_loop import HelixLoop
-    HAS_HELIX = True
-except ImportError:
-    HAS_HELIX = False
-
-from src.factorminer.factorminer.agent.llm_interface import MockProvider
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.core.config import MiningConfig
-from src.factorminer.factorminer.core.ralph_loop import EvaluationResult
-
-
-pytestmark = pytest.mark.skipif(not HAS_HELIX, reason="helix_loop not yet built")
-
-
-@pytest.fixture
-def rng():
-    return np.random.default_rng(42)
-
-
-@pytest.fixture
-def small_tensor(rng):
-    """Small data tensor and returns for HelixLoop tests."""
-    M, T, F = 10, 50, 3
-    data = rng.normal(0, 1, (M, T, F))
-    close = 100.0 + np.cumsum(rng.normal(0, 0.5, (M, T)), axis=1)
-    returns = np.zeros((M, T))
-    returns[:, 1:] = np.diff(close, axis=1) / close[:, :-1]
-    return data, returns
-
-
-# -----------------------------------------------------------------------
-# HelixLoop can be instantiated with all defaults
-# -----------------------------------------------------------------------
-
-def test_helix_loop_instantiates_with_defaults(small_tensor):
-    """HelixLoop with all features off should be instantiable."""
-    data, returns = small_tensor
-    config = MiningConfig(target_library_size=5, max_iterations=1)
-    provider = MockProvider()
-
-    loop = HelixLoop(
-        config=config,
-        data_tensor=data,
-        returns=returns,
-        llm_provider=provider,
-        canonicalize=False,
-        enable_knowledge_graph=False,
-        enable_auto_inventor=False,
-    )
-    assert loop is not None
-
-
-# -----------------------------------------------------------------------
-# HelixLoop with canonicalize=True
-# -----------------------------------------------------------------------
-
-def test_helix_loop_canonicalize_flag(small_tensor):
-    """HelixLoop with canonicalize=True should initialize the canonicalizer."""
-    data, returns = small_tensor
-    config = MiningConfig(target_library_size=5, max_iterations=1)
-    provider = MockProvider()
-
-    loop = HelixLoop(
-        config=config,
-        data_tensor=data,
-        returns=returns,
-        llm_provider=provider,
-        canonicalize=True,
-    )
-    assert loop._canonicalize is True
-
-
-# -----------------------------------------------------------------------
-# HelixLoop with MockProvider runs 1 iteration
-# -----------------------------------------------------------------------
-
-def test_helix_loop_runs_one_iteration(small_tensor):
-    """HelixLoop should complete 1 iteration without error using MockProvider."""
-    data, returns = small_tensor
-    config = MiningConfig(
-        target_library_size=3,
-        max_iterations=1,
-        batch_size=5,
-    )
-    provider = MockProvider()
-
-    loop = HelixLoop(
-        config=config,
-        data_tensor=data,
-        returns=returns,
-        llm_provider=provider,
-        canonicalize=False,
-        enable_knowledge_graph=False,
-        enable_auto_inventor=False,
-    )
-    # Run the loop -- should not raise
-    loop.run()
-    assert loop.library is not None
-
-
-def test_phase2_revocation_updates_stats_and_library_state(small_tensor):
-    """Post-admission revocation should keep stats aligned with library state."""
-    data, returns = small_tensor
-    config = MiningConfig(
-        target_library_size=3,
-        max_iterations=1,
-        batch_size=5,
-        ic_threshold=0.0001,
-        correlation_threshold=0.95,
-    )
-    provider = MockProvider()
-
-    loop = HelixLoop(
-        config=config,
-        data_tensor=data,
-        returns=returns,
-        llm_provider=provider,
-        canonicalize=False,
-        enable_knowledge_graph=False,
-        enable_auto_inventor=False,
-    )
-
-    original_validate = loop._helix_validate
-
-    def force_one_revocation(results, admitted_results):
-        rejected = original_validate(results, admitted_results)
-        for admitted in admitted_results:
-            if admitted.admitted:
-                loop._revoke_admission(admitted, results, "forced test revocation")
-                return rejected + 1
-        return rejected
-
-    loop._helix_validate = force_one_revocation
-
-    stats = loop._run_iteration(batch_size=5)
-
-    assert stats["admitted"] == loop.library.size
-    if loop.library.correlation_matrix is not None:
-        assert loop.library.correlation_matrix.shape[0] == loop.library.size
-
-
-def test_revoke_admission_rebuilds_library_indices(small_tensor):
-    """Revoking a factor should rebuild the library correlation bookkeeping."""
-    data, returns = small_tensor
-    config = MiningConfig(target_library_size=5, max_iterations=1)
-    provider = MockProvider()
-
-    loop = HelixLoop(
-        config=config,
-        data_tensor=data,
-        returns=returns,
-        llm_provider=provider,
-        canonicalize=False,
-        enable_knowledge_graph=False,
-        enable_auto_inventor=False,
-    )
-
-    factor_a = Factor(
-        id=0,
-        name="factor_a",
-        formula="Mean($close, 5)",
-        category="test",
-        ic_mean=0.1,
-        icir=1.0,
-        ic_win_rate=0.6,
-        max_correlation=0.0,
-        batch_number=1,
-        signals=np.ones_like(returns),
-    )
-    factor_b = Factor(
-        id=0,
-        name="factor_b",
-        formula="Std($close, 5)",
-        category="test",
-        ic_mean=0.08,
-        icir=0.9,
-        ic_win_rate=0.55,
-        max_correlation=0.1,
-        batch_number=1,
-        signals=np.full_like(returns, 2.0),
-    )
-
-    loop.library.admit_factor(factor_a)
-    loop.library.admit_factor(factor_b)
-
-    result = EvaluationResult(
-        factor_name="factor_a",
-        formula="Mean($close, 5)",
-        admitted=True,
-    )
-    loop._revoke_admission(result, [], "forced test revocation")
-
-    assert loop.library.size == 1
-    assert list(loop.library.factors.values())[0].name == "factor_b"
-    assert loop.library._id_to_index == {list(loop.library.factors.keys())[0]: 0}
-    assert loop.library.correlation_matrix is not None
-    assert loop.library.correlation_matrix.shape == (1, 1)
-
-
-def test_helix_embedding_screen_filters_library_duplicates(small_tensor):
-    """Embedding-aware synthesis should drop near-duplicates of admitted factors."""
-    data, returns = small_tensor
-    config = MiningConfig(target_library_size=5, max_iterations=1)
-    provider = MockProvider()
-
-    library = FactorLibrary(correlation_threshold=0.95, ic_threshold=0.0001)
-    library.admit_factor(
-        Factor(
-            id=0,
-            name="existing_factor",
-            formula="Mean($close, 5)",
-            category="test",
-            ic_mean=0.1,
-            icir=1.0,
-            ic_win_rate=0.6,
-            max_correlation=0.0,
-            batch_number=0,
-            signals=np.ones_like(returns),
-        )
-    )
-
-    loop = HelixLoop(
-        config=config,
-        data_tensor=data,
-        returns=returns,
-        llm_provider=provider,
-        library=library,
-        canonicalize=False,
-        enable_embeddings=True,
-        enable_knowledge_graph=False,
-        enable_auto_inventor=False,
-    )
-
-    deduped, canon_dupes, semantic_dupes = loop._canonicalize_and_dedup(
-        [
-            ("dup_factor", "Mean($close, 5)"),
-            ("novel_factor", "Std($close, 5)"),
-        ]
-    )
-
-    assert canon_dupes == 0
-    assert semantic_dupes == 1
-    assert deduped == [("novel_factor", "Std($close, 5)")]
diff --git a/src/factorminer/factorminer/tests/test_knowledge_graph.py b/src/factorminer/factorminer/tests/test_knowledge_graph.py
deleted file mode 100644
index adce2db..0000000
--- a/src/factorminer/factorminer/tests/test_knowledge_graph.py
+++ /dev/null
@@ -1,166 +0,0 @@
-"""Tests for the factor knowledge graph (memory/knowledge_graph.py)."""
-
-from __future__ import annotations
-
-import pytest
-
-from src.factorminer.factorminer.memory.knowledge_graph import (
-    EdgeType,
-    FactorKnowledgeGraph,
-    FactorNode,
-)
-
-
-# -----------------------------------------------------------------------
-# Basic node and edge operations
-# -----------------------------------------------------------------------
-
-def test_add_factor():
-    kg = FactorKnowledgeGraph()
-    node = FactorNode(
-        factor_id="f1",
-        formula="CsRank($close)",
-        ic_mean=0.05,
-        operators=["CsRank"],
-        features=["$close"],
-        admitted=True,
-    )
-    kg.add_factor(node)
-    assert kg.get_factor_count() == 1
-    # Operator node should also be created
-    assert kg.get_edge_count() >= 1
-    assert kg.get_factor_node("f1") is not None
-
-
-def test_list_factor_nodes_filters_admitted():
-    kg = FactorKnowledgeGraph()
-    kg.add_factor(FactorNode(
-        factor_id="f1", formula="CsRank($close)", operators=["CsRank"], admitted=True,
-    ))
-    kg.add_factor(FactorNode(
-        factor_id="f2", formula="Neg($volume)", operators=["Neg"], admitted=False,
-    ))
-
-    admitted_ids = [node.factor_id for node in kg.list_factor_nodes(admitted_only=True)]
-    all_ids = [node.factor_id for node in kg.list_factor_nodes()]
-
-    assert admitted_ids == ["f1"]
-    assert set(all_ids) == {"f1", "f2"}
-
-
-def test_add_correlation_edge():
-    kg = FactorKnowledgeGraph()
-    kg.add_factor(FactorNode(factor_id="f1", formula="A", operators=["CsRank"]))
-    kg.add_factor(FactorNode(factor_id="f2", formula="B", operators=["Neg"]))
-
-    # Below threshold -> no edge
-    kg.add_correlation_edge("f1", "f2", rho=0.3, threshold=0.4)
-    initial_edges = kg.get_edge_count()
-
-    # Above threshold -> edge added (bidirectional = 2 edges)
-    kg.add_correlation_edge("f1", "f2", rho=0.6, threshold=0.4)
-    assert kg.get_edge_count() >= initial_edges + 2
-
-
-# -----------------------------------------------------------------------
-# find_saturated_regions
-# -----------------------------------------------------------------------
-
-def test_find_saturated_regions():
-    kg = FactorKnowledgeGraph()
-    for i in range(3):
-        kg.add_factor(FactorNode(
-            factor_id=f"f{i}", formula=f"Op{i}($close)", operators=[f"Op{i}"],
-        ))
-    # High correlation between f0 and f1
-    kg.add_correlation_edge("f0", "f1", rho=0.8, threshold=0.4)
-    # Low correlation with f2
-    kg.add_correlation_edge("f0", "f2", rho=0.2, threshold=0.4)
-
-    regions = kg.find_saturated_regions(threshold=0.5)
-    assert len(regions) >= 1
-    # f0 and f1 should be in the same cluster
-    found = any({"f0", "f1"}.issubset(r) for r in regions)
-    assert found
-
-
-# -----------------------------------------------------------------------
-# find_complementary_patterns
-# -----------------------------------------------------------------------
-
-def test_find_complementary_patterns():
-    kg = FactorKnowledgeGraph()
-    kg.add_factor(FactorNode(
-        factor_id="f1", formula="CsRank($close)", operators=["CsRank"],
-    ))
-    kg.add_factor(FactorNode(
-        factor_id="f2", formula="Neg($volume)", operators=["Neg"],
-    ))
-    # Connect them via a shared operator node (indirectly)
-    # f1 uses CsRank, f2 uses Neg -- different operators
-    # Add a derivation edge so they are reachable
-    kg.add_derivation_edge("f2", "f1", mutation_type="test")
-
-    complementary = kg.find_complementary_patterns("f1", max_hops=2)
-    # f2 uses a different operator set and is not correlated -> complementary
-    assert "f2" in complementary
-
-
-# -----------------------------------------------------------------------
-# Serialization roundtrip
-# -----------------------------------------------------------------------
-
-def test_save_load_roundtrip():
-    kg = FactorKnowledgeGraph()
-    kg.add_factor(FactorNode(
-        factor_id="f1", formula="CsRank($close)",
-        ic_mean=0.05, operators=["CsRank"], admitted=True,
-    ))
-    kg.add_factor(FactorNode(
-        factor_id="f2", formula="Neg($volume)",
-        operators=["Neg"], admitted=True,
-    ))
-    kg.add_correlation_edge("f1", "f2", rho=0.5)
-
-    data = kg.to_dict()
-    kg2 = FactorKnowledgeGraph.from_dict(data)
-    assert kg2.get_factor_count() == 2
-    assert kg2.get_edge_count() == kg.get_edge_count()
-
-
-def test_remove_factor_prunes_graph_state():
-    kg = FactorKnowledgeGraph()
-    kg.add_factor(FactorNode(
-        factor_id="f1",
-        formula="CsRank(Neg($close))",
-        operators=["CsRank", "Neg"],
-        features=["$close"],
-        admitted=True,
-    ))
-
-    assert kg.remove_factor("f1") is True
-    assert kg.get_factor_count() == 0
-    assert kg.get_factor_node("f1") is None
-    assert kg.get_edge_count() == 0
-    assert kg.remove_factor("f1") is False
-
-
-# -----------------------------------------------------------------------
-# get_operator_cooccurrence
-# -----------------------------------------------------------------------
-
-def test_get_operator_cooccurrence():
-    kg = FactorKnowledgeGraph()
-    kg.add_factor(FactorNode(
-        factor_id="f1", formula="CsRank(Neg($close))",
-        operators=["CsRank", "Neg"], admitted=True,
-    ))
-    kg.add_factor(FactorNode(
-        factor_id="f2", formula="CsRank(Mean($close, 10))",
-        operators=["CsRank", "Mean"], admitted=True,
-    ))
-
-    cooc = kg.get_operator_cooccurrence()
-    assert ("CsRank", "Neg") in cooc
-    assert ("CsRank", "Mean") in cooc
-    assert cooc[("CsRank", "Neg")] == 1
diff --git a/src/factorminer/factorminer/tests/test_library.py b/src/factorminer/factorminer/tests/test_library.py
deleted file mode 100644
index bc9eab9..0000000
--- a/src/factorminer/factorminer/tests/test_library.py
+++ /dev/null
@@ -1,356 +0,0 @@
-"""Tests for the factor library management system."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-@pytest.fixture
-def rng():
-    return np.random.default_rng(42)
-
-
-@pytest.fixture
-def empty_library():
-    return FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-
-
-def _make_factor(
-    name="test",
-    formula="Neg($close)",
-    ic=0.06,
-    signals=None,
-    rng=None,
-    M=20,
-    T=60,
-):
-    """Helper to create a Factor with random signals."""
-    if signals is None and rng is not None:
-        signals = rng.normal(0, 1, (M, T))
-    return Factor(
-        id=0,
-        name=name,
-        formula=formula,
-        category="test",
-        ic_mean=ic,
-        icir=1.0,
-        ic_win_rate=0.6,
-        max_correlation=0.0,
-        batch_number=1,
-        signals=signals,
-    )
-
-
-# ---------------------------------------------------------------------------
-# Admission
-# ---------------------------------------------------------------------------
-
-class TestAdmission:
-    """Test factor admission rules."""
-
-    def test_admit_first_factor(self, empty_library, rng):
-        factor = _make_factor(name="f1", ic=0.05, rng=rng)
-        fid = empty_library.admit_factor(factor)
-        assert fid == 1
-        assert empty_library.size == 1
-        assert factor.id == 1
-
-    def test_admit_assigns_incremental_ids(self, empty_library, rng):
-        f1 = _make_factor(name="f1", rng=rng)
-        f2 = _make_factor(name="f2", rng=rng)
-        id1 = empty_library.admit_factor(f1)
-        id2 = empty_library.admit_factor(f2)
-        assert id1 == 1
-        assert id2 == 2
-
-    def test_check_admission_ic_below_threshold(self, empty_library, rng):
-        signals = rng.normal(0, 1, (20, 60))
-        admitted, reason = empty_library.check_admission(0.03, signals)
-        assert not admitted
-        assert "below threshold" in reason
-
-    def test_check_admission_first_factor(self, empty_library, rng):
-        signals = rng.normal(0, 1, (20, 60))
-        admitted, reason = empty_library.check_admission(0.05, signals)
-        assert admitted
-        assert "First factor" in reason
-
-    def test_check_admission_rejects_high_correlation(self, rng):
-        lib = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-        # Add a factor
-        f1 = _make_factor(name="f1", rng=rng)
-        lib.admit_factor(f1)
-
-        # Try to admit same signals (correlation = 1.0)
-        admitted, reason = lib.check_admission(0.05, f1.signals)
-        assert not admitted
-        assert "correlation" in reason.lower()
-
-    def test_check_admission_accepts_low_correlation(self, rng):
-        lib = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-        f1 = _make_factor(name="f1", rng=rng)
-        lib.admit_factor(f1)
-
-        # Independent signals
-        independent_signals = rng.normal(0, 1, (20, 60))
-        admitted, reason = lib.check_admission(0.05, independent_signals)
-        assert admitted
-
-    def test_factor_in_library_after_admission(self, empty_library, rng):
-        factor = _make_factor(name="f1", rng=rng)
-        fid = empty_library.admit_factor(factor)
-        assert fid in empty_library.factors
-        retrieved = empty_library.get_factor(fid)
-        assert retrieved.name == "f1"
-
-
-# ---------------------------------------------------------------------------
-# Replacement
-# ---------------------------------------------------------------------------
-
-class TestReplacement:
-    """Test the replacement mechanism (Eq. 11)."""
-
-    def test_replacement_ic_below_floor(self, rng):
-        lib = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-        f1 = _make_factor(name="f1", ic=0.06, rng=rng)
-        lib.admit_factor(f1)
-
-        signals = rng.normal(0, 1, (20, 60))
-        should, fid, reason = lib.check_replacement(0.05, signals)  # Below 0.10
-        assert not should
-        assert "below replacement floor" in reason
-
-    def test_replacement_needs_exactly_one_correlated(self, rng):
-        lib = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-        f1 = _make_factor(name="f1", ic=0.06, rng=rng)
-        lib.admit_factor(f1)
-
-        # Independent signals -> 0 correlated factors
-        signals = rng.normal(0, 1, (20, 60))
-        should, fid, reason = lib.check_replacement(0.15, signals)
-        assert not should
-        assert "0 correlated" in reason
-
-    def test_replacement_success(self, rng):
-        lib = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-        f1_signals = rng.normal(0, 1, (20, 60))
-        f1 = _make_factor(name="f1", ic=0.06, signals=f1_signals)
-        lib.admit_factor(f1)
-
-        # Candidate highly correlated with f1 but much better IC
-        candidate_signals = f1_signals + rng.normal(0, 0.1, (20, 60))
-        should, old_id, reason = lib.check_replacement(
-            0.15, candidate_signals, ic_min=0.10, ic_ratio=1.3
-        )
-        assert should
-        assert old_id == 1
-
-    def test_replace_factor(self, rng):
-        lib = FactorLibrary(correlation_threshold=0.5, ic_threshold=0.04)
-        f1 = _make_factor(name="old_factor", ic=0.06, rng=rng)
-        fid = lib.admit_factor(f1)
-
-        new_factor = _make_factor(name="new_factor", ic=0.15, rng=rng)
-        lib.replace_factor(fid, new_factor)
-
-        assert fid not in lib.factors
-        assert lib.size == 1
-        remaining = lib.list_factors()
-        assert remaining[0].name == "new_factor"
-
-    def test_replace_nonexistent_raises(self, empty_library, rng):
-        new_factor = _make_factor(name="new", rng=rng)
-        with pytest.raises(KeyError):
-            empty_library.replace_factor(999, new_factor)
-
-
-# ---------------------------------------------------------------------------
-# Correlation matrix
-# ---------------------------------------------------------------------------
-
-class TestCorrelationMatrix:
-    """Test correlation matrix management."""
-
-    def test_matrix_initialized_on_first_admit(self, empty_library, rng):
-        f = _make_factor(name="f1", rng=rng)
-        empty_library.admit_factor(f)
-        assert empty_library.correlation_matrix is not None
-        assert empty_library.correlation_matrix.shape == (1, 1)
-
-    def test_matrix_grows_with_admissions(self, empty_library, rng):
-        for i in range(3):
-            f = _make_factor(name=f"f{i}", rng=rng)
-            empty_library.admit_factor(f)
-        assert empty_library.correlation_matrix.shape == (3, 3)
-
-    def test_matrix_symmetric(self, rng):
-        lib = FactorLibrary()
-        for i in range(4):
-            f = _make_factor(name=f"f{i}", rng=rng)
-            lib.admit_factor(f)
-        mat = lib.correlation_matrix
-        np.testing.assert_array_almost_equal(mat, mat.T)
-
-    def test_update_correlation_matrix_full(self, rng):
-        lib = FactorLibrary()
-        for i in range(3):
-            f = _make_factor(name=f"f{i}", rng=rng)
-            lib.admit_factor(f)
-        # Full recompute
-        lib.update_correlation_matrix()
-        assert lib.correlation_matrix.shape == (3, 3)
-        np.testing.assert_array_almost_equal(
-            lib.correlation_matrix, lib.correlation_matrix.T
-        )
-
-    def test_compute_correlation_same_signals(self, rng):
-        lib = FactorLibrary()
-        signals = rng.normal(0, 1, (20, 60))
-        corr = lib.compute_correlation(signals, signals)
-        assert corr > 0.95
-
-
-# ---------------------------------------------------------------------------
-# Queries and diagnostics
-# ---------------------------------------------------------------------------
-
-class TestQueries:
-    """Test library query methods."""
-
-    def test_size_property(self, mock_library):
-        assert mock_library.size == 3
-
-    def test_list_factors(self, mock_library):
-        factors = mock_library.list_factors()
-        assert len(factors) == 3
-        # Should be sorted by ID
-        ids = [f.id for f in factors]
-        assert ids == sorted(ids)
-
-    def test_get_factor(self, mock_library):
-        factors = mock_library.list_factors()
-        fid = factors[0].id
-        f = mock_library.get_factor(fid)
-        assert f.id == fid
-
-    def test_get_factor_nonexistent_raises(self, mock_library):
-        with pytest.raises(KeyError):
-            mock_library.get_factor(9999)
-
-    def test_get_factors_by_category(self, mock_library):
-        result = mock_library.get_factors_by_category("test")
-        assert len(result) == 3
-
-    def test_get_factors_by_nonexistent_category(self, mock_library):
-        result = mock_library.get_factors_by_category("nonexistent")
-        assert len(result) == 0
-
-    def test_get_diagnostics(self, mock_library):
-        diag = mock_library.get_diagnostics()
-        assert "size" in diag
-        assert diag["size"] == 3
-        assert "avg_correlation" in diag
-        assert "max_correlation" in diag
-        assert "category_counts" in diag
-        assert "saturation" in diag
-
-    def test_get_state_summary(self, mock_library):
-        summary = mock_library.get_state_summary()
-        assert "library_size" in summary
-        assert summary["library_size"] == 3
-        assert "categories" in summary
-        assert "recent_admissions" in summary
-
-
-# ---------------------------------------------------------------------------
-# Factor serialization
-# ---------------------------------------------------------------------------
-
-class TestFactorSerialization:
-    """Test Factor to_dict / from_dict."""
-
-    def test_factor_to_dict(self):
-        f = Factor(
-            id=1,
-            name="test",
-            formula="Neg($close)",
-            category="momentum",
-            ic_mean=0.06,
-            icir=1.0,
-            ic_win_rate=0.6,
-            max_correlation=0.1,
-            batch_number=1,
-            admission_date="2024-01-01 00:00:00",
-        )
-        d = f.to_dict()
-        assert d["id"] == 1
-        assert d["name"] == "test"
-        assert d["formula"] == "Neg($close)"
-        assert "signals" not in d
-
-    def test_factor_from_dict(self):
-        d = {
-            "id": 2,
-            "name": "restored",
-            "formula": "Add($open, $close)",
-            "category": "arithmetic",
-            "ic_mean": 0.08,
-            "icir": 1.2,
-            "ic_win_rate": 0.65,
-            "max_correlation": 0.2,
-            "batch_number": 3,
-            "admission_date": "2024-06-15 12:00:00",
-        }
-        f = Factor.from_dict(d)
-        assert f.id == 2
-        assert f.name == "restored"
-        assert f.formula == "Add($open, $close)"
-
-    def test_factor_roundtrip(self):
-        f = Factor(
-            id=5,
-            name="roundtrip",
-            formula="CsRank($close)",
-            category="cross_sectional",
-            ic_mean=0.07,
-            icir=0.9,
-            ic_win_rate=0.58,
-            max_correlation=0.15,
-            batch_number=2,
-        )
-        restored = Factor.from_dict(f.to_dict())
-        assert restored.name == f.name
-        assert restored.ic_mean == f.ic_mean
-        assert restored.formula == f.formula
-
-    def test_factor_roundtrip_preserves_provenance(self):
-        f = Factor(
-            id=6,
-            name="with_provenance",
-            formula="Neg($close)",
-            category="test",
-            ic_mean=0.05,
-            icir=0.9,
-            ic_win_rate=0.55,
-            max_correlation=0.1,
-            batch_number=2,
-            provenance={
-                "run_id": "run_001",
-                "generator_family": "MockProvider",
-                "candidate_rank": 2,
-            },
-        )
-
-        restored = Factor.from_dict(f.to_dict())
-
-        assert restored.provenance["run_id"] == "run_001"
-        assert restored.provenance["generator_family"] == "MockProvider"
-        assert restored.provenance["candidate_rank"] == 2
diff --git a/src/factorminer/factorminer/tests/test_memory.py b/src/factorminer/factorminer/tests/test_memory.py
deleted file mode 100644
index 6e75676..0000000
--- a/src/factorminer/factorminer/tests/test_memory.py
+++ /dev/null
@@ -1,405 +0,0 @@
-"""Tests for the experience memory system."""
-
-from __future__ import annotations
-
-import json
-import tempfile
-from pathlib import Path
-
-import pytest
-
-from src.factorminer.factorminer.memory.experience_memory import ExperienceMemoryManager
-from src.factorminer.factorminer.memory.embeddings import FormulaEmbedder
-from src.factorminer.factorminer.memory.kg_retrieval import retrieve_memory_enhanced
-from src.factorminer.factorminer.memory.knowledge_graph import FactorKnowledgeGraph, FactorNode
-from src.factorminer.factorminer.memory.memory_store import (
-    ExperienceMemory,
-    ForbiddenDirection,
-    MiningState,
-    StrategicInsight,
-    SuccessPattern,
-)
-from src.factorminer.factorminer.memory.formation import form_memory
-from src.factorminer.factorminer.memory.evolution import evolve_memory
-from src.factorminer.factorminer.memory.retrieval import retrieve_memory
-
-
-# ---------------------------------------------------------------------------
-# Initialization
-# ---------------------------------------------------------------------------
-
-class TestInitialization:
-    """Test memory manager initialization with default patterns."""
-
-    def test_default_success_patterns_loaded(self, mock_memory):
-        assert len(mock_memory.memory.success_patterns) > 0
-
-    def test_default_forbidden_directions_loaded(self, mock_memory):
-        assert len(mock_memory.memory.forbidden_directions) > 0
-
-    def test_default_insights_loaded(self, mock_memory):
-        assert len(mock_memory.memory.insights) > 0
-
-    def test_initial_version_is_zero(self, mock_memory):
-        assert mock_memory.version == 0
-
-    def test_default_pattern_names(self, mock_memory):
-        names = [p.name for p in mock_memory.memory.success_patterns]
-        assert "Higher Moment Regimes" in names
-        assert "PV Corr Interaction" in names
-
-    def test_default_forbidden_names(self, mock_memory):
-        names = [f.name for f in mock_memory.memory.forbidden_directions]
-        assert "Standardized Returns/Amount" in names
-        assert "VWAP Deviation variants" in names
-
-    def test_memory_reset(self, mock_memory):
-        # Modify state
-        mock_memory.memory.version = 99
-        mock_memory.reset()
-        assert mock_memory.version == 0
-        assert len(mock_memory.memory.success_patterns) > 0
-
-
-# ---------------------------------------------------------------------------
-# Formation
-# ---------------------------------------------------------------------------
-
-class TestFormation:
-    """Test memory formation from trajectory."""
-
-    def test_form_memory_creates_new_memory(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        assert isinstance(formed, ExperienceMemory)
-
-    def test_form_memory_updates_state(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        # Should count 2 admitted factors
-        assert formed.state.library_size == mock_memory.memory.state.library_size + 2
-
-    def test_form_memory_recent_admissions(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        assert len(formed.state.recent_admissions) == 2
-
-    def test_form_memory_recent_rejections(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        assert len(formed.state.recent_rejections) == 2
-
-    def test_form_memory_admission_log(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        assert len(formed.state.admission_log) >= 1
-        last_log = formed.state.admission_log[-1]
-        assert last_log["batch"] == 1
-        assert last_log["admitted"] == 2
-        assert last_log["rejected"] == 2
-
-    def test_form_memory_empty_trajectory(self, mock_memory):
-        formed = form_memory(mock_memory.memory, [], batch_number=1)
-        assert formed.state.library_size == mock_memory.memory.state.library_size
-
-    def test_form_memory_extracts_success_patterns(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        # Should have some patterns (at least the defaults)
-        assert len(formed.success_patterns) >= len(mock_memory.memory.success_patterns)
-
-
-# ---------------------------------------------------------------------------
-# Evolution
-# ---------------------------------------------------------------------------
-
-class TestEvolution:
-    """Test memory evolution (merge + consolidate)."""
-
-    def test_evolve_increments_version(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        evolved = evolve_memory(mock_memory.memory, formed)
-        assert evolved.version == mock_memory.memory.version + 1
-
-    def test_evolve_merges_success_patterns(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        evolved = evolve_memory(mock_memory.memory, formed)
-        # Should have at least as many patterns as before
-        assert len(evolved.success_patterns) >= len(mock_memory.memory.success_patterns)
-
-    def test_evolve_merges_forbidden_directions(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        evolved = evolve_memory(mock_memory.memory, formed)
-        assert len(evolved.forbidden_directions) >= len(mock_memory.memory.forbidden_directions)
-
-    def test_evolve_caps_memory_size(self, mock_memory, sample_trajectory):
-        formed = form_memory(mock_memory.memory, sample_trajectory, batch_number=1)
-        evolved = evolve_memory(
-            mock_memory.memory, formed,
-            max_success_patterns=5,
-            max_failure_patterns=5,
-            max_insights=5,
-        )
-        assert len(evolved.success_patterns) <= 5
-        assert len(evolved.forbidden_directions) <= 5
-        assert len(evolved.insights) <= 5
-
-
-# ---------------------------------------------------------------------------
-# Retrieval
-# ---------------------------------------------------------------------------
-
-class TestRetrieval:
-    """Test context-dependent memory retrieval."""
-
-    def test_retrieve_returns_dict(self, mock_memory):
-        result = mock_memory.retrieve()
-        assert isinstance(result, dict)
-
-    def test_retrieve_has_required_keys(self, mock_memory):
-        result = mock_memory.retrieve()
-        assert "recommended_directions" in result
-        assert "forbidden_directions" in result
-        assert "insights" in result
-        assert "library_state" in result
-        assert "prompt_text" in result
-
-    def test_retrieve_prompt_text_is_string(self, mock_memory):
-        result = mock_memory.retrieve()
-        assert isinstance(result["prompt_text"], str)
-        assert len(result["prompt_text"]) > 0
-
-    def test_retrieve_with_library_state(self, mock_memory):
-        lib_state = {
-            "library_size": 50,
-            "domain_saturation": {"Momentum": 0.8, "VWAP": 0.3},
-        }
-        result = mock_memory.retrieve(library_state=lib_state)
-        assert result["library_state"]["library_size"] == 50
-
-    def test_retrieve_respects_max_limits(self, mock_memory):
-        result = mock_memory.retrieve(max_success=2, max_forbidden=2, max_insights=1)
-        assert len(result["recommended_directions"]) <= 2
-        assert len(result["forbidden_directions"]) <= 2
-        assert len(result["insights"]) <= 1
-
-    def test_retrieve_deprioritizes_saturated_patterns(self, mock_memory):
-        # Set high domain saturation
-        mock_memory.memory.state.domain_saturation = {
-            "Higher Moment Regimes": 0.9,
-            "PV Corr Interaction": 0.9,
-        }
-        result = mock_memory.retrieve(max_success=3)
-        # Saturated patterns should be scored lower
-        names = [p["name"] for p in result["recommended_directions"]]
-        # There should still be patterns, but saturated ones ranked lower
-        assert len(names) > 0
-
-    def test_enhanced_retrieval_uses_semantic_similarity_and_removals(self):
-        memory = ExperienceMemory()
-        memory.state.recent_admissions = [
-            {
-                "factor_id": "query_factor",
-                "formula": "CsRank(Corr($close, $volume, 20))",
-            }
-        ]
-
-        kg = FactorKnowledgeGraph()
-        kg.add_factor(FactorNode(
-            factor_id="neighbor_factor",
-            formula="CsRank(Corr($close, $volume, 20))",
-            operators=["CsRank", "Corr"],
-            features=["$close", "$volume"],
-            admitted=True,
-        ))
-        kg.add_factor(FactorNode(
-            factor_id="distant_factor",
-            formula="Neg(Std($returns, 10))",
-            operators=["Neg", "Std"],
-            features=["$returns"],
-            admitted=True,
-        ))
-
-        embedder = FormulaEmbedder(use_faiss=False)
-
-        result = retrieve_memory_enhanced(
-            memory,
-            kg=kg,
-            embedder=embedder,
-        )
-
-        assert result["semantic_neighbors"]
-        assert any("neighbor_factor" in item for item in result["semantic_neighbors"])
-
-        kg.remove_factor("neighbor_factor")
-        embedder.remove("neighbor_factor")
-
-        refreshed = retrieve_memory_enhanced(
-            memory,
-            kg=kg,
-            embedder=embedder,
-        )
-
-        assert all("neighbor_factor" not in item for item in refreshed["semantic_neighbors"])
-
-
-# ---------------------------------------------------------------------------
-# Full update cycle
-# ---------------------------------------------------------------------------
-
-class TestUpdateCycle:
-    """Test the full update (formation + evolution) via the manager."""
-
-    def test_update_returns_summary(self, mock_memory, sample_trajectory):
-        summary = mock_memory.update(sample_trajectory)
-        assert "batch" in summary
-        assert "admitted_count" in summary
-        assert "rejected_count" in summary
-        assert summary["admitted_count"] == 2
-        assert summary["rejected_count"] == 2
-
-    def test_update_increments_version(self, mock_memory, sample_trajectory):
-        assert mock_memory.version == 0
-        mock_memory.update(sample_trajectory)
-        assert mock_memory.version == 1
-
-    def test_multiple_updates(self, mock_memory, sample_trajectory):
-        for i in range(3):
-            mock_memory.update(sample_trajectory)
-        assert mock_memory.version == 3
-
-
-# ---------------------------------------------------------------------------
-# Save / load roundtrip
-# ---------------------------------------------------------------------------
-
-class TestPersistence:
-    """Test save and load roundtrip."""
-
-    def test_save_load_roundtrip(self, mock_memory, sample_trajectory):
-        # Update memory with some data
-        mock_memory.update(sample_trajectory)
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            path = Path(tmpdir) / "memory.json"
-            mock_memory.save(path)
-
-            # Verify file exists and is valid JSON
-            assert path.exists()
-            with open(path) as f:
-                data = json.load(f)
-            assert "version" in data
-            assert "success_patterns" in data
-
-            # Load into new manager
-            new_manager = ExperienceMemoryManager()
-            new_manager.load(path)
-
-            assert new_manager.version == mock_memory.version
-            assert len(new_manager.memory.success_patterns) == len(
-                mock_memory.memory.success_patterns
-            )
-            assert len(new_manager.memory.forbidden_directions) == len(
-                mock_memory.memory.forbidden_directions
-            )
-
-    def test_save_creates_directory(self, mock_memory):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            path = Path(tmpdir) / "subdir" / "deep" / "memory.json"
-            mock_memory.save(path)
-            assert path.exists()
-
-
-# ---------------------------------------------------------------------------
-# Memory store serialization
-# ---------------------------------------------------------------------------
-
-class TestMemoryStoreSerialization:
-    """Test data class to_dict / from_dict methods."""
-
-    def test_success_pattern_roundtrip(self):
-        pat = SuccessPattern(
-            name="Test Pattern",
-            description="A test",
-            template="CsRank($close)",
-            success_rate="High",
-            example_factors=["f1", "f2"],
-            occurrence_count=5,
-        )
-        d = pat.to_dict()
-        restored = SuccessPattern.from_dict(d)
-        assert restored.name == pat.name
-        assert restored.occurrence_count == pat.occurrence_count
-        assert restored.success_rate == pat.success_rate
-
-    def test_forbidden_direction_roundtrip(self):
-        fd = ForbiddenDirection(
-            name="Bad Direction",
-            description="Avoid this",
-            correlated_factors=["f1"],
-            typical_correlation=0.7,
-            reason="Too correlated",
-            occurrence_count=3,
-        )
-        d = fd.to_dict()
-        restored = ForbiddenDirection.from_dict(d)
-        assert restored.name == fd.name
-        assert restored.typical_correlation == fd.typical_correlation
-
-    def test_strategic_insight_roundtrip(self):
-        insight = StrategicInsight(
-            insight="Test insight",
-            evidence="Some evidence",
-            batch_source=5,
-        )
-        d = insight.to_dict()
-        restored = StrategicInsight.from_dict(d)
-        assert restored.insight == insight.insight
-        assert restored.batch_source == 5
-
-    def test_mining_state_roundtrip(self):
-        state = MiningState(
-            library_size=42,
-            domain_saturation={"Momentum": 0.5},
-        )
-        d = state.to_dict()
-        restored = MiningState.from_dict(d)
-        assert restored.library_size == 42
-        assert restored.domain_saturation["Momentum"] == 0.5
-
-    def test_full_memory_roundtrip(self):
-        mem = ExperienceMemory(
-            state=MiningState(library_size=10),
-            success_patterns=[
-                SuccessPattern(name="P1", description="d1", template="t1", success_rate="High")
-            ],
-            forbidden_directions=[
-                ForbiddenDirection(name="F1", description="d1")
-            ],
-            insights=[
-                StrategicInsight(insight="I1", evidence="E1")
-            ],
-            version=3,
-        )
-        d = mem.to_dict()
-        restored = ExperienceMemory.from_dict(d)
-        assert restored.version == 3
-        assert len(restored.success_patterns) == 1
-        assert len(restored.forbidden_directions) == 1
-        assert len(restored.insights) == 1
-
-
-# ---------------------------------------------------------------------------
-# Stats
-# ---------------------------------------------------------------------------
-
-class TestStats:
-    """Test memory manager statistics."""
-
-    def test_get_stats_keys(self, mock_memory):
-        stats = mock_memory.get_stats()
-        assert "version" in stats
-        assert "batch_counter" in stats
-        assert "success_patterns" in stats
-        assert "forbidden_directions" in stats
-        assert "insights" in stats
-
-    def test_get_stats_after_update(self, mock_memory, sample_trajectory):
-        mock_memory.update(sample_trajectory)
-        stats = mock_memory.get_stats()
-        assert stats["batch_counter"] == 1
-        assert stats["version"] == 1
diff --git a/src/factorminer/factorminer/tests/test_operators.py b/src/factorminer/factorminer/tests/test_operators.py
deleted file mode 100644
index 7a23e22..0000000
--- a/src/factorminer/factorminer/tests/test_operators.py
+++ /dev/null
@@ -1,500 +0,0 @@
-"""Tests for all operator categories via the registry."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.operators.registry import execute_operator, get_operator, list_operators, implemented_operators
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-def _arr(*rows):
-    """Build a (M, T) float64 array from nested lists."""
-    return np.array(rows, dtype=np.float64)
-
-
-@pytest.fixture
-def x_simple():
-    """Simple 2x10 input for operator tests."""
-    return _arr(
-        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-        [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
-    )
-
-
-@pytest.fixture
-def y_simple():
-    return _arr(
-        [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
-        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-    )
-
-
-# ---------------------------------------------------------------------------
-# Arithmetic operators
-# ---------------------------------------------------------------------------
-
-class TestArithmeticOps:
-    """Test element-wise arithmetic operators."""
-
-    def test_add(self, x_simple, y_simple):
-        result = execute_operator("Add", x_simple, y_simple)
-        expected = x_simple + y_simple
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_sub(self, x_simple, y_simple):
-        result = execute_operator("Sub", x_simple, y_simple)
-        expected = x_simple - y_simple
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_mul(self, x_simple, y_simple):
-        result = execute_operator("Mul", x_simple, y_simple)
-        expected = x_simple * y_simple
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_neg_negates(self, x_simple):
-        result = execute_operator("Neg", x_simple)
-        np.testing.assert_array_almost_equal(result, -x_simple)
-
-    def test_neg_double_neg(self, x_simple):
-        result = execute_operator("Neg", execute_operator("Neg", x_simple))
-        np.testing.assert_array_almost_equal(result, x_simple)
-
-    def test_abs(self):
-        x = _arr([-1, -2, 3, 0], [5, -6, 0, -8])
-        result = execute_operator("Abs", x)
-        np.testing.assert_array_almost_equal(result, np.abs(x))
-
-    def test_sign(self):
-        x = _arr([-3, 0, 5], [7, -2, 0])
-        result = execute_operator("Sign", x)
-        np.testing.assert_array_almost_equal(result, np.sign(x))
-
-    def test_div_by_zero_returns_nan(self):
-        x = _arr([1, 2, 3], [4, 5, 6])
-        y = _arr([0, 0, 0], [1, 0, 2])
-        result = execute_operator("Div", x, y)
-        # Where y is 0, result should be NaN
-        assert np.isnan(result[0, 0])
-        assert np.isnan(result[0, 1])
-        assert np.isnan(result[1, 1])
-        # Where y is non-zero, should be correct
-        np.testing.assert_almost_equal(result[1, 0], 4.0)
-        np.testing.assert_almost_equal(result[1, 2], 3.0)
-
-    def test_log_handles_negative(self):
-        x = _arr([-1, 0, 1, np.e - 1], [2, -3, 0.5, 10])
-        result = execute_operator("Log", x)
-        # Log is defined as log(1+|x|)*sign(x)
-        expected = np.log1p(np.abs(x)) * np.sign(x)
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_sqrt_handles_negative(self):
-        x = _arr([-4, 0, 9, 16], [1, -1, 4, 25])
-        result = execute_operator("Sqrt", x)
-        expected = np.sqrt(np.abs(x)) * np.sign(x)
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_square(self, x_simple):
-        result = execute_operator("Square", x_simple)
-        np.testing.assert_array_almost_equal(result, x_simple ** 2)
-
-    def test_inv_zero_returns_nan(self):
-        x = _arr([0, 1, 2], [3, 0, 5])
-        result = execute_operator("Inv", x)
-        assert np.isnan(result[0, 0])
-        assert np.isnan(result[1, 1])
-        np.testing.assert_almost_equal(result[0, 1], 1.0)
-
-    def test_max_elementwise(self, x_simple, y_simple):
-        result = execute_operator("Max", x_simple, y_simple)
-        np.testing.assert_array_almost_equal(result, np.fmax(x_simple, y_simple))
-
-    def test_min_elementwise(self, x_simple, y_simple):
-        result = execute_operator("Min", x_simple, y_simple)
-        np.testing.assert_array_almost_equal(result, np.fmin(x_simple, y_simple))
-
-    def test_clip(self):
-        x = _arr([-5, -1, 0, 2, 5], [10, -10, 3, -3, 0])
-        result = execute_operator("Clip", x, params={"lower": -3.0, "upper": 3.0})
-        np.testing.assert_array_almost_equal(result, np.clip(x, -3.0, 3.0))
-
-
-# ---------------------------------------------------------------------------
-# Statistical operators (rolling window)
-# ---------------------------------------------------------------------------
-
-class TestStatisticalOps:
-    """Test rolling-window statistical operators."""
-
-    def test_mean_window3(self):
-        x = _arr([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
-        result = execute_operator("Mean", x, params={"window": 3})
-        assert result.shape == (1, 10)
-        # First 2 values should be NaN
-        assert np.isnan(result[0, 0])
-        assert np.isnan(result[0, 1])
-        # Mean of [1,2,3] = 2.0
-        np.testing.assert_almost_equal(result[0, 2], 2.0)
-        # Mean of [2,3,4] = 3.0
-        np.testing.assert_almost_equal(result[0, 3], 3.0)
-        # Mean of [8,9,10] = 9.0
-        np.testing.assert_almost_equal(result[0, 9], 9.0)
-
-    def test_std_window3(self):
-        x = _arr([1, 2, 3, 4, 5])
-        result = execute_operator("Std", x, params={"window": 3})
-        assert result.shape == (1, 5)
-        # First 2 values NaN
-        assert np.isnan(result[0, 0])
-        assert np.isnan(result[0, 1])
-        # std of [1,2,3] with ddof=1 = 1.0
-        np.testing.assert_almost_equal(result[0, 2], 1.0)
-
-    def test_sum_window3(self):
-        x = _arr([1, 2, 3, 4, 5])
-        result = execute_operator("Sum", x, params={"window": 3})
-        np.testing.assert_almost_equal(result[0, 2], 6.0)  # 1+2+3
-        np.testing.assert_almost_equal(result[0, 4], 12.0)  # 3+4+5
-
-    def test_ts_max_window3(self):
-        x = _arr([3, 1, 4, 1, 5, 9, 2, 6])
-        result = execute_operator("TsMax", x, params={"window": 3})
-        np.testing.assert_almost_equal(result[0, 2], 4.0)  # max(3,1,4)
-        np.testing.assert_almost_equal(result[0, 5], 9.0)  # max(1,5,9)
-
-    def test_ts_min_window3(self):
-        x = _arr([3, 1, 4, 1, 5, 9, 2, 6])
-        result = execute_operator("TsMin", x, params={"window": 3})
-        np.testing.assert_almost_equal(result[0, 2], 1.0)  # min(3,1,4)
-        np.testing.assert_almost_equal(result[0, 5], 1.0)  # min(1,5,9)
-
-    def test_ts_rank_basic(self):
-        # Ascending series: latest should have high rank
-        x = _arr([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
-        result = execute_operator("TsRank", x, params={"window": 5})
-        # At index 4, window=[1,2,3,4,5], latest=5 is the largest
-        # count_less = 4 values less than 5, count_valid = 5
-        # rank = 4 / (5-1) = 1.0
-        np.testing.assert_almost_equal(result[0, 4], 1.0)
-
-    def test_median_window3(self):
-        x = _arr([1, 5, 3, 4, 2])
-        result = execute_operator("Median", x, params={"window": 3})
-        np.testing.assert_almost_equal(result[0, 2], 3.0)  # median(1,5,3)
-
-
-# ---------------------------------------------------------------------------
-# Time-series operators
-# ---------------------------------------------------------------------------
-
-class TestTimeseriesOps:
-    """Test time-series operators like Delta, Delay, Return."""
-
-    def test_delta_period1_is_diff(self):
-        x = _arr([1, 3, 6, 10, 15])
-        result = execute_operator("Delta", x, params={"window": 1})
-        assert np.isnan(result[0, 0])
-        np.testing.assert_almost_equal(result[0, 1], 2.0)  # 3-1
-        np.testing.assert_almost_equal(result[0, 2], 3.0)  # 6-3
-        np.testing.assert_almost_equal(result[0, 3], 4.0)  # 10-6
-        np.testing.assert_almost_equal(result[0, 4], 5.0)  # 15-10
-
-    def test_delay_lags_by_period(self):
-        x = _arr([10, 20, 30, 40, 50])
-        result = execute_operator("Delay", x, params={"window": 2})
-        assert np.isnan(result[0, 0])
-        assert np.isnan(result[0, 1])
-        np.testing.assert_almost_equal(result[0, 2], 10.0)
-        np.testing.assert_almost_equal(result[0, 3], 20.0)
-        np.testing.assert_almost_equal(result[0, 4], 30.0)
-
-    def test_return_period1(self):
-        x = _arr([100, 110, 99, 105])
-        result = execute_operator("Return", x, params={"window": 1})
-        assert np.isnan(result[0, 0])
-        np.testing.assert_almost_equal(result[0, 1], 0.10)  # 110/100 - 1
-        np.testing.assert_almost_equal(result[0, 2], -0.1, decimal=2)  # 99/110 - 1
-
-    def test_cumsum(self):
-        x = _arr([1, 2, 3, 4, 5])
-        result = execute_operator("CumSum", x)
-        np.testing.assert_array_almost_equal(result[0], [1, 3, 6, 10, 15])
-
-    def test_cummax(self):
-        x = _arr([3, 1, 4, 1, 5, 9, 2])
-        result = execute_operator("CumMax", x)
-        np.testing.assert_array_almost_equal(result[0], [3, 3, 4, 4, 5, 9, 9])
-
-    def test_cummin(self):
-        x = _arr([5, 3, 4, 1, 2, 6, 0])
-        result = execute_operator("CumMin", x)
-        np.testing.assert_array_almost_equal(result[0], [5, 3, 3, 1, 1, 1, 0])
-
-
-# ---------------------------------------------------------------------------
-# Cross-sectional operators
-# ---------------------------------------------------------------------------
-
-class TestCrossSectionalOps:
-    """Test cross-sectional operators."""
-
-    def test_csrank_produces_percentiles(self):
-        # 5 assets, 1 time step; values 1..5
-        x = _arr([1], [2], [3], [4], [5])
-        result = execute_operator("CsRank", x)
-        assert result.shape == (5, 1)
-        # Ranks should be [0, 0.25, 0.5, 0.75, 1.0]
-        expected = _arr([0], [0.25], [0.5], [0.75], [1.0])
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_csrank_nan_handling(self):
-        x = _arr([np.nan], [2], [3], [np.nan], [5])
-        result = execute_operator("CsRank", x)
-        assert np.isnan(result[0, 0])
-        assert np.isnan(result[3, 0])
-        # Valid ranks for [2, 3, 5] = [0, 0.5, 1.0]
-        valid = result[~np.isnan(result)]
-        assert len(valid) == 3
-
-    def test_cszscore_zero_mean(self):
-        x = _arr([1], [2], [3], [4], [5])
-        result = execute_operator("CsZScore", x)
-        # Mean of z-scores should be ~0
-        np.testing.assert_almost_equal(np.nanmean(result[:, 0]), 0.0, decimal=10)
-
-    def test_csdemean(self):
-        x = _arr([10], [20], [30])
-        result = execute_operator("CsDemean", x)
-        expected = _arr([-10], [0], [10])
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_csscale_unit_l1(self):
-        x = _arr([1], [2], [3])
-        result = execute_operator("CsScale", x)
-        l1_norm = np.nansum(np.abs(result[:, 0]))
-        np.testing.assert_almost_equal(l1_norm, 1.0)
-
-
-# ---------------------------------------------------------------------------
-# Smoothing operators
-# ---------------------------------------------------------------------------
-
-class TestSmoothingOps:
-    """Test smoothing / moving average operators."""
-
-    def test_sma_equals_mean(self):
-        x = _arr([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
-        sma = execute_operator("SMA", x, params={"window": 3})
-        mean = execute_operator("Mean", x, params={"window": 3})
-        # SMA should equal Mean for non-NaN data
-        valid = ~(np.isnan(sma) | np.isnan(mean))
-        np.testing.assert_array_almost_equal(sma[valid], mean[valid])
-
-    def test_ema_convergence(self):
-        # Constant series: EMA should converge to that constant
-        x = _arr([5, 5, 5, 5, 5, 5, 5, 5, 5, 5])
-        result = execute_operator("EMA", x, params={"window": 3})
-        # Should all be 5 (for constant input, EMA = constant)
-        np.testing.assert_array_almost_equal(result[0], np.full(10, 5.0))
-
-    def test_ema_output_shape(self, x_simple):
-        result = execute_operator("EMA", x_simple, params={"window": 5})
-        assert result.shape == x_simple.shape
-
-
-# ---------------------------------------------------------------------------
-# Regression operators
-# ---------------------------------------------------------------------------
-
-class TestRegressionOps:
-    """Test rolling regression operators."""
-
-    def test_slope_of_linear_data(self):
-        # Perfectly linear: y = 2*t for each asset
-        t_vals = np.arange(20, dtype=np.float64)
-        x = np.stack([2 * t_vals, 3 * t_vals], axis=0)  # (2, 20)
-        result = execute_operator("TsLinRegSlope", x, params={"window": 5})
-        # After window-1 NaNs, slope should be ~2.0 for first asset
-        valid_idx = ~np.isnan(result[0])
-        if valid_idx.any():
-            np.testing.assert_almost_equal(result[0, valid_idx][-1], 2.0, decimal=3)
-            np.testing.assert_almost_equal(result[1, valid_idx][-1], 3.0, decimal=3)
-
-    def test_resid_of_linear_is_near_zero(self):
-        # Perfectly linear: residuals should be ~0
-        t_vals = np.arange(20, dtype=np.float64)
-        x = np.stack([2 * t_vals + 1, t_vals + 5], axis=0)
-        result = execute_operator("TsLinRegResid", x, params={"window": 5})
-        valid = ~np.isnan(result)
-        if valid.any():
-            np.testing.assert_almost_equal(np.abs(result[valid]).max(), 0.0, decimal=3)
-
-
-# ---------------------------------------------------------------------------
-# Logical operators
-# ---------------------------------------------------------------------------
-
-class TestLogicalOps:
-    """Test conditional and comparison operators."""
-
-    def test_ifelse_branching(self):
-        cond = _arr([1, -1, 1, -1, 0])
-        x = _arr([10, 20, 30, 40, 50])
-        y = _arr([100, 200, 300, 400, 500])
-        result = execute_operator("IfElse", cond, x, y)
-        # cond > 0 -> x, else y
-        np.testing.assert_almost_equal(result[0, 0], 10)
-        np.testing.assert_almost_equal(result[0, 1], 200)
-        np.testing.assert_almost_equal(result[0, 2], 30)
-        np.testing.assert_almost_equal(result[0, 3], 400)
-        np.testing.assert_almost_equal(result[0, 4], 500)  # 0 is not > 0
-
-    def test_greater(self):
-        x = _arr([1, 5, 3], [4, 2, 6])
-        y = _arr([2, 3, 3], [4, 5, 1])
-        result = execute_operator("Greater", x, y)
-        expected = _arr([0, 1, 0], [0, 0, 1])
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_less(self):
-        x = _arr([1, 5, 3])
-        y = _arr([2, 3, 3])
-        result = execute_operator("Less", x, y)
-        expected = _arr([1, 0, 0])
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_and(self):
-        x = _arr([1, 1, -1, -1])
-        y = _arr([1, -1, 1, -1])
-        result = execute_operator("And", x, y)
-        expected = _arr([1, 0, 0, 0])
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_or(self):
-        x = _arr([1, 1, -1, -1])
-        y = _arr([1, -1, 1, -1])
-        result = execute_operator("Or", x, y)
-        expected = _arr([1, 1, 1, 0])
-        np.testing.assert_array_almost_equal(result, expected)
-
-    def test_not(self):
-        x = _arr([1, -1, 0, 5])
-        result = execute_operator("Not", x)
-        expected = _arr([0, 1, 1, 0])
-        np.testing.assert_array_almost_equal(result, expected)
-
-
-# ---------------------------------------------------------------------------
-# NaN propagation
-# ---------------------------------------------------------------------------
-
-class TestNaNPropagation:
-    """Test NaN handling across operators."""
-
-    def test_add_nan_propagation(self):
-        x = _arr([1, np.nan, 3])
-        y = _arr([4, 5, np.nan])
-        result = execute_operator("Add", x, y)
-        assert np.isnan(result[0, 1])
-        assert np.isnan(result[0, 2])
-        np.testing.assert_almost_equal(result[0, 0], 5.0)
-
-    def test_neg_nan_propagation(self):
-        x = _arr([1, np.nan, 3])
-        result = execute_operator("Neg", x)
-        assert np.isnan(result[0, 1])
-
-    def test_greater_nan_propagation(self):
-        x = _arr([1, np.nan, 3])
-        y = _arr([0, 1, np.nan])
-        result = execute_operator("Greater", x, y)
-        assert np.isnan(result[0, 1])
-        assert np.isnan(result[0, 2])
-
-
-# ---------------------------------------------------------------------------
-# GPU (torch) vs CPU equivalence
-# ---------------------------------------------------------------------------
-
-class TestGPUCPUEquivalence:
-    """Test that torch and numpy implementations produce similar results."""
-
-    @pytest.fixture
-    def torch_available(self):
-        try:
-            import torch
-            return True
-        except ImportError:
-            pytest.skip("PyTorch not available")
-
-    @pytest.mark.parametrize("op_name", ["Add", "Sub", "Mul", "Neg", "Abs", "Sign"])
-    def test_arithmetic_equivalence(self, torch_available, x_simple, y_simple, op_name):
-        import torch as th
-
-        spec = get_operator(op_name)
-        if spec.arity == 1:
-            np_result = execute_operator(op_name, x_simple, backend="numpy")
-            torch_result = execute_operator(
-                op_name, th.tensor(x_simple), backend="torch"
-            )
-        else:
-            np_result = execute_operator(op_name, x_simple, y_simple, backend="numpy")
-            torch_result = execute_operator(
-                op_name, th.tensor(x_simple), th.tensor(y_simple), backend="torch"
-            )
-        np.testing.assert_array_almost_equal(
-            np_result, torch_result.numpy(), decimal=5
-        )
-
-    @pytest.mark.parametrize("op_name", ["Mean", "Std", "TsMax", "TsMin"])
-    def test_statistical_equivalence(self, torch_available, x_simple, op_name):
-        import torch as th
-
-        np_result = execute_operator(op_name, x_simple, params={"window": 3}, backend="numpy")
-        torch_result = execute_operator(
-            op_name, th.tensor(x_simple, dtype=th.float64), params={"window": 3}, backend="torch"
-        )
-        valid = ~(np.isnan(np_result) | np.isnan(torch_result.numpy()))
-        if valid.any():
-            np.testing.assert_array_almost_equal(
-                np_result[valid], torch_result.numpy()[valid], decimal=4
-            )
-
-
-# ---------------------------------------------------------------------------
-# Registry
-# ---------------------------------------------------------------------------
-
-class TestRegistry:
-    """Test operator registry functions."""
-
-    def test_list_operators_flat(self):
-        ops = list_operators(grouped=False)
-        assert isinstance(ops, list)
-        assert "Add" in ops
-        assert "Neg" in ops
-        assert "CsRank" in ops
-
-    def test_list_operators_grouped(self):
-        groups = list_operators(grouped=True)
-        assert isinstance(groups, dict)
-        assert "ARITHMETIC" in groups
-        assert "STATISTICAL" in groups
-
-    def test_implemented_operators(self):
-        impl = implemented_operators()
-        assert len(impl) > 0
-        assert "Add" in impl
-
-    def test_get_operator_unknown_raises(self):
-        with pytest.raises(KeyError):
-            get_operator("FooBarBaz")
-
-    def test_execute_unknown_raises(self):
-        with pytest.raises(KeyError):
-            execute_operator("UnknownOp", np.ones((2, 3)))
diff --git a/src/factorminer/factorminer/tests/test_provenance.py b/src/factorminer/factorminer/tests/test_provenance.py
deleted file mode 100644
index fa4f326..0000000
--- a/src/factorminer/factorminer/tests/test_provenance.py
+++ /dev/null
@@ -1,131 +0,0 @@
-"""Tests for mining run manifests and factor provenance."""
-
-from __future__ import annotations
-
-import json
-
-import numpy as np
-
-from src.factorminer.factorminer.agent.llm_interface import MockProvider
-from src.factorminer.factorminer.core.factor_library import Factor
-from src.factorminer.factorminer.core.library_io import load_library
-from src.factorminer.factorminer.core.config import MiningConfig
-from src.factorminer.factorminer.core.helix_loop import HelixLoop
-from src.factorminer.factorminer.core.ralph_loop import EvaluationResult
-from src.factorminer.factorminer.core.session import MiningSession
-
-
-def test_factor_provenance_roundtrip():
-    factor = Factor(
-        id=7,
-        name="alpha_7",
-        formula="Neg($close)",
-        category="test",
-        ic_mean=0.12,
-        icir=1.4,
-        ic_win_rate=0.6,
-        max_correlation=0.1,
-        batch_number=3,
-        provenance={
-            "run_id": "run_123",
-            "loop_type": "helix",
-            "memory_summary": {"insight_count": 2},
-        },
-    )
-
-    restored = Factor.from_dict(factor.to_dict())
-
-    assert restored.provenance["run_id"] == "run_123"
-    assert restored.provenance["loop_type"] == "helix"
-    assert restored.provenance["memory_summary"]["insight_count"] == 2
-
-
-def test_helix_run_writes_manifest_and_factor_provenance(tmp_path, small_data, monkeypatch):
-    data = np.stack(
-        [
-            small_data["$open"],
-            small_data["$high"],
-            small_data["$low"],
-            small_data["$close"],
-            small_data["$volume"],
-            small_data["$amt"],
-            small_data["$vwap"],
-        ],
-        axis=2,
-    )
-    returns = small_data["$returns"]
-    config = MiningConfig(
-        target_library_size=1,
-        max_iterations=1,
-        batch_size=1,
-        output_dir=str(tmp_path / "helix-output"),
-    )
-    provider = MockProvider()
-
-    loop = HelixLoop(
-        config=config,
-        data_tensor=data,
-        returns=returns,
-        llm_provider=provider,
-        canonicalize=False,
-        enable_knowledge_graph=False,
-        enable_embeddings=False,
-        enable_auto_inventor=False,
-    )
-
-    monkeypatch.setattr(
-        loop.generator,
-        "generate_batch",
-        lambda *args, **kwargs: [("alpha_1", "Neg($close)")],
-    )
-    monkeypatch.setattr(
-        loop.pipeline,
-        "evaluate_batch",
-        lambda candidates: [
-            EvaluationResult(
-                factor_name="alpha_1",
-                formula="Neg($close)",
-                parse_ok=True,
-                ic_mean=0.12,
-                icir=1.3,
-                ic_win_rate=0.6,
-                max_correlation=0.0,
-                admitted=True,
-                stage_passed=3,
-                signals=np.ones_like(returns),
-                score_vector={"primary_score": 0.12},
-            )
-        ],
-    )
-
-    library = loop.run(target_size=1, max_iterations=1)
-
-    output_dir = tmp_path / "helix-output"
-    run_manifest_path = output_dir / "run_manifest.json"
-    checkpoint_manifest_path = output_dir / "checkpoint" / "run_manifest.json"
-    session_path = output_dir / "session.json"
-    library_path = output_dir / "factor_library.json"
-    checkpoint_library_path = output_dir / "checkpoint" / "library.json"
-
-    assert run_manifest_path.exists()
-    assert checkpoint_manifest_path.exists()
-    assert session_path.exists()
-    assert library_path.exists()
-    assert checkpoint_library_path.exists()
-
-    manifest = json.loads(run_manifest_path.read_text())
-    assert manifest["loop_type"] == "helix"
-    assert manifest["library_size"] >= 1
-    assert manifest["artifact_paths"]["run_manifest"] == str(run_manifest_path)
-
-    session = MiningSession.load(session_path)
-    assert session.run_manifest_path == str(run_manifest_path)
-    assert session.run_manifest["loop_type"] == "helix"
-
-    loaded_library = load_library(output_dir / "factor_library")
-    factor = loaded_library.list_factors()[0]
-    assert factor.provenance["run_id"] == manifest["run_id"]
-    assert factor.provenance["loop_type"] == "helix"
-    assert factor.provenance["admission"]["admitted"] is True
-    assert factor.provenance["evaluation"]["ic_mean"] == 0.12
-    assert library.size == 1
diff --git a/src/factorminer/factorminer/tests/test_ralph_loop.py b/src/factorminer/factorminer/tests/test_ralph_loop.py
deleted file mode 100644
index dd75da1..0000000
--- a/src/factorminer/factorminer/tests/test_ralph_loop.py
+++ /dev/null
@@ -1,1076 +0,0 @@
-"""Integration tests for the Ralph Loop end-to-end mining pipeline.
-
-Tests the full pipeline using MockProvider for deterministic factor generation
-and synthetic market data, covering:
-  - BudgetTracker resource monitoring
-  - FactorGenerator response parsing
-  - ValidationPipeline multi-stage evaluation
-  - RalphLoop end-to-end mining iterations
-  - Category inference from formula structure
-  - Session persistence (save / load)
-"""
-
-from __future__ import annotations
-
-import json
-import os
-import shutil
-import tempfile
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.agent.llm_interface import MockProvider
-from src.factorminer.factorminer.core.factor_library import Factor, FactorLibrary
-from src.factorminer.factorminer.core.ralph_loop import (
-    BudgetTracker,
-    EvaluationResult,
-    FactorGenerator,
-    MiningReporter,
-    RalphLoop,
-    ValidationPipeline,
-)
-from src.factorminer.factorminer.memory.memory_store import ExperienceMemory
-
-
-# ---------------------------------------------------------------------------
-# Minimal config for tests
-# ---------------------------------------------------------------------------
-
-@dataclass
-class _TestConfig:
-    target_library_size: int = 10
-    batch_size: int = 5
-    max_iterations: int = 3
-    ic_threshold: float = 0.02
-    icir_threshold: float = 0.3
-    correlation_threshold: float = 0.7
-    replacement_ic_min: float = 0.10
-    replacement_ic_ratio: float = 1.3
-    fast_screen_assets: int = 0  # No fast screening for deterministic tests
-    num_workers: int = 1
-    output_dir: str = ""
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            "target_library_size": self.target_library_size,
-            "batch_size": self.batch_size,
-            "max_iterations": self.max_iterations,
-            "ic_threshold": self.ic_threshold,
-            "icir_threshold": self.icir_threshold,
-            "correlation_threshold": self.correlation_threshold,
-            "replacement_ic_min": self.replacement_ic_min,
-            "replacement_ic_ratio": self.replacement_ic_ratio,
-        }
-
-
-# ---------------------------------------------------------------------------
-# Fixtures
-# ---------------------------------------------------------------------------
-
-@pytest.fixture
-def rng():
-    return np.random.default_rng(42)
-
-
-@pytest.fixture
-def tmp_dir():
-    d = tempfile.mkdtemp(prefix="ralph_test_")
-    yield d
-    shutil.rmtree(d, ignore_errors=True)
-
-
-@pytest.fixture
-def test_config(tmp_dir):
-    return _TestConfig(output_dir=tmp_dir)
-
-
-@pytest.fixture
-def synthetic_data(rng):
-    """Synthetic (M=15, T=60, F=8) data tensor and returns."""
-    M, T, F = 15, 60, 8
-    data_tensor = rng.normal(0, 1, (M, T, F)).astype(np.float64)
-    returns = rng.normal(0, 0.02, (M, T)).astype(np.float64)
-    return data_tensor, returns
-
-
-@pytest.fixture
-def mock_provider():
-    return MockProvider(cycle=True)
-
-
-@pytest.fixture
-def empty_library():
-    return FactorLibrary(correlation_threshold=0.7, ic_threshold=0.02)
-
-
-@pytest.fixture
-def empty_memory():
-    return ExperienceMemory()
-
-
-# ===========================================================================
-# BudgetTracker tests
-# ===========================================================================
-
-class TestBudgetTracker:
-
-    def test_initial_state(self):
-        bt = BudgetTracker()
-        assert bt.llm_calls == 0
-        assert bt.total_tokens == 0
-        assert bt.compute_seconds == 0.0
-        assert not bt.is_exhausted()
-
-    def test_record_llm_call(self):
-        bt = BudgetTracker()
-        bt.record_llm_call(prompt_tokens=100, completion_tokens=50)
-        assert bt.llm_calls == 1
-        assert bt.llm_prompt_tokens == 100
-        assert bt.llm_completion_tokens == 50
-        assert bt.total_tokens == 150
-
-    def test_record_compute(self):
-        bt = BudgetTracker()
-        bt.record_compute(1.5)
-        bt.record_compute(2.5)
-        assert bt.compute_seconds == pytest.approx(4.0)
-
-    def test_exhausted_by_llm_calls(self):
-        bt = BudgetTracker(max_llm_calls=2)
-        assert not bt.is_exhausted()
-        bt.record_llm_call()
-        assert not bt.is_exhausted()
-        bt.record_llm_call()
-        assert bt.is_exhausted()
-
-    def test_exhausted_by_wall_time(self):
-        bt = BudgetTracker(max_wall_seconds=0.01)
-        import time
-        time.sleep(0.02)
-        assert bt.is_exhausted()
-
-    def test_unlimited_budgets(self):
-        bt = BudgetTracker(max_llm_calls=0, max_wall_seconds=0)
-        for _ in range(100):
-            bt.record_llm_call()
-        assert not bt.is_exhausted()
-
-    def test_to_dict_keys(self):
-        bt = BudgetTracker()
-        bt.record_llm_call(10, 20)
-        d = bt.to_dict()
-        expected_keys = {
-            "llm_calls", "llm_prompt_tokens", "llm_completion_tokens",
-            "total_tokens", "compute_seconds", "wall_elapsed_seconds",
-        }
-        assert set(d.keys()) == expected_keys
-
-    def test_wall_elapsed_positive(self):
-        bt = BudgetTracker()
-        assert bt.wall_elapsed >= 0
-
-
-# ===========================================================================
-# EvaluationResult tests
-# ===========================================================================
-
-class TestEvaluationResult:
-
-    def test_defaults(self):
-        r = EvaluationResult(factor_name="test", formula="Neg($close)")
-        assert not r.parse_ok
-        assert r.ic_mean == 0.0
-        assert r.icir == 0.0
-        assert not r.admitted
-        assert r.replaced is None
-        assert r.rejection_reason == ""
-        assert r.stage_passed == 0
-        assert r.signals is None
-
-    def test_admitted_result(self):
-        r = EvaluationResult(
-            factor_name="good",
-            formula="CsRank($close)",
-            parse_ok=True,
-            ic_mean=0.08,
-            icir=1.2,
-            admitted=True,
-            stage_passed=3,
-        )
-        assert r.admitted
-        assert r.stage_passed == 3
-
-
-# ===========================================================================
-# FactorGenerator tests
-# ===========================================================================
-
-class TestFactorGenerator:
-
-    def test_generate_batch(self, mock_provider):
-        gen = FactorGenerator(llm_provider=mock_provider)
-        candidates = gen.generate_batch(
-            memory_signal={},
-            library_state={"size": 0},
-            batch_size=5,
-        )
-        assert len(candidates) > 0
-        for name, formula in candidates:
-            assert isinstance(name, str)
-            assert isinstance(formula, str)
-            assert len(name) > 0
-            assert len(formula) > 0
-
-    def test_parse_response_numbered_format(self):
-        raw = (
-            "1. factor_a: Neg($close)\n"
-            "2. factor_b: CsRank(Mean($close, 10))\n"
-            "3. factor_c: Div($high, $low)\n"
-        )
-        result = FactorGenerator._parse_response(raw)
-        assert len(result) == 3
-        assert result[0] == ("factor_a", "Neg($close)")
-        assert result[1] == ("factor_b", "CsRank(Mean($close, 10))")
-
-    def test_parse_response_empty(self):
-        assert FactorGenerator._parse_response("") == []
-        assert FactorGenerator._parse_response("\n\n") == []
-
-    def test_parse_response_ignores_bad_lines(self):
-        raw = (
-            "Some random text\n"
-            "1. valid_factor: Neg($close)\n"
-            "Not a factor line\n"
-            "2. another: CsRank($volume)\n"
-        )
-        result = FactorGenerator._parse_response(raw)
-        assert len(result) == 2
-
-    def test_mock_provider_deterministic(self):
-        p1 = MockProvider(cycle=False)
-        p2 = MockProvider(cycle=False)
-        r1 = p1.generate("sys", "user", 0.8, 4096)
-        r2 = p2.generate("sys", "user", 0.8, 4096)
-        assert r1 == r2
-
-    def test_mock_provider_cycling(self):
-        p = MockProvider(cycle=True)
-        r1 = p.generate("sys", "user")
-        r2 = p.generate("sys", "user")
-        # Second call should produce different factors (cycled offset)
-        # unless batch_size == len(MOCK_FACTORS)
-        assert isinstance(r1, str)
-        assert isinstance(r2, str)
-
-
-# ===========================================================================
-# ValidationPipeline tests
-# ===========================================================================
-
-class TestValidationPipeline:
-
-    @pytest.fixture
-    def pipeline(self, synthetic_data, empty_library):
-        data_tensor, returns = synthetic_data
-        return ValidationPipeline(
-            data_tensor=data_tensor,
-            returns=returns,
-            library=empty_library,
-            ic_threshold=0.02,
-            fast_screen_assets=0,  # Use all assets
-        )
-
-    def test_parse_failure(self, pipeline):
-        result = pipeline.evaluate_candidate("bad", "NotAnOperator($close)")
-        assert not result.parse_ok
-        assert result.stage_passed == 0
-        assert "Parse failure" in result.rejection_reason
-
-    def test_valid_formula_parses(self, pipeline):
-        result = pipeline.evaluate_candidate("neg_close", "Neg($close)")
-        assert result.parse_ok
-
-    def test_signals_computed(self, pipeline):
-        result = pipeline.evaluate_candidate("neg_close", "Neg($close)")
-        assert result.signals is not None
-        # Signals should be (M, T) shaped
-        M, T = pipeline.returns.shape
-        assert result.signals.shape == (M, T)
-
-    def test_ic_computed(self, pipeline):
-        result = pipeline.evaluate_candidate("neg_close", "Neg($close)")
-        # IC should be a number (may or may not pass threshold)
-        assert isinstance(result.ic_mean, float)
-
-    def test_batch_evaluation(self, pipeline):
-        candidates = [
-            ("f1", "Neg($close)"),
-            ("f2", "CsRank(Mean($close, 10))"),
-            ("f3", "InvalidFormula!!!"),
-        ]
-        results = pipeline.evaluate_batch(candidates)
-        assert len(results) == 3
-        # Third should fail parse
-        assert not results[2].parse_ok
-
-    def test_deduplication_keeps_highest_ic(self, synthetic_data, empty_library):
-        data_tensor, returns = synthetic_data
-        # Use very low threshold and high correlation threshold to admit most
-        pipeline = ValidationPipeline(
-            data_tensor=data_tensor,
-            returns=returns,
-            library=empty_library,
-            ic_threshold=0.0001,
-            fast_screen_assets=0,
-        )
-
-        # Create two results with identical signals but different IC
-        M, T = returns.shape
-        signals = np.random.RandomState(99).randn(M, T)
-
-        r1 = EvaluationResult(
-            factor_name="low_ic", formula="Neg($close)",
-            parse_ok=True, ic_mean=0.05, admitted=True,
-            stage_passed=3, signals=signals.copy(),
-        )
-        r2 = EvaluationResult(
-            factor_name="high_ic", formula="CsRank($close)",
-            parse_ok=True, ic_mean=0.10, admitted=True,
-            stage_passed=3, signals=signals.copy(),
-        )
-        results = pipeline._deduplicate_batch([r1, r2])
-
-        # The higher-IC one should be kept; the lower deduped
-        admitted = [r for r in results if r.admitted]
-        assert len(admitted) == 1
-        assert admitted[0].factor_name == "high_ic"
-
-    def test_deduplication_uncorrelated_kept(self, synthetic_data, empty_library):
-        data_tensor, returns = synthetic_data
-        pipeline = ValidationPipeline(
-            data_tensor=data_tensor,
-            returns=returns,
-            library=empty_library,
-            ic_threshold=0.0001,
-            fast_screen_assets=0,
-        )
-
-        M, T = returns.shape
-        rng = np.random.RandomState(42)
-
-        r1 = EvaluationResult(
-            factor_name="f1", formula="Neg($close)",
-            parse_ok=True, ic_mean=0.05, admitted=True,
-            stage_passed=3, signals=rng.randn(M, T),
-        )
-        r2 = EvaluationResult(
-            factor_name="f2", formula="CsRank($volume)",
-            parse_ok=True, ic_mean=0.07, admitted=True,
-            stage_passed=3, signals=rng.randn(M, T),
-        )
-        results = pipeline._deduplicate_batch([r1, r2])
-        admitted = [r for r in results if r.admitted]
-        # Both should survive (independent random signals -> low correlation)
-        assert len(admitted) == 2
-
-
-# ===========================================================================
-# MiningReporter tests
-# ===========================================================================
-
-class TestMiningReporter:
-
-    def test_log_batch(self, tmp_dir):
-        reporter = MiningReporter(output_dir=tmp_dir)
-        reporter.log_batch(1, admitted=3, rejected=7)
-        log_path = os.path.join(tmp_dir, "mining_batches.jsonl")
-        assert os.path.exists(log_path)
-        with open(log_path) as f:
-            lines = f.readlines()
-        assert len(lines) == 1
-        record = json.loads(lines[0])
-        assert record["iteration"] == 1
-        assert record["admitted"] == 3
-
-    def test_export_library(self, tmp_dir, empty_library):
-        reporter = MiningReporter(output_dir=tmp_dir)
-        path = reporter.export_library(empty_library)
-        assert os.path.exists(path)
-        with open(path) as f:
-            data = json.load(f)
-        assert "factors" in data
-        assert "diagnostics" in data
-        assert "exported_at" in data
-
-
-# ===========================================================================
-# Category inference tests
-# ===========================================================================
-
-class TestCategoryInference:
-
-    def test_momentum(self):
-        assert RalphLoop._infer_category("Delta($close, 5)") == "Momentum"
-
-    def test_volatility(self):
-        assert RalphLoop._infer_category("Std($returns, 10)") == "Volatility"
-
-    def test_higher_moment(self):
-        assert RalphLoop._infer_category("Skew($returns, 20)") == "Higher-Moment"
-
-    def test_pv_correlation(self):
-        assert RalphLoop._infer_category("Corr($close, $volume, 10)") == "PV-Correlation"
-
-    def test_regime_conditional(self):
-        cat = RalphLoop._infer_category(
-            "IfElse(Greater($returns, 0), $volume, Neg($volume))"
-        )
-        assert cat == "Regime-Conditional"
-
-    def test_regression(self):
-        assert RalphLoop._infer_category("TsLinRegSlope($close, 20)") == "Regression"
-
-    def test_smoothing(self):
-        assert RalphLoop._infer_category("EMA($close, 10)") == "Smoothing"
-
-    def test_vwap(self):
-        assert RalphLoop._infer_category("Div(Sub($close, $vwap), $vwap)") == "VWAP"
-
-    def test_amount(self):
-        assert RalphLoop._infer_category("CsRank($amt)") == "Amount"
-
-    def test_extrema(self):
-        assert RalphLoop._infer_category("TsMax($close, 20)") == "Extrema"
-
-    def test_cross_sectional(self):
-        assert RalphLoop._infer_category("CsRank($close)") == "Cross-Sectional"
-
-    def test_other_fallback(self):
-        assert RalphLoop._infer_category("Add($close, $open)") == "Other"
-
-
-# ===========================================================================
-# End-to-end RalphLoop tests
-# ===========================================================================
-
-class TestRalphLoopEndToEnd:
-
-    def test_single_iteration(self, test_config, synthetic_data, mock_provider, tmp_dir):
-        test_config.max_iterations = 1
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-
-        library = loop.run(max_iterations=1)
-        assert isinstance(library, FactorLibrary)
-        assert loop.iteration == 1
-        assert loop.budget.llm_calls >= 1
-
-    def test_multiple_iterations(self, test_config, synthetic_data, mock_provider, tmp_dir):
-        test_config.max_iterations = 3
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-
-        library = loop.run(max_iterations=3)
-        assert loop.iteration <= 3
-        assert loop.budget.llm_calls <= 3
-
-    def test_library_grows(self, test_config, synthetic_data, mock_provider, tmp_dir):
-        test_config.max_iterations = 5
-        test_config.target_library_size = 100  # High target so we don't stop early
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-
-        library = loop.run(max_iterations=5, target_size=100)
-        # With mock provider and low IC threshold, some factors should be admitted
-        # (exact count depends on pseudo-signal randomness)
-        assert isinstance(library.size, int)
-
-    def test_callback_invoked(self, test_config, synthetic_data, mock_provider, tmp_dir):
-        test_config.max_iterations = 2
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-
-        callback_calls = []
-
-        def cb(iteration: int, stats: Dict[str, Any]) -> None:
-            callback_calls.append((iteration, stats))
-
-        loop.run(max_iterations=2, callback=cb)
-        assert len(callback_calls) == 2
-        assert callback_calls[0][0] == 1
-        assert callback_calls[1][0] == 2
-        # Stats should have standard keys
-        for _, stats in callback_calls:
-            assert "candidates" in stats
-            assert "admitted" in stats
-            assert "library_size" in stats
-            assert "yield_rate" in stats
-
-    def test_budget_stops_loop(self, test_config, synthetic_data, mock_provider, tmp_dir):
-        test_config.max_iterations = 100
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop.budget = BudgetTracker(max_llm_calls=2)
-
-        library = loop.run(max_iterations=100, target_size=1000)
-        assert loop.budget.llm_calls == 2
-        assert loop.iteration == 2
-
-    def test_target_size_stops_loop(self, test_config, synthetic_data, mock_provider, tmp_dir):
-        test_config.output_dir = tmp_dir
-        test_config.ic_threshold = 0.0001  # Very low to admit most
-        test_config.correlation_threshold = 0.99  # Very high to avoid dedup
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        # Request tiny library
-        library = loop.run(max_iterations=50, target_size=2)
-        # Either reached target or exhausted iterations
-        assert library.size >= 0
-
-    def test_memory_evolves(self, test_config, synthetic_data, mock_provider, tmp_dir):
-        test_config.max_iterations = 2
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-
-        loop.run(max_iterations=2)
-        # Memory should have been updated at least once
-        assert loop.memory is not None
-
-    def test_output_files_created(self, test_config, synthetic_data, mock_provider, tmp_dir):
-        test_config.output_dir = tmp_dir
-        test_config.max_iterations = 1
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop.run(max_iterations=1)
-
-        # Check that library JSON was exported
-        lib_path = os.path.join(tmp_dir, "factor_library.json")
-        assert os.path.exists(lib_path)
-
-    def test_run_with_prepopulated_library(
-        self, test_config, synthetic_data, mock_provider, tmp_dir, rng
-    ):
-        test_config.output_dir = tmp_dir
-        test_config.max_iterations = 1
-        data_tensor, returns = synthetic_data
-        M, T = returns.shape
-
-        lib = FactorLibrary(
-            correlation_threshold=0.7, ic_threshold=0.02,
-        )
-        # Add one factor
-        factor = Factor(
-            id=0, name="seed_factor", formula="Neg($close)",
-            category="test", ic_mean=0.06, icir=1.0,
-            ic_win_rate=0.6, max_correlation=0.0,
-            batch_number=0, signals=rng.normal(0, 1, (M, T)),
-        )
-        lib.admit_factor(factor)
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-            library=lib,
-        )
-        result_lib = loop.run(max_iterations=1)
-        assert result_lib.size >= 1  # At least the seed factor
-
-    def test_empty_stats_on_no_candidates(self, test_config, synthetic_data, tmp_dir):
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        # Provider that returns empty response
-        class EmptyProvider(MockProvider):
-            def generate(self, *args, **kwargs):
-                return ""
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=EmptyProvider(),
-        )
-        stats = loop._run_iteration(batch_size=5)
-        assert stats["candidates"] == 0
-        assert stats["admitted"] == 0
-
-
-# ===========================================================================
-# Session persistence tests
-# ===========================================================================
-
-class TestSessionPersistence:
-
-    def test_save_creates_checkpoint(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        test_config.output_dir = tmp_dir
-        test_config.max_iterations = 1
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop.run(max_iterations=1)
-
-        checkpoint_path = loop.save_session(tmp_dir)
-        assert os.path.isdir(checkpoint_path)
-
-        # Should contain key files
-        checkpoint_files = os.listdir(checkpoint_path)
-        assert "library.json" in checkpoint_files
-        assert "memory.json" in checkpoint_files
-        assert "loop_state.json" in checkpoint_files
-
-    def test_load_restores_iteration(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        # Run 2 iterations
-        loop1 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop1.run(max_iterations=2)
-        checkpoint_path = loop1.save_session(tmp_dir)
-
-        # Load into new loop
-        loop2 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=MockProvider(cycle=True),
-        )
-        loop2.load_session(checkpoint_path)
-        assert loop2.iteration == loop1.iteration
-
-    def test_load_restores_memory(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop1 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop1.run(max_iterations=2)
-        checkpoint_path = loop1.save_session(tmp_dir)
-
-        loop2 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=MockProvider(cycle=True),
-        )
-        loop2.load_session(checkpoint_path)
-        # Memory should have been restored
-        assert loop2.memory is not None
-
-
-# ===========================================================================
-# Checkpoint / Resume tests (Phase 1f)
-# ===========================================================================
-
-class TestCheckpointResume:
-    """Tests for the checkpoint/resume functionality."""
-
-    def test_checkpoint_creates_files(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify that save_session creates all expected checkpoint files."""
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop.run(max_iterations=2)
-        checkpoint_path = loop.save_session()
-
-        checkpoint_dir = Path(checkpoint_path)
-        assert checkpoint_dir.exists()
-        assert (checkpoint_dir / "library.json").exists()
-        assert (checkpoint_dir / "memory.json").exists()
-        assert (checkpoint_dir / "loop_state.json").exists()
-        assert (checkpoint_dir / "session.json").exists()
-
-        # Verify loop_state.json contains expected keys
-        with open(checkpoint_dir / "loop_state.json") as f:
-            loop_state = json.load(f)
-        assert "iteration" in loop_state
-        assert "library_size" in loop_state
-        assert "memory_version" in loop_state
-        assert "budget" in loop_state
-        assert loop_state["iteration"] == loop.iteration
-        assert loop_state["library_size"] == loop.library.size
-        assert loop_state["budget"]["llm_calls"] == loop.budget.llm_calls
-
-    def test_resume_continues_from_checkpoint(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify that resuming continues from the saved iteration."""
-        test_config.output_dir = tmp_dir
-        test_config.target_library_size = 200  # High target so loop doesn't stop early
-        data_tensor, returns = synthetic_data
-
-        # Run 2 iterations, then save
-        loop1 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop1.run(max_iterations=2, target_size=200)
-        saved_iteration = loop1.iteration
-        saved_library_size = loop1.library.size
-        loop1.save_session()
-
-        # Create a new loop and resume from checkpoint
-        loop2 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=MockProvider(cycle=True),
-        )
-        assert loop2.iteration == 0  # Starts fresh
-
-        # Resume should load the saved state and continue
-        library = loop2.run(max_iterations=4, target_size=200, resume=True)
-
-        # loop2 should have continued from iteration 2, running up to 4
-        assert loop2.iteration > saved_iteration
-        assert loop2.iteration <= 4
-
-    def test_resume_preserves_library(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify that library factors are preserved across resume."""
-        test_config.output_dir = tmp_dir
-        test_config.ic_threshold = 0.0001
-        test_config.correlation_threshold = 0.99
-        data_tensor, returns = synthetic_data
-
-        # Run and save
-        loop1 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop1.run(max_iterations=3, target_size=100)
-        saved_factors = {
-            fid: f.to_dict() for fid, f in loop1.library.factors.items()
-        }
-        saved_size = loop1.library.size
-        loop1.save_session()
-
-        # Load into a new loop
-        loop2 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=MockProvider(cycle=True),
-        )
-        checkpoint_dir = os.path.join(tmp_dir, "checkpoint")
-        loop2.load_session(checkpoint_dir)
-
-        # Library should have the same factors
-        assert loop2.library.size == saved_size
-        for fid, f_dict in saved_factors.items():
-            assert fid in loop2.library.factors
-            restored = loop2.library.factors[fid].to_dict()
-            assert restored["name"] == f_dict["name"]
-            assert restored["formula"] == f_dict["formula"]
-            assert restored["ic_mean"] == pytest.approx(f_dict["ic_mean"])
-
-    def test_resume_preserves_memory(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify that experience memory is preserved across resume."""
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        # Run and save
-        loop1 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop1.run(max_iterations=2)
-        saved_version = loop1.memory.version
-        saved_patterns = len(loop1.memory.success_patterns)
-        saved_forbidden = len(loop1.memory.forbidden_directions)
-        saved_insights = len(loop1.memory.insights)
-        loop1.save_session()
-
-        # Load into a new loop
-        loop2 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=MockProvider(cycle=True),
-        )
-        checkpoint_dir = os.path.join(tmp_dir, "checkpoint")
-        loop2.load_session(checkpoint_dir)
-
-        # Memory state should match
-        assert loop2.memory.version == saved_version
-        assert len(loop2.memory.success_patterns) == saved_patterns
-        assert len(loop2.memory.forbidden_directions) == saved_forbidden
-        assert len(loop2.memory.insights) == saved_insights
-
-    def test_checkpoint_interval_controls_frequency(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify checkpoint_interval controls how often checkpoints are saved."""
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        # With interval=2, checkpoint should be written at iterations 2 and 4
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-            checkpoint_interval=2,
-        )
-        loop.run(max_iterations=3)
-
-        checkpoint_dir = Path(tmp_dir) / "checkpoint"
-        # After 3 iterations with interval=2, checkpoint at iteration 2
-        # should have created the directory
-        assert checkpoint_dir.exists()
-
-        # Verify the checkpoint was written at least once
-        with open(checkpoint_dir / "loop_state.json") as f:
-            state = json.load(f)
-        # The last checkpoint should be at iteration 2 (since 3 is not
-        # divisible by 2, the checkpoint at iter 2 is the latest one)
-        assert state["iteration"] == 2
-
-    def test_checkpoint_disabled(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify checkpoint_interval=0 disables automatic checkpointing."""
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-            checkpoint_interval=0,
-        )
-        loop.run(max_iterations=2)
-
-        checkpoint_dir = Path(tmp_dir) / "checkpoint"
-        # No automatic checkpoint should have been created
-        assert not checkpoint_dir.exists()
-
-    def test_resume_from_classmethod(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify the resume_from classmethod works correctly."""
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        # Run and save
-        loop1 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop1.run(max_iterations=2)
-        checkpoint_path = loop1.save_session()
-
-        # Use classmethod to resume
-        loop2 = RalphLoop.resume_from(
-            checkpoint_path=checkpoint_path,
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=MockProvider(cycle=True),
-        )
-
-        assert loop2.iteration == loop1.iteration
-        assert loop2.library.size == loop1.library.size
-
-    def test_resume_restores_budget(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify that budget tracker state is preserved across resume."""
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop1 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        loop1.run(max_iterations=2)
-        saved_llm_calls = loop1.budget.llm_calls
-        saved_compute = loop1.budget.compute_seconds
-        loop1.save_session()
-
-        loop2 = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=MockProvider(cycle=True),
-        )
-        checkpoint_dir = os.path.join(tmp_dir, "checkpoint")
-        loop2.load_session(checkpoint_dir)
-
-        assert loop2.budget.llm_calls == saved_llm_calls
-        assert loop2.budget.compute_seconds == pytest.approx(
-            saved_compute, abs=0.1
-        )
-
-    def test_backward_compatible_no_checkpoint(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify run() works without checkpoint/resume (backward compat)."""
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        # Disable checkpointing entirely
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-            checkpoint_interval=0,
-        )
-        library = loop.run(max_iterations=2)
-
-        assert isinstance(library, FactorLibrary)
-        assert loop.iteration == 2
-
-    def test_resume_no_checkpoint_is_noop(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Verify resume=True with no existing checkpoint just starts fresh."""
-        test_config.output_dir = tmp_dir
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        # resume=True but no checkpoint exists -- should work normally
-        library = loop.run(max_iterations=1, resume=True)
-        assert isinstance(library, FactorLibrary)
-        assert loop.iteration == 1
-
-    def test_run_exports_manifest_and_factor_provenance(
-        self, test_config, synthetic_data, mock_provider, tmp_dir
-    ):
-        """Completed runs should export a manifest and persist factor provenance."""
-        test_config.output_dir = tmp_dir
-        test_config.ic_threshold = 0.0
-        test_config.icir_threshold = -1.0
-        test_config.correlation_threshold = 1.1
-        data_tensor, returns = synthetic_data
-
-        loop = RalphLoop(
-            config=test_config,
-            data_tensor=data_tensor,
-            returns=returns,
-            llm_provider=mock_provider,
-        )
-        library = loop.run(max_iterations=2, target_size=2)
-
-        manifest_path = Path(tmp_dir) / "run_manifest.json"
-        assert manifest_path.exists()
-
-        manifest = json.loads(manifest_path.read_text())
-        assert manifest["loop_type"] == "ralph"
-        assert manifest["artifact_paths"]["run_manifest"] == str(manifest_path)
-        assert manifest["dataset_summary"]["data_tensor_shape"] == list(data_tensor.shape)
-
-        assert library.size > 0
-        exported_library = json.loads((Path(tmp_dir) / "factor_library.json").read_text())
-        factor_payload = exported_library["factors"][0]
-        assert "provenance" in factor_payload
-        assert factor_payload["provenance"]["run_id"] == loop._session.session_id
-        assert factor_payload["provenance"]["loop_type"] == "ralph"
-        assert factor_payload["provenance"]["generator_family"]
diff --git a/src/factorminer/factorminer/tests/test_regime.py b/src/factorminer/factorminer/tests/test_regime.py
deleted file mode 100644
index 0a4833e..0000000
--- a/src/factorminer/factorminer/tests/test_regime.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""Tests for regime-aware factor validation (evaluation/regime.py)."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.evaluation.regime import (
-    MarketRegime,
-    RegimeAwareEvaluator,
-    RegimeConfig,
-    RegimeDetector,
-)
-
-
-@pytest.fixture
-def rng():
-    return np.random.default_rng(42)
-
-
-# -----------------------------------------------------------------------
-# RegimeDetector: synthetic bull/bear phases
-# -----------------------------------------------------------------------
-
-def test_regime_detector_bull_bear_phases(rng):
-    """Clear positive first half, negative second half should produce
-    BULL and BEAR labels after the lookback window."""
-    M, T = 20, 300
-    returns = np.zeros((M, T))
-    # First half: strongly positive
-    returns[:, :150] = rng.normal(0.02, 0.005, (M, 150))
-    # Second half: strongly negative
-    returns[:, 150:] = rng.normal(-0.02, 0.005, (M, 150))
-
-    cfg = RegimeConfig(lookback_window=30, bull_return_threshold=0.0,
-                       bear_return_threshold=0.0)
-    detector = RegimeDetector(config=cfg)
-    result = detector.classify(returns)
-
-    # After the lookback window, first half should contain BULL periods
-    bull_periods = result.periods[MarketRegime.BULL]
-    bear_periods = result.periods[MarketRegime.BEAR]
-    assert bull_periods[50:140].sum() > 0, "Should have BULL in first half"
-    assert bear_periods[180:].sum() > 0, "Should have BEAR in second half"
-
-
-def test_regime_detector_labels_shape(rng):
-    M, T = 10, 100
-    returns = rng.normal(0, 0.01, (M, T))
-    detector = RegimeDetector()
-    result = detector.classify(returns)
-    assert result.labels.shape == (T,)
-    assert set(result.labels).issubset({0, 1, 2})
-
-
-# -----------------------------------------------------------------------
-# RegimeAwareEvaluator: signal works in all regimes
-# -----------------------------------------------------------------------
-
-def test_regime_evaluator_all_regimes_pass(rng):
-    """A signal correlated with returns across all regimes should pass."""
-    M, T = 20, 400
-    returns = rng.normal(0, 0.01, (M, T))
-    signal = returns * 5 + rng.normal(0, 0.001, (M, T))
-
-    cfg = RegimeConfig(lookback_window=20, min_regime_ic=0.01,
-                       min_regimes_passing=1)
-    detector = RegimeDetector(config=cfg)
-    regime = detector.classify(returns)
-
-    evaluator = RegimeAwareEvaluator(returns, regime, config=cfg)
-    result = evaluator.evaluate("strong_factor", signal)
-    assert result.passes is True
-
-
-# -----------------------------------------------------------------------
-# RegimeAwareEvaluator: signal only works in bull
-# -----------------------------------------------------------------------
-
-def test_regime_evaluator_bull_only_fails(rng):
-    """A signal that only works in positive-return periods should fail
-    if min_regimes_passing=2."""
-    M, T = 20, 400
-    returns = np.zeros((M, T))
-    returns[:, :200] = rng.normal(0.02, 0.005, (M, 200))
-    returns[:, 200:] = rng.normal(-0.02, 0.005, (M, 200))
-
-    # Signal only correlates with returns in first half
-    signal = np.zeros((M, T))
-    signal[:, :200] = returns[:, :200] * 5
-    signal[:, 200:] = rng.normal(0, 1, (M, 200))  # noise in bear
-
-    cfg = RegimeConfig(lookback_window=20, min_regime_ic=0.03,
-                       min_regimes_passing=2)
-    detector = RegimeDetector(config=cfg)
-    regime = detector.classify(returns)
-
-    evaluator = RegimeAwareEvaluator(returns, regime, config=cfg)
-    result = evaluator.evaluate("bull_only", signal)
-    # May or may not pass depending on how many regimes are detected,
-    # but the structure is correct
-    assert isinstance(result.n_regimes_passing, int)
-    assert isinstance(result.passes, bool)
-
-
-# -----------------------------------------------------------------------
-# Edge case: very short data
-# -----------------------------------------------------------------------
-
-def test_regime_detector_short_data(rng):
-    """Data shorter than lookback_window should still work (all SIDEWAYS)."""
-    M, T = 10, 20
-    returns = rng.normal(0, 0.01, (M, T))
-    cfg = RegimeConfig(lookback_window=60)
-    detector = RegimeDetector(config=cfg)
-    result = detector.classify(returns)
-    # All periods should be SIDEWAYS since T < lookback_window
-    assert np.all(result.labels == MarketRegime.SIDEWAYS.value)
diff --git a/src/factorminer/factorminer/tests/test_research.py b/src/factorminer/factorminer/tests/test_research.py
deleted file mode 100644
index a404a4f..0000000
--- a/src/factorminer/factorminer/tests/test_research.py
+++ /dev/null
@@ -1,237 +0,0 @@
-"""Research-mode target, scoring, and model-suite coverage."""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-import pytest
-
-from src.factorminer.factorminer.core.factor_library import Factor
-from src.factorminer.factorminer.data.tensor_builder import TargetSpec, compute_targets
-from src.factorminer.factorminer.evaluation.portfolio import PortfolioBacktester
-from src.factorminer.factorminer.evaluation.research import (
-    FactorGeometryDiagnostics,
-    build_score_vector,
-    passes_research_admission,
-    run_research_model_suite,
-)
-from src.factorminer.factorminer.evaluation.runtime import DatasetSplit, EvaluationDataset, evaluate_factors
-from src.factorminer.factorminer.utils.config import load_config
-
-
-def test_compute_targets_supports_multiple_horizons():
-    df = pd.DataFrame(
-        {
-            "datetime": pd.date_range("2024-01-01", periods=5, freq="D").tolist() * 2,
-            "asset_id": ["A"] * 5 + ["B"] * 5,
-            "open": [10, 11, 12, 13, 14, 20, 21, 22, 23, 24],
-            "high": [11, 12, 13, 14, 15, 21, 22, 23, 24, 25],
-            "low": [9, 10, 11, 12, 13, 19, 20, 21, 22, 23],
-            "close": [10.5, 11.5, 12.5, 13.5, 14.5, 20.5, 21.5, 22.5, 23.5, 24.5],
-            "volume": [1] * 10,
-            "amount": [10] * 10,
-        }
-    )
-    out = compute_targets(
-        df,
-        [
-            TargetSpec("paper", 1, 1, "open_to_close", "simple"),
-            TargetSpec("h2_close_to_close", 0, 2, "close_to_close", "simple"),
-        ],
-    )
-
-    a0 = out[(out["asset_id"] == "A")].sort_values("datetime").reset_index(drop=True)
-    assert a0.loc[0, "target"] == pytest.approx(11.5 / 11.0 - 1.0)
-    assert a0.loc[0, "target_h2_close_to_close"] == pytest.approx(12.5 / 10.5 - 1.0)
-
-
-def test_evaluate_factors_records_all_target_stats(small_data):
-    timestamps = np.array(
-        [np.datetime64("2024-01-01") + np.timedelta64(i, "D") for i in range(50)]
-    )
-    returns = small_data["$returns"]
-    data_tensor = np.stack(
-        [
-            small_data["$open"],
-            small_data["$high"],
-            small_data["$low"],
-            small_data["$close"],
-            small_data["$volume"],
-            small_data["$amt"],
-            small_data["$vwap"],
-            small_data["$returns"],
-        ],
-        axis=-1,
-    )
-    alt_returns = -returns
-    splits = {
-        "train": DatasetSplit(
-            name="train",
-            indices=np.arange(25),
-            timestamps=timestamps[:25],
-            returns=returns[:, :25],
-            target_returns={"paper": returns[:, :25], "alt": alt_returns[:, :25]},
-            default_target="paper",
-        ),
-        "test": DatasetSplit(
-            name="test",
-            indices=np.arange(25, 50),
-            timestamps=timestamps[25:],
-            returns=returns[:, 25:],
-            target_returns={"paper": returns[:, 25:], "alt": alt_returns[:, 25:]},
-            default_target="paper",
-        ),
-        "full": DatasetSplit(
-            name="full",
-            indices=np.arange(50),
-            timestamps=timestamps,
-            returns=returns,
-            target_returns={"paper": returns, "alt": alt_returns},
-            default_target="paper",
-        ),
-    }
-    dataset = EvaluationDataset(
-        data_dict=small_data,
-        data_tensor=data_tensor,
-        returns=returns,
-        timestamps=timestamps,
-        asset_ids=np.array([f"A{i:02d}" for i in range(returns.shape[0])]),
-        splits=splits,
-        processed_df=pd.DataFrame(),
-        target_panels={"paper": returns, "alt": alt_returns},
-        default_target="paper",
-    )
-    factor = Factor(
-        id=1,
-        name="close_neg",
-        formula="Neg($close)",
-        category="test",
-        ic_mean=0.0,
-        icir=0.0,
-        ic_win_rate=0.0,
-        max_correlation=0.0,
-        batch_number=1,
-    )
-
-    artifact = evaluate_factors([factor], dataset, signal_failure_policy="reject")[0]
-
-    assert artifact.succeeded
-    assert set(artifact.target_stats["train"]) == {"paper", "alt"}
-    assert artifact.target_stats["train"]["paper"]["ic_mean"] == pytest.approx(
-        -artifact.target_stats["train"]["alt"]["ic_mean"]
-    )
-
-
-def test_research_score_vector_and_admission():
-    cfg = load_config(
-        overrides={
-            "benchmark": {"mode": "research"},
-            "research": {
-                "enabled": True,
-                "horizon_weights": {"h1": 0.7, "h3": 0.3},
-            },
-        }
-    )
-    score = build_score_vector(
-        target_stats={
-            "h1": {
-                "ic_mean": 0.08,
-                "ic_abs_mean": 0.08,
-                "icir": 1.1,
-                "turnover": 0.2,
-                "ic_series": np.array([0.07, 0.08, 0.09, 0.08, 0.07]),
-            },
-            "h3": {
-                "ic_mean": 0.05,
-                "ic_abs_mean": 0.05,
-                "icir": 0.8,
-                "turnover": 0.1,
-                "ic_series": np.array([0.03, 0.05, 0.06, 0.05, 0.04]),
-            },
-        },
-        target_horizons={"h1": 1, "h3": 3},
-        research_cfg=cfg.research,
-        geometry=FactorGeometryDiagnostics(
-            max_abs_correlation=0.2,
-            mean_abs_correlation=0.1,
-            projection_loss=0.25,
-            marginal_span_gain=0.75,
-            effective_rank_gain=0.4,
-            residual_ic=0.06,
-        ),
-    )
-
-    assert score.primary_score > 0.0
-    assert score.lower_confidence_bound >= 0.0
-    admitted, reason = passes_research_admission(
-        score,
-        cfg.research,
-        correlation_threshold=0.5,
-    )
-    assert admitted is True
-    assert "admission" in reason.lower()
-
-
-def test_research_model_suite_reports_net_ir():
-    cfg = load_config(
-        overrides={
-            "benchmark": {"mode": "research"},
-            "research": {
-                "enabled": True,
-                "selection": {
-                    "models": ["ridge", "lasso"],
-                    "rolling_train_window": 20,
-                    "rolling_test_window": 10,
-                    "rolling_step": 10,
-                },
-                "regimes": {"enabled": False},
-                "execution": {"cost_bps": 0.0},
-            },
-        }
-    )
-    rng = np.random.default_rng(42)
-    t, n = 60, 8
-    base = rng.normal(size=(t, n))
-    factor_signals = {
-        1: base,
-        2: rng.normal(size=(t, n)),
-        3: 0.5 * base + 0.1 * rng.normal(size=(t, n)),
-    }
-    returns = 0.03 * base + 0.01 * rng.normal(size=(t, n))
-
-    reports = run_research_model_suite(factor_signals, returns, cfg.research)
-
-    assert "ridge" in reports
-    assert reports["ridge"]["available"] is True
-    assert "mean_test_net_ir" in reports["ridge"]
-    assert reports["ridge"]["selection_stability"] >= 0.0
-
-
-def test_portfolio_backtest_exposes_raw_series():
-    backtester = PortfolioBacktester()
-    signal = np.array(
-        [
-            [1.0, 0.5, -0.2, -1.0, 0.2],
-            [1.1, 0.2, -0.3, -0.8, 0.0],
-            [0.9, 0.1, -0.5, -1.1, 0.3],
-            [1.2, 0.4, -0.1, -0.9, 0.1],
-            [1.0, 0.3, -0.4, -1.2, 0.2],
-        ]
-    )
-    returns = np.array(
-        [
-            [0.03, 0.01, -0.01, -0.02, 0.00],
-            [0.02, 0.00, -0.01, -0.03, 0.01],
-            [0.01, 0.02, -0.02, -0.01, 0.00],
-            [0.03, 0.01, -0.01, -0.02, 0.00],
-            [0.02, 0.00, -0.03, -0.01, 0.01],
-        ]
-    )
-
-    stats = backtester.quintile_backtest(signal, returns, transaction_cost_bps=4.0)
-
-    assert stats["ls_net_series"].shape[0] == signal.shape[0]
-    assert stats["turnover_series"].shape[0] == signal.shape[0]
-    assert stats["quintile_period_returns"].shape == (signal.shape[0], 5)
diff --git a/src/factorminer/factorminer/tests/test_runtime_analysis.py b/src/factorminer/factorminer/tests/test_runtime_analysis.py
deleted file mode 100644
index 440dfb8..0000000
--- a/src/factorminer/factorminer/tests/test_runtime_analysis.py
+++ /dev/null
@@ -1,196 +0,0 @@
-"""Unit tests for strict runtime recomputation helpers."""
-
-from __future__ import annotations
-
-from pathlib import Path
-from types import SimpleNamespace
-
-import numpy as np
-import pandas as pd
-import pytest
-
-from src.factorminer.factorminer.core.factor_library import Factor
-from src.factorminer.factorminer.core.parser import try_parse
-from src.factorminer.factorminer.evaluation.metrics import compute_factor_stats
-from src.factorminer.factorminer.evaluation.runtime import (
-    DatasetSplit,
-    EvaluationDataset,
-    SignalComputationError,
-    compute_tree_signals,
-    evaluate_factors,
-)
-
-
-def _build_dataset(data_dict: dict[str, np.ndarray]) -> EvaluationDataset:
-    timestamps = np.array(
-        [np.datetime64("2024-01-01") + np.timedelta64(i, "D") for i in range(50)]
-    )
-    returns = data_dict["$returns"]
-    feature_order = [
-        "$open",
-        "$high",
-        "$low",
-        "$close",
-        "$volume",
-        "$amt",
-        "$vwap",
-        "$returns",
-    ]
-    data_tensor = np.stack([data_dict[name] for name in feature_order], axis=-1)
-
-    splits = {
-        "train": DatasetSplit(
-            name="train",
-            indices=np.arange(25),
-            timestamps=timestamps[:25],
-            returns=returns[:, :25],
-        ),
-        "test": DatasetSplit(
-            name="test",
-            indices=np.arange(25, 50),
-            timestamps=timestamps[25:],
-            returns=returns[:, 25:],
-        ),
-        "full": DatasetSplit(
-            name="full",
-            indices=np.arange(50),
-            timestamps=timestamps,
-            returns=returns,
-        ),
-    }
-
-    return EvaluationDataset(
-        data_dict=data_dict,
-        data_tensor=data_tensor,
-        returns=returns,
-        timestamps=timestamps,
-        asset_ids=np.array([f"A{i:02d}" for i in range(returns.shape[0])]),
-        splits=splits,
-        processed_df=pd.DataFrame(),
-    )
-
-
-def test_evaluate_factors_matches_direct_metric_computation(small_data):
-    """Shared runtime evaluation should match direct metric recomputation."""
-    dataset = _build_dataset(small_data)
-    factor = Factor(
-        id=1,
-        name="close_neg",
-        formula="Neg($close)",
-        category="test",
-        ic_mean=99.0,
-        icir=88.0,
-        ic_win_rate=0.99,
-        max_correlation=0.0,
-        batch_number=1,
-    )
-
-    artifact = evaluate_factors([factor], dataset, signal_failure_policy="reject")[0]
-    tree = try_parse(factor.formula)
-    signals = tree.evaluate(dataset.data_dict)
-    expected_train = compute_factor_stats(signals[:, :25], dataset.returns[:, :25])
-    expected_test = compute_factor_stats(signals[:, 25:], dataset.returns[:, 25:])
-
-    assert artifact.succeeded
-    np.testing.assert_allclose(
-        artifact.split_stats["train"]["ic_series"],
-        expected_train["ic_series"],
-        equal_nan=True,
-    )
-    np.testing.assert_allclose(
-        artifact.split_stats["test"]["ic_series"],
-        expected_test["ic_series"],
-        equal_nan=True,
-    )
-    assert artifact.split_stats["train"]["ic_mean"] == pytest.approx(
-        expected_train["ic_mean"]
-    )
-    assert artifact.split_stats["test"]["long_short"] == pytest.approx(
-        expected_test["long_short"]
-    )
-    assert artifact.split_stats["train"]["turnover"] == pytest.approx(
-        expected_train["turnover"]
-    )
-
-
-def test_compute_tree_signals_obeys_failure_policy():
-    """Signal failures should reject, synthesize, or raise explicitly."""
-    tree = try_parse("Neg($close)")
-    returns_shape = (3, 7)
-
-    with pytest.raises(SignalComputationError):
-        compute_tree_signals(
-            tree,
-            data_dict={},
-            returns_shape=returns_shape,
-            signal_failure_policy="reject",
-        )
-
-    synthetic = compute_tree_signals(
-        tree,
-        data_dict={},
-        returns_shape=returns_shape,
-        signal_failure_policy="synthetic",
-    )
-    assert synthetic.shape == returns_shape
-    assert np.isfinite(synthetic).sum() > 0
-
-    with pytest.raises(Exception):
-        compute_tree_signals(
-            tree,
-            data_dict={},
-            returns_shape=returns_shape,
-            signal_failure_policy="raise",
-        )
-
-
-def test_evaluate_factors_records_strict_recomputation_failure(small_data):
-    """Strict evaluation should record failures instead of hiding them."""
-    dataset = _build_dataset(dict(small_data, **{"$close": np.full((10, 50), np.nan)}))
-    factor = Factor(
-        id=7,
-        name="broken_close",
-        formula="Neg($close)",
-        category="test",
-        ic_mean=0.0,
-        icir=0.0,
-        ic_win_rate=0.0,
-        max_correlation=0.0,
-        batch_number=1,
-    )
-
-    artifact = evaluate_factors([factor], dataset, signal_failure_policy="reject")[0]
-
-    assert not artifact.succeeded
-    assert "Signal computation produced only NaN values" in artifact.error
-
-
-def test_build_core_mining_config_uses_synthetic_policy_for_mock():
-    """Mock mining flows should opt into synthetic fallback explicitly."""
-    from factorminer.cli import _build_core_mining_config
-
-    cfg = SimpleNamespace(
-        mining=SimpleNamespace(
-            target_library_size=10,
-            batch_size=5,
-            max_iterations=3,
-            ic_threshold=0.02,
-            icir_threshold=0.3,
-            correlation_threshold=0.7,
-            replacement_ic_min=0.10,
-            replacement_ic_ratio=1.3,
-        ),
-        evaluation=SimpleNamespace(
-            fast_screen_assets=10,
-            num_workers=1,
-            backend="numpy",
-            gpu_device="cuda:0",
-            signal_failure_policy="reject",
-        ),
-    )
-
-    strict_cfg = _build_core_mining_config(cfg, output_dir=Path("/tmp"), mock=False)
-    mock_cfg = _build_core_mining_config(cfg, output_dir=Path("/tmp"), mock=True)
-
-    assert strict_cfg.signal_failure_policy == "reject"
-    assert mock_cfg.signal_failure_policy == "synthetic"
diff --git a/src/factorminer/factorminer/tests/test_significance.py b/src/factorminer/factorminer/tests/test_significance.py
deleted file mode 100644
index e4afd1d..0000000
--- a/src/factorminer/factorminer/tests/test_significance.py
+++ /dev/null
@@ -1,174 +0,0 @@
-"""Tests for statistical significance testing (evaluation/significance.py)."""
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from src.factorminer.factorminer.evaluation.significance import (
-    BootstrapCIResult,
-    BootstrapICTester,
-    DeflatedSharpeCalculator,
-    DeflatedSharpeResult,
-    FDRController,
-    FDRResult,
-    SignificanceConfig,
-)
-
-
-@pytest.fixture
-def config():
-    return SignificanceConfig(
-        bootstrap_n_samples=500,
-        bootstrap_block_size=10,
-        bootstrap_confidence=0.95,
-        fdr_level=0.05,
-        seed=42,
-    )
-
-
-# -----------------------------------------------------------------------
-# BootstrapICTester: strong signal -> CI excludes zero
-# -----------------------------------------------------------------------
-
-def test_bootstrap_strong_signal_excludes_zero(config):
-    """A consistently high IC (0.10) should have CI that excludes zero."""
-    T = 200
-    ic_series = np.full(T, 0.10) + np.random.default_rng(42).normal(0, 0.01, T)
-
-    tester = BootstrapICTester(config)
-    result = tester.compute_ci("strong_factor", ic_series)
-
-    assert result.ci_excludes_zero is True
-    assert result.ci_lower > 0
-    assert result.ic_mean > 0.08
-
-
-# -----------------------------------------------------------------------
-# BootstrapICTester: weak signal -> CI includes zero
-# -----------------------------------------------------------------------
-
-def test_bootstrap_weak_signal_includes_zero(config):
-    """A near-zero IC should have CI that includes zero."""
-    T = 200
-    rng = np.random.default_rng(123)
-    ic_series = rng.normal(0.0, 0.05, T)  # mean ~0
-
-    tester = BootstrapICTester(config)
-    result = tester.compute_ci("weak_factor", ic_series)
-
-    # The CI for |IC| may or may not include zero depending on noise,
-    # but the result should be a valid BootstrapCIResult
-    assert isinstance(result, BootstrapCIResult)
-    assert result.ci_lower <= result.ci_upper
-
-
-def test_bootstrap_p_value_distinguishes_signal_from_noise(config):
-    """The sign-flip p-value should be small for signal and large for noise."""
-    rng = np.random.default_rng(7)
-    strong_ic = 0.08 + rng.normal(0.0, 0.01, 200)
-    weak_ic = rng.normal(0.0, 0.05, 200)
-
-    tester = BootstrapICTester(config)
-
-    strong_p = tester.compute_p_value(strong_ic)
-    weak_p = tester.compute_p_value(weak_ic)
-
-    assert strong_p < 0.05
-    assert weak_p > 0.05
-
-
-# -----------------------------------------------------------------------
-# FDRController: BH procedure
-# -----------------------------------------------------------------------
-
-def test_fdr_batch_evaluate_separates_signal_from_noise(config):
-    """Batch FDR should keep the strong series and reject the weak one."""
-    strong_ic = np.full(200, 0.08)
-    weak_ic = np.tile(np.array([0.05, -0.05]), 100)
-
-    tester = BootstrapICTester(config)
-    controller = FDRController(config)
-    result = controller.batch_evaluate(
-        {"strong_factor": strong_ic, "weak_factor": weak_ic},
-        tester,
-    )
-
-    assert result.significant["strong_factor"]
-    assert not result.significant["weak_factor"]
-    assert result.n_discoveries == 1
-
-def test_fdr_bh_procedure(config):
-    """10 factors with p-values [0.001, ..., 0.010] at FDR=0.05."""
-    # Use small enough p-values that BH adjustment still yields significance
-    p_values = {f"f{i}": 0.001 * (i + 1) for i in range(10)}
-    controller = FDRController(config)
-    result = controller.apply_fdr(p_values)
-
-    assert isinstance(result, FDRResult)
-    assert result.fdr_level == 0.05
-    # With BH at 0.05 and raw p in [0.001..0.010], adjusted p for f0 = 0.001*10/1 = 0.01 < 0.05
-    assert result.n_discoveries >= 1
-    assert result.significant["f0"] == True  # p=0.001, adjusted=0.01
-
-
-def test_fdr_all_significant(config):
-    """All p=0.001 should be significant after BH."""
-    p_values = {f"f{i}": 0.001 for i in range(10)}
-    controller = FDRController(config)
-    result = controller.apply_fdr(p_values)
-
-    assert result.n_discoveries == 10
-    for name, sig in result.significant.items():
-        assert sig == True
-
-
-def test_fdr_empty_input(config):
-    """Empty p-value dict should return empty result."""
-    controller = FDRController(config)
-    result = controller.apply_fdr({})
-    assert result.n_discoveries == 0
-    assert result.significant == {}
-
-
-# -----------------------------------------------------------------------
-# DeflatedSharpeCalculator
-# -----------------------------------------------------------------------
-
-def test_deflated_sharpe_with_known_returns(config):
-    """Verify DSR computation with known returns and n_trials."""
-    rng = np.random.default_rng(42)
-    T = 500
-    # Strong positive returns
-    ls_returns = rng.normal(0.001, 0.01, T)
-
-    calc = DeflatedSharpeCalculator(config)
-    result = calc.compute("good_factor", ls_returns, n_trials=10)
-
-    assert isinstance(result, DeflatedSharpeResult)
-    assert result.raw_sharpe > 0
-    assert result.n_trials == 10
-    assert result.haircut >= 0 or result.haircut < 0  # can be negative
-
-
-def test_deflated_sharpe_many_trials_penalizes(config):
-    """More trials should increase the haircut (higher expected max SR)."""
-    rng = np.random.default_rng(42)
-    T = 500
-    ls_returns = rng.normal(0.001, 0.01, T)
-
-    calc = DeflatedSharpeCalculator(config)
-    result_few = calc.compute("factor", ls_returns, n_trials=5)
-    result_many = calc.compute("factor", ls_returns, n_trials=500)
-
-    # With more trials, the deflated SR should be lower
-    assert result_many.deflated_sharpe <= result_few.deflated_sharpe
-
-
-def test_deflated_sharpe_short_series(config):
-    """Very short series (<10) should return default failing result."""
-    ls_returns = np.array([0.01, 0.02, 0.01])
-    calc = DeflatedSharpeCalculator(config)
-    result = calc.compute("short", ls_returns, n_trials=10)
-    assert result.passes is False
-    assert result.raw_sharpe == 0.0
diff --git a/src/factorminer/factorminer/utils/__init__.py b/src/factorminer/factorminer/utils/__init__.py
deleted file mode 100644
index f4982d8..0000000
--- a/src/factorminer/factorminer/utils/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""Utility modules for FactorMiner."""
-
-from src.factorminer.factorminer.utils.config import (
-    AutoInventorConfig,
-    CapacityConfig,
-    CausalConfig,
-    Config,
-    DebateConfig,
-    HelixConfig,
-    MiningConfig,
-    Phase2Config,
-    RegimeConfig,
-    SignificanceConfig,
-    load_config,
-)
-from src.factorminer.factorminer.utils.reporting import MiningReporter
-from src.factorminer.factorminer.utils.tearsheet import FactorTearSheet
-from src.factorminer.factorminer.utils.visualization import (
-    plot_ablation_comparison,
-    plot_correlation_heatmap,
-    plot_cost_pressure,
-    plot_efficiency_benchmark,
-    plot_ic_timeseries,
-    plot_mining_funnel,
-    plot_quintile_returns,
-)
diff --git a/src/factorminer/factorminer/utils/config.py b/src/factorminer/factorminer/utils/config.py
deleted file mode 100644
index c0c8640..0000000
--- a/src/factorminer/factorminer/utils/config.py
+++ /dev/null
@@ -1,741 +0,0 @@
-"""Configuration loading, validation, and management for FactorMiner."""
-
-from __future__ import annotations
-
-import copy
-from dataclasses import dataclass, field, asdict
-from pathlib import Path
-from typing import Any
-
-import yaml
-
-from src.factorminer.factorminer.configs import DEFAULT_CONFIG_PATH
-
-
-@dataclass
-class MiningConfig:
-    """Parameters controlling the factor mining loop."""
-
-    target_library_size: int = 110
-    batch_size: int = 40
-    max_iterations: int = 200
-    ic_threshold: float = 0.04
-    icir_threshold: float = 0.5
-    correlation_threshold: float = 0.5
-    replacement_ic_min: float = 0.10
-    replacement_ic_ratio: float = 1.3
-
-    def validate(self) -> None:
-        if self.target_library_size < 1:
-            raise ValueError("target_library_size must be >= 1")
-        if self.batch_size < 1:
-            raise ValueError("batch_size must be >= 1")
-        if self.max_iterations < 1:
-            raise ValueError("max_iterations must be >= 1")
-        if not (0.0 < self.ic_threshold < 1.0):
-            raise ValueError("ic_threshold must be in (0, 1)")
-        if not (0.0 < self.icir_threshold < 10.0):
-            raise ValueError("icir_threshold must be in (0, 10)")
-        if not (0.0 < self.correlation_threshold <= 1.0):
-            raise ValueError("correlation_threshold must be in (0, 1]")
-        if self.replacement_ic_min <= self.ic_threshold:
-            raise ValueError("replacement_ic_min must be > ic_threshold")
-        if self.replacement_ic_ratio < 1.0:
-            raise ValueError("replacement_ic_ratio must be >= 1.0")
-
-
-@dataclass
-class EvaluationConfig:
-    """Parameters for factor evaluation."""
-
-    num_workers: int = 40
-    fast_screen_assets: int = 100
-    gpu_device: str = "cuda:0"
-    backend: str = "gpu"
-    signal_failure_policy: str = "reject"
-
-    def validate(self) -> None:
-        if self.num_workers < 1:
-            raise ValueError("num_workers must be >= 1")
-        if self.fast_screen_assets < 1:
-            raise ValueError("fast_screen_assets must be >= 1")
-        if self.backend not in ("gpu", "numpy", "c"):
-            raise ValueError(f"backend must be one of: gpu, numpy, c (got '{self.backend}')")
-        if self.signal_failure_policy not in ("reject", "synthetic", "raise"):
-            raise ValueError(
-                "signal_failure_policy must be one of: reject, synthetic, raise"
-            )
-
-
-@dataclass
-class DataConfig:
-    """Parameters for data loading and universes."""
-
-    market: str = "a_shares"
-    universe: str = "CSI500"
-    frequency: str = "10min"
-    features: list[str] = field(
-        default_factory=lambda: [
-            "$open", "$high", "$low", "$close",
-            "$volume", "$amt", "$vwap", "$returns",
-        ]
-    )
-    train_period: list[str] = field(
-        default_factory=lambda: ["2024-01-01", "2024-12-31"]
-    )
-    test_period: list[str] = field(
-        default_factory=lambda: ["2025-01-01", "2025-12-31"]
-    )
-    targets: list[dict[str, Any]] = field(
-        default_factory=lambda: [
-            {
-                "name": "paper",
-                "entry_delay_bars": 1,
-                "holding_bars": 1,
-                "price_pair": "open_to_close",
-                "return_transform": "simple",
-            }
-        ]
-    )
-    default_target: str = "paper"
-
-    def validate(self) -> None:
-        if len(self.train_period) != 2:
-            raise ValueError("train_period must be a list of [start, end]")
-        if len(self.test_period) != 2:
-            raise ValueError("test_period must be a list of [start, end]")
-        if self.train_period[0] >= self.train_period[1]:
-            raise ValueError("train_period start must be before end")
-        if self.test_period[0] >= self.test_period[1]:
-            raise ValueError("test_period start must be before end")
-        if not self.features:
-            raise ValueError("features must not be empty")
-        if not self.targets:
-            raise ValueError("data.targets must not be empty")
-        target_names: list[str] = []
-        for target in self.targets:
-            if not isinstance(target, dict):
-                raise ValueError("each data.targets entry must be a mapping")
-            name = str(target.get("name", "")).strip()
-            if not name:
-                raise ValueError("each data.targets entry must define a non-empty name")
-            target_names.append(name)
-            if int(target.get("entry_delay_bars", 0)) < 0:
-                raise ValueError("entry_delay_bars must be >= 0")
-            if int(target.get("holding_bars", 0)) < 0:
-                raise ValueError("holding_bars must be >= 0")
-            if target.get("price_pair") not in (
-                "open_to_close",
-                "close_to_close",
-                "open_to_open",
-                "close_to_open",
-            ):
-                raise ValueError(
-                    "price_pair must be one of: open_to_close, close_to_close, "
-                    "open_to_open, close_to_open"
-                )
-            if target.get("return_transform", "simple") not in ("simple", "log"):
-                raise ValueError("return_transform must be one of: simple, log")
-        if len(set(target_names)) != len(target_names):
-            raise ValueError("data.targets names must be unique")
-        if self.default_target not in set(target_names):
-            raise ValueError("data.default_target must match one of data.targets[*].name")
-
-
-@dataclass
-class LLMConfig:
-    """Parameters for LLM-based factor generation."""
-
-    provider: str = "google"
-    model: str = "gemini-2.0-flash"
-    temperature: float = 0.8
-    max_tokens: int = 4096
-    batch_candidates: int = 40
-
-    def validate(self) -> None:
-        if self.provider not in ("google", "openai", "anthropic", "mock"):
-            raise ValueError(
-                f"provider must be one of: google, openai, anthropic, mock "
-                f"(got '{self.provider}')"
-            )
-        if not (0.0 <= self.temperature <= 2.0):
-            raise ValueError("temperature must be in [0, 2]")
-        if self.max_tokens < 1:
-            raise ValueError("max_tokens must be >= 1")
-        if self.batch_candidates < 1:
-            raise ValueError("batch_candidates must be >= 1")
-
-
-@dataclass
-class MemoryConfig:
-    """Parameters for the experience memory system."""
-
-    max_success_patterns: int = 50
-    max_failure_patterns: int = 100
-    max_insights: int = 30
-    consolidation_interval: int = 10
-
-    def validate(self) -> None:
-        if self.max_success_patterns < 1:
-            raise ValueError("max_success_patterns must be >= 1")
-        if self.max_failure_patterns < 1:
-            raise ValueError("max_failure_patterns must be >= 1")
-        if self.max_insights < 1:
-            raise ValueError("max_insights must be >= 1")
-        if self.consolidation_interval < 1:
-            raise ValueError("consolidation_interval must be >= 1")
-
-
-@dataclass
-class CausalConfig:
-    """Parameters for causal validation (Granger + intervention tests)."""
-
-    enabled: bool = False
-    granger_max_lag: int = 5
-    granger_significance: float = 0.05
-    n_interventions: int = 3
-    intervention_magnitude: float = 2.0
-    intervention_ic_threshold: float = 0.5
-    robustness_threshold: float = 0.4
-    granger_weight: float = 0.4
-    intervention_weight: float = 0.6
-
-    def validate(self) -> None:
-        if self.granger_max_lag < 1:
-            raise ValueError("granger_max_lag must be >= 1")
-        if not (0.0 < self.granger_significance < 1.0):
-            raise ValueError("granger_significance must be in (0, 1)")
-        if self.n_interventions < 1:
-            raise ValueError("n_interventions must be >= 1")
-        if self.intervention_magnitude <= 0.0:
-            raise ValueError("intervention_magnitude must be > 0")
-        if not (0.0 <= self.intervention_ic_threshold <= 1.0):
-            raise ValueError("intervention_ic_threshold must be in [0, 1]")
-        if not (0.0 <= self.robustness_threshold <= 1.0):
-            raise ValueError("robustness_threshold must be in [0, 1]")
-        if not (0.0 <= self.granger_weight <= 1.0):
-            raise ValueError("granger_weight must be in [0, 1]")
-        if not (0.0 <= self.intervention_weight <= 1.0):
-            raise ValueError("intervention_weight must be in [0, 1]")
-        if abs(self.granger_weight + self.intervention_weight - 1.0) > 1e-6:
-            raise ValueError("granger_weight + intervention_weight must equal 1.0")
-
-
-@dataclass
-class RegimeConfig:
-    """Parameters for regime-conditional factor evaluation."""
-
-    enabled: bool = False
-    lookback_window: int = 60
-    bull_return_threshold: float = 0.0
-    bear_return_threshold: float = 0.0
-    volatility_percentile: float = 0.7
-    min_regime_ic: float = 0.03
-    min_regimes_passing: int = 2
-
-    def validate(self) -> None:
-        if self.lookback_window < 5:
-            raise ValueError("lookback_window must be >= 5")
-        if not (0.0 < self.volatility_percentile < 1.0):
-            raise ValueError("volatility_percentile must be in (0, 1)")
-        if self.min_regime_ic < 0.0:
-            raise ValueError("min_regime_ic must be >= 0")
-        if not (1 <= self.min_regimes_passing <= 4):
-            raise ValueError("min_regimes_passing must be in [1, 4]")
-
-
-@dataclass
-class CapacityConfig:
-    """Parameters for strategy capacity estimation."""
-
-    enabled: bool = False
-    base_capital_usd: float = 1e8
-    ic_degradation_limit: float = 0.20
-    net_icir_threshold: float = 0.3
-    sigma_annual: float = 0.25
-
-    def validate(self) -> None:
-        if self.base_capital_usd <= 0.0:
-            raise ValueError("base_capital_usd must be > 0")
-        if not (0.0 < self.ic_degradation_limit < 1.0):
-            raise ValueError("ic_degradation_limit must be in (0, 1)")
-        if self.net_icir_threshold < 0.0:
-            raise ValueError("net_icir_threshold must be >= 0")
-        if self.sigma_annual <= 0.0:
-            raise ValueError("sigma_annual must be > 0")
-
-
-@dataclass
-class SignificanceConfig:
-    """Parameters for statistical significance testing."""
-
-    enabled: bool = False
-    bootstrap_n_samples: int = 1000
-    bootstrap_block_size: int = 20
-    fdr_level: float = 0.05
-    deflated_sharpe_enabled: bool = True
-    min_deflated_sharpe: float = 0.0
-
-    def validate(self) -> None:
-        if self.bootstrap_n_samples < 100:
-            raise ValueError("bootstrap_n_samples must be >= 100")
-        if self.bootstrap_block_size < 1:
-            raise ValueError("bootstrap_block_size must be >= 1")
-        if not (0.0 < self.fdr_level < 1.0):
-            raise ValueError("fdr_level must be in (0, 1)")
-
-
-@dataclass
-class DebateConfig:
-    """Parameters for multi-specialist debate-based generation."""
-
-    enabled: bool = False
-    num_specialists: int = 3
-    candidates_per_specialist: int = 15
-    enable_critic: bool = True
-    top_k_after_critic: int = 40
-    critic_temperature: float = 0.3
-
-    def validate(self) -> None:
-        if self.num_specialists < 1:
-            raise ValueError("num_specialists must be >= 1")
-        if self.candidates_per_specialist < 1:
-            raise ValueError("candidates_per_specialist must be >= 1")
-        if self.top_k_after_critic < 1:
-            raise ValueError("top_k_after_critic must be >= 1")
-        if not (0.0 <= self.critic_temperature <= 2.0):
-            raise ValueError("critic_temperature must be in [0, 2]")
-
-
-@dataclass
-class AutoInventorConfig:
-    """Parameters for automatic operator invention."""
-
-    enabled: bool = False
-    invention_interval: int = 10
-    max_proposals_per_round: int = 5
-    min_ic_contribution: float = 0.03
-    store_dir: str = "./output/custom_operators"
-
-    def validate(self) -> None:
-        if self.invention_interval < 1:
-            raise ValueError("invention_interval must be >= 1")
-        if self.max_proposals_per_round < 1:
-            raise ValueError("max_proposals_per_round must be >= 1")
-        if self.min_ic_contribution < 0.0:
-            raise ValueError("min_ic_contribution must be >= 0")
-
-
-@dataclass
-class HelixConfig:
-    """Parameters for the Helix knowledge and memory system."""
-
-    enabled: bool = False
-    enable_knowledge_graph: bool = False
-    enable_embeddings: bool = False
-    enable_canonicalization: bool = True
-    forgetting_lambda: float = 0.95
-    forgetting_demotion_threshold: int = 20
-
-    def validate(self) -> None:
-        if not (0.0 < self.forgetting_lambda <= 1.0):
-            raise ValueError("forgetting_lambda must be in (0, 1]")
-        if self.forgetting_demotion_threshold < 1:
-            raise ValueError("forgetting_demotion_threshold must be >= 1")
-
-
-@dataclass
-class Phase2Config:
-    """Aggregated configuration for all Phase 2 subsystems."""
-
-    causal: CausalConfig = field(default_factory=CausalConfig)
-    regime: RegimeConfig = field(default_factory=RegimeConfig)
-    capacity: CapacityConfig = field(default_factory=CapacityConfig)
-    significance: SignificanceConfig = field(default_factory=SignificanceConfig)
-    debate: DebateConfig = field(default_factory=DebateConfig)
-    auto_inventor: AutoInventorConfig = field(default_factory=AutoInventorConfig)
-    helix: HelixConfig = field(default_factory=HelixConfig)
-
-    def validate(self) -> None:
-        for sub in [
-            self.causal,
-            self.regime,
-            self.capacity,
-            self.significance,
-            self.debate,
-            self.auto_inventor,
-            self.helix,
-        ]:
-            sub.validate()
-
-
-@dataclass
-class BenchmarkConfig:
-    """Parameters for paper/research benchmark execution."""
-
-    mode: str = "paper"
-    seed: int = 42
-    freeze_top_k: int = 40
-    freeze_universe: str = "CSI500"
-    report_universes: list[str] = field(
-        default_factory=lambda: ["CSI500", "CSI1000", "HS300", "Binance"]
-    )
-    baselines: list[str] = field(
-        default_factory=lambda: [
-            "alpha101_classic",
-            "alpha101_adapted",
-            "random_exploration",
-            "gplearn",
-            "alphaforge_style",
-            "alphaagent_style",
-            "factor_miner",
-            "factor_miner_no_memory",
-        ]
-    )
-    cost_bps: list[float] = field(default_factory=lambda: [1.0, 4.0, 7.0, 10.0, 11.0])
-    efficiency_panel_shape: list[int] = field(default_factory=lambda: [12610, 500])
-
-    def validate(self) -> None:
-        if self.mode not in ("paper", "research"):
-            raise ValueError("benchmark.mode must be one of: paper, research")
-        if self.freeze_top_k < 1:
-            raise ValueError("benchmark.freeze_top_k must be >= 1")
-        if not self.freeze_universe:
-            raise ValueError("benchmark.freeze_universe must not be empty")
-        if not self.report_universes:
-            raise ValueError("benchmark.report_universes must not be empty")
-        if any(not universe for universe in self.report_universes):
-            raise ValueError("benchmark.report_universes must not contain empty entries")
-        if not self.baselines:
-            raise ValueError("benchmark.baselines must not be empty")
-        if any(cost < 0 for cost in self.cost_bps):
-            raise ValueError("benchmark.cost_bps must be non-negative")
-        if len(self.efficiency_panel_shape) != 2:
-            raise ValueError("benchmark.efficiency_panel_shape must be [periods, assets]")
-        if any(dim < 1 for dim in self.efficiency_panel_shape):
-            raise ValueError("benchmark.efficiency_panel_shape values must be >= 1")
-
-
-@dataclass
-class ResearchUncertaintyConfig:
-    """Uncertainty controls for multi-horizon research scoring."""
-
-    bootstrap_samples: int = 200
-    block_size: int = 20
-    shrinkage_strength: float = 1.0
-    lcb_zscore: float = 1.0
-    fdr_alpha: float = 0.05
-
-    def validate(self) -> None:
-        if self.bootstrap_samples < 10:
-            raise ValueError("research.uncertainty.bootstrap_samples must be >= 10")
-        if self.block_size < 1:
-            raise ValueError("research.uncertainty.block_size must be >= 1")
-        if self.shrinkage_strength < 0.0:
-            raise ValueError("research.uncertainty.shrinkage_strength must be >= 0")
-        if self.lcb_zscore < 0.0:
-            raise ValueError("research.uncertainty.lcb_zscore must be >= 0")
-        if not (0.0 < self.fdr_alpha < 1.0):
-            raise ValueError("research.uncertainty.fdr_alpha must be in (0, 1)")
-
-
-@dataclass
-class ResearchAdmissionConfig:
-    """Research-mode admission controls."""
-
-    use_residual_ic: bool = True
-    use_effective_rank_gain: bool = True
-    turnover_penalty: float = 0.05
-    redundancy_penalty: float = 0.20
-    min_score: float = 0.04
-    min_lcb: float = 0.0
-    min_span_gain: float = 0.05
-    min_effective_rank_gain: float = 0.0
-
-    def validate(self) -> None:
-        if self.turnover_penalty < 0.0:
-            raise ValueError("research.admission.turnover_penalty must be >= 0")
-        if self.redundancy_penalty < 0.0:
-            raise ValueError("research.admission.redundancy_penalty must be >= 0")
-        if self.min_score < 0.0:
-            raise ValueError("research.admission.min_score must be >= 0")
-        if self.min_span_gain < 0.0:
-            raise ValueError("research.admission.min_span_gain must be >= 0")
-
-
-@dataclass
-class ResearchSelectionConfig:
-    """Research-mode model configuration."""
-
-    models: list[str] = field(
-        default_factory=lambda: ["ridge", "elastic_net", "lasso", "xgboost"]
-    )
-    rolling_train_window: int = 80
-    rolling_test_window: int = 20
-    rolling_step: int = 20
-
-    def validate(self) -> None:
-        allowed = {
-            "ridge",
-            "elastic_net",
-            "lasso",
-            "stepwise",
-            "xgboost",
-        }
-        if not self.models:
-            raise ValueError("research.selection.models must not be empty")
-        if any(model not in allowed for model in self.models):
-            raise ValueError(
-                "research.selection.models entries must be one of: "
-                "ridge, elastic_net, lasso, stepwise, xgboost"
-            )
-        if self.rolling_train_window < 5:
-            raise ValueError("research.selection.rolling_train_window must be >= 5")
-        if self.rolling_test_window < 1:
-            raise ValueError("research.selection.rolling_test_window must be >= 1")
-        if self.rolling_step < 1:
-            raise ValueError("research.selection.rolling_step must be >= 1")
-
-
-@dataclass
-class ResearchRegimesConfig:
-    """Research-mode regime diagnostics."""
-
-    enabled: bool = False
-    definition: str = "return_volatility_liquidity"
-
-    def validate(self) -> None:
-        if self.definition not in (
-            "return_volatility",
-            "return_volatility_liquidity",
-        ):
-            raise ValueError(
-                "research.regimes.definition must be one of: "
-                "return_volatility, return_volatility_liquidity"
-            )
-
-
-@dataclass
-class ResearchExecutionConfig:
-    """Execution-aware research scoring controls."""
-
-    cost_model: str = "linear_bps"
-    cost_bps: float = 4.0
-
-    def validate(self) -> None:
-        if self.cost_model not in ("linear_bps",):
-            raise ValueError("research.execution.cost_model must be 'linear_bps'")
-        if self.cost_bps < 0.0:
-            raise ValueError("research.execution.cost_bps must be >= 0")
-
-
-@dataclass
-class ResearchConfig:
-    """Research-first multi-horizon scoring configuration."""
-
-    enabled: bool = False
-    primary_objective: str = "weighted_multi_horizon"
-    target_aggregation: str = "weighted"
-    horizon_weights: dict[str, float] = field(default_factory=dict)
-    uncertainty: ResearchUncertaintyConfig = field(default_factory=ResearchUncertaintyConfig)
-    admission: ResearchAdmissionConfig = field(default_factory=ResearchAdmissionConfig)
-    selection: ResearchSelectionConfig = field(default_factory=ResearchSelectionConfig)
-    regimes: ResearchRegimesConfig = field(default_factory=ResearchRegimesConfig)
-    execution: ResearchExecutionConfig = field(default_factory=ResearchExecutionConfig)
-
-    def validate(self) -> None:
-        if self.primary_objective not in (
-            "single_horizon",
-            "weighted_multi_horizon",
-            "pareto_multi_horizon",
-            "net_ir",
-        ):
-            raise ValueError(
-                "research.primary_objective must be one of: "
-                "single_horizon, weighted_multi_horizon, pareto_multi_horizon, net_ir"
-            )
-        if self.target_aggregation not in ("weighted", "pareto"):
-            raise ValueError(
-                "research.target_aggregation must be one of: weighted, pareto"
-            )
-        if any(weight < 0.0 for weight in self.horizon_weights.values()):
-            raise ValueError("research.horizon_weights values must be >= 0")
-        self.uncertainty.validate()
-        self.admission.validate()
-        self.selection.validate()
-        self.regimes.validate()
-        self.execution.validate()
-
-
-@dataclass
-class Config:
-    """Top-level configuration aggregating all sub-configs."""
-
-    mining: MiningConfig = field(default_factory=MiningConfig)
-    evaluation: EvaluationConfig = field(default_factory=EvaluationConfig)
-    data: DataConfig = field(default_factory=DataConfig)
-    llm: LLMConfig = field(default_factory=LLMConfig)
-    memory: MemoryConfig = field(default_factory=MemoryConfig)
-    phase2: Phase2Config = field(default_factory=Phase2Config)
-    benchmark: BenchmarkConfig = field(default_factory=BenchmarkConfig)
-    research: ResearchConfig = field(default_factory=ResearchConfig)
-
-    def validate(self) -> None:
-        """Validate all sub-configurations."""
-        self.mining.validate()
-        self.evaluation.validate()
-        self.data.validate()
-        self.llm.validate()
-        self.memory.validate()
-        self.phase2.validate()
-        self.benchmark.validate()
-        self.research.validate()
-
-    def to_dict(self) -> dict[str, Any]:
-        """Serialize config to a plain dictionary."""
-        return asdict(self)
-
-    def save(self, path: str | Path) -> None:
-        """Write config to a YAML file."""
-        path = Path(path)
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with open(path, "w") as f:
-            yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)
-
-
-_SECTION_MAP: dict[str, type] = {
-    "mining": MiningConfig,
-    "evaluation": EvaluationConfig,
-    "data": DataConfig,
-    "llm": LLMConfig,
-    "memory": MemoryConfig,
-    "phase2": Phase2Config,
-    "benchmark": BenchmarkConfig,
-    "research": ResearchConfig,
-}
-
-_PHASE2_SECTION_MAP: dict[str, type] = {
-    "causal": CausalConfig,
-    "regime": RegimeConfig,
-    "capacity": CapacityConfig,
-    "significance": SignificanceConfig,
-    "debate": DebateConfig,
-    "auto_inventor": AutoInventorConfig,
-    "helix": HelixConfig,
-}
-
-_RESEARCH_SECTION_MAP: dict[str, type] = {
-    "uncertainty": ResearchUncertaintyConfig,
-    "admission": ResearchAdmissionConfig,
-    "selection": ResearchSelectionConfig,
-    "regimes": ResearchRegimesConfig,
-    "execution": ResearchExecutionConfig,
-}
-
-
-def _deep_merge(base: dict, override: dict) -> dict:
-    """Recursively merge override into base, returning a new dict."""
-    result = copy.deepcopy(base)
-    for key, value in override.items():
-        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
-            result[key] = _deep_merge(result[key], value)
-        else:
-            result[key] = copy.deepcopy(value)
-    return result
-
-
-def _load_yaml(path: Path) -> dict[str, Any]:
-    """Load a YAML file and return its contents as a dict."""
-    with open(path) as f:
-        data = yaml.safe_load(f)
-    if data is None:
-        return {}
-    if not isinstance(data, dict):
-        raise ValueError(f"Config file {path} must contain a YAML mapping at the top level")
-    return data
-
-
-def _build_section(section_cls: type, raw: dict[str, Any]) -> Any:
-    """Instantiate a config dataclass, ignoring unknown keys."""
-    valid_fields = {f.name for f in section_cls.__dataclass_fields__.values()}
-    filtered = {k: v for k, v in raw.items() if k in valid_fields}
-    return section_cls(**filtered)
-
-
-def _build_phase2(raw: dict[str, Any]) -> Phase2Config:
-    """Build Phase2Config with nested sub-config dataclasses."""
-    subs = {}
-    for sub_name, sub_cls in _PHASE2_SECTION_MAP.items():
-        sub_raw = raw.get(sub_name, {})
-        if isinstance(sub_raw, dict):
-            subs[sub_name] = _build_section(sub_cls, sub_raw)
-        else:
-            subs[sub_name] = sub_cls()
-    return Phase2Config(**subs)
-
-
-def _build_research(raw: dict[str, Any]) -> ResearchConfig:
-    """Build ResearchConfig with nested sub-config dataclasses."""
-    subs = {}
-    valid_fields = {f.name for f in ResearchConfig.__dataclass_fields__.values()}
-    for key, value in raw.items():
-        if key in valid_fields and key not in _RESEARCH_SECTION_MAP:
-            subs[key] = copy.deepcopy(value)
-
-    for sub_name, sub_cls in _RESEARCH_SECTION_MAP.items():
-        sub_raw = raw.get(sub_name, {})
-        if isinstance(sub_raw, dict):
-            subs[sub_name] = _build_section(sub_cls, sub_raw)
-        else:
-            subs[sub_name] = sub_cls()
-
-    return ResearchConfig(**subs)
-
-
-def load_config(
-    config_path: str | Path | None = None,
-    overrides: dict[str, Any] | None = None,
-) -> Config:
-    """Load configuration from YAML with defaults and optional overrides.
-
-    Resolution order:
-        1. Built-in defaults (default.yaml shipped with the package)
-        2. User-provided config file (if given)
-        3. Programmatic overrides dict (if given)
-
-    Args:
-        config_path: Path to a user YAML config file. If None, only defaults are used.
-        overrides: Dict of overrides keyed by section, e.g.
-            {"mining": {"batch_size": 20}, "llm": {"model": "gpt-4"}}.
-
-    Returns:
-        A fully validated Config instance.
-    """
-    # 1. Load package defaults
-    defaults = _load_yaml(DEFAULT_CONFIG_PATH)
-
-    # 2. Merge user config
-    if config_path is not None:
-        user_cfg = _load_yaml(Path(config_path))
-        merged = _deep_merge(defaults, user_cfg)
-    else:
-        merged = defaults
-
-    # 3. Merge programmatic overrides
-    if overrides:
-        merged = _deep_merge(merged, overrides)
-
-    # 4. Build typed config objects
-    sections = {}
-    for section_name, section_cls in _SECTION_MAP.items():
-        raw = merged.get(section_name, {})
-        if section_name == "phase2":
-            sections[section_name] = _build_phase2(raw)
-        elif section_name == "research":
-            sections[section_name] = _build_research(raw)
-        else:
-            sections[section_name] = _build_section(section_cls, raw)
-
-    config = Config(**sections)
-    config.validate()
-    return config
diff --git a/src/factorminer/factorminer/utils/logging.py b/src/factorminer/factorminer/utils/logging.py
deleted file mode 100644
index 4c13ea1..0000000
--- a/src/factorminer/factorminer/utils/logging.py
+++ /dev/null
@@ -1,297 +0,0 @@
-"""Structured logging system for FactorMiner mining sessions."""
-
-from __future__ import annotations
-
-import json
-import logging
-import sys
-import time
-from dataclasses import dataclass, field, asdict
-from pathlib import Path
-from typing import Any, TextIO
-
-from tqdm import tqdm
-
-
-# ---------------------------------------------------------------------------
-# Structured data records
-# ---------------------------------------------------------------------------
-
-@dataclass
-class FactorRecord:
-    """Log record for a single evaluated factor candidate."""
-
-    expression: str
-    ic: float | None = None
-    icir: float | None = None
-    max_correlation: float | None = None
-    admitted: bool = False
-    rejection_reason: str | None = None
-    replaced_factor: str | None = None
-    timestamp: float = field(default_factory=time.time)
-
-    def to_dict(self) -> dict[str, Any]:
-        return {k: v for k, v in asdict(self).items() if v is not None}
-
-
-@dataclass
-class IterationRecord:
-    """Aggregated stats for a single mining iteration (batch)."""
-
-    iteration: int
-    candidates_generated: int = 0
-    ic_passed: int = 0
-    correlation_passed: int = 0
-    admitted: int = 0
-    rejected: int = 0
-    replaced: int = 0
-    library_size: int = 0
-    best_ic: float = 0.0
-    mean_ic: float = 0.0
-    elapsed_seconds: float = 0.0
-    timestamp: float = field(default_factory=time.time)
-
-    @property
-    def yield_rate(self) -> float:
-        """Fraction of candidates that were admitted to the library."""
-        if self.candidates_generated == 0:
-            return 0.0
-        return self.admitted / self.candidates_generated
-
-    def to_dict(self) -> dict[str, Any]:
-        d = asdict(self)
-        d["yield_rate"] = self.yield_rate
-        return d
-
-
-# ---------------------------------------------------------------------------
-# JSON log exporter
-# ---------------------------------------------------------------------------
-
-class JSONLogExporter:
-    """Collects structured records and exports them to a JSON file."""
-
-    def __init__(self) -> None:
-        self.iterations: list[dict[str, Any]] = []
-        self.factors: list[dict[str, Any]] = []
-
-    def add_iteration(self, record: IterationRecord) -> None:
-        self.iterations.append(record.to_dict())
-
-    def add_factor(self, record: FactorRecord) -> None:
-        self.factors.append(record.to_dict())
-
-    def export(self, path: str | Path) -> None:
-        path = Path(path)
-        path.parent.mkdir(parents=True, exist_ok=True)
-        payload = {
-            "iterations": self.iterations,
-            "factors": self.factors,
-            "summary": self._summary(),
-        }
-        with open(path, "w") as f:
-            json.dump(payload, f, indent=2, default=str)
-
-    def _summary(self) -> dict[str, Any]:
-        if not self.iterations:
-            return {}
-        total_candidates = sum(it["candidates_generated"] for it in self.iterations)
-        total_admitted = sum(it["admitted"] for it in self.iterations)
-        return {
-            "total_iterations": len(self.iterations),
-            "total_candidates": total_candidates,
-            "total_admitted": total_admitted,
-            "overall_yield_rate": total_admitted / total_candidates if total_candidates else 0.0,
-            "final_library_size": self.iterations[-1].get("library_size", 0),
-        }
-
-
-# ---------------------------------------------------------------------------
-# Console formatter
-# ---------------------------------------------------------------------------
-
-class _ConsoleFormatter(logging.Formatter):
-    """Compact colored formatter for terminal output."""
-
-    GREY = "\033[90m"
-    GREEN = "\033[92m"
-    YELLOW = "\033[93m"
-    RED = "\033[91m"
-    BOLD = "\033[1m"
-    RESET = "\033[0m"
-
-    LEVEL_COLORS = {
-        logging.DEBUG: GREY,
-        logging.INFO: GREEN,
-        logging.WARNING: YELLOW,
-        logging.ERROR: RED,
-        logging.CRITICAL: RED + BOLD,
-    }
-
-    def format(self, record: logging.LogRecord) -> str:
-        color = self.LEVEL_COLORS.get(record.levelno, self.RESET)
-        level = record.levelname[0]  # Single-char level
-        ts = time.strftime("%H:%M:%S", time.localtime(record.created))
-        return f"{self.GREY}{ts}{self.RESET} {color}{level}{self.RESET} {record.getMessage()}"
-
-
-def setup_logger(
-    name: str = "factorminer",
-    level: int = logging.INFO,
-    log_file: str | Path | None = None,
-    stream: TextIO = sys.stderr,
-) -> logging.Logger:
-    """Create and configure a FactorMiner logger.
-
-    Args:
-        name: Logger name.
-        level: Logging level.
-        log_file: Optional path for a plain-text log file.
-        stream: Stream for console output (default stderr).
-
-    Returns:
-        Configured logger instance.
-    """
-    logger = logging.getLogger(name)
-    logger.setLevel(level)
-    logger.handlers.clear()
-
-    # Console handler with colors
-    console = logging.StreamHandler(stream)
-    console.setFormatter(_ConsoleFormatter())
-    logger.addHandler(console)
-
-    # Optional file handler
-    if log_file is not None:
-        log_path = Path(log_file)
-        log_path.parent.mkdir(parents=True, exist_ok=True)
-        fh = logging.FileHandler(log_path)
-        fh.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
-        logger.addHandler(fh)
-
-    return logger
-
-
-# ---------------------------------------------------------------------------
-# Mining session logger (high-level helper)
-# ---------------------------------------------------------------------------
-
-class MiningSessionLogger:
-    """High-level logger for an entire mining session.
-
-    Combines structured JSON export with pretty console output.
-    """
-
-    def __init__(
-        self,
-        output_dir: str | Path,
-        verbose: bool = False,
-    ) -> None:
-        self.output_dir = Path(output_dir)
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-
-        level = logging.DEBUG if verbose else logging.INFO
-        self.logger = setup_logger(
-            level=level,
-            log_file=self.output_dir / "mining.log",
-        )
-        self.exporter = JSONLogExporter()
-        self._progress: tqdm | None = None
-
-    # -- Progress bar ---------------------------------------------------
-
-    def start_progress(self, total_iterations: int) -> None:
-        self._progress = tqdm(
-            total=total_iterations,
-            desc="Mining",
-            unit="iter",
-            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
-        )
-
-    def advance_progress(self) -> None:
-        if self._progress is not None:
-            self._progress.update(1)
-
-    def close_progress(self) -> None:
-        if self._progress is not None:
-            self._progress.close()
-            self._progress = None
-
-    # -- Iteration-level ------------------------------------------------
-
-    def log_iteration(self, record: IterationRecord) -> None:
-        """Log a completed iteration to both console and structured store."""
-        self.exporter.add_iteration(record)
-        self.logger.info(
-            "Iter %3d | gen=%d ic_ok=%d corr_ok=%d +%d -%d | "
-            "lib=%d yield=%.1f%% best_ic=%.4f mean_ic=%.4f (%.1fs)",
-            record.iteration,
-            record.candidates_generated,
-            record.ic_passed,
-            record.correlation_passed,
-            record.admitted,
-            record.rejected,
-            record.library_size,
-            record.yield_rate * 100,
-            record.best_ic,
-            record.mean_ic,
-            record.elapsed_seconds,
-        )
-        self.advance_progress()
-
-    # -- Factor-level ---------------------------------------------------
-
-    def log_factor(self, record: FactorRecord) -> None:
-        """Log a single factor evaluation result."""
-        self.exporter.add_factor(record)
-        if record.admitted:
-            self.logger.debug(
-                "  + ADMIT  ic=%.4f icir=%.3f corr=%.3f  %s",
-                record.ic or 0,
-                record.icir or 0,
-                record.max_correlation or 0,
-                record.expression[:80],
-            )
-        else:
-            self.logger.debug(
-                "  - REJECT (%s)  %s",
-                record.rejection_reason or "unknown",
-                record.expression[:80],
-            )
-
-    # -- Session lifecycle ----------------------------------------------
-
-    def log_session_start(self, config_summary: dict[str, Any]) -> None:
-        self.logger.info("=" * 60)
-        self.logger.info("FactorMiner session started")
-        self.logger.info(
-            "Target library: %d | Batch: %d | Max iters: %d",
-            config_summary.get("target_library_size", "?"),
-            config_summary.get("batch_size", "?"),
-            config_summary.get("max_iterations", "?"),
-        )
-        self.logger.info("=" * 60)
-
-    def log_session_end(self, library_size: int, total_time: float) -> None:
-        summary = self.exporter._summary()
-        self.close_progress()
-        self.logger.info("=" * 60)
-        self.logger.info("Session complete")
-        self.logger.info(
-            "Library: %d factors | %d iterations | %.0fs total",
-            library_size,
-            summary.get("total_iterations", 0),
-            total_time,
-        )
-        self.logger.info(
-            "Candidates: %d generated, %d admitted (%.1f%% yield)",
-            summary.get("total_candidates", 0),
-            summary.get("total_admitted", 0),
-            summary.get("overall_yield_rate", 0) * 100,
-        )
-        self.logger.info("=" * 60)
-
-        # Export structured log
-        json_path = self.output_dir / "session_log.json"
-        self.exporter.export(json_path)
-        self.logger.info("Session log exported to %s", json_path)
diff --git a/src/factorminer/factorminer/utils/reporting.py b/src/factorminer/factorminer/utils/reporting.py
deleted file mode 100644
index 37aba35..0000000
--- a/src/factorminer/factorminer/utils/reporting.py
+++ /dev/null
@@ -1,499 +0,0 @@
-"""Mining session reporting for FactorMiner.
-
-Provides structured logging, text reports, JSON export, and progress
-visualization for factor mining sessions. Designed to mirror the batch
-reports shown in Appendix H of the paper.
-"""
-
-from __future__ import annotations
-
-import json
-import time
-from collections import defaultdict
-from dataclasses import dataclass, field, asdict
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List, Optional
-
-import matplotlib.pyplot as plt
-import numpy as np
-
-
-# ---------------------------------------------------------------------------
-# Data classes for structured logging
-# ---------------------------------------------------------------------------
-
-@dataclass
-class FactorAdmissionRecord:
-    """Record of a single factor admission."""
-
-    factor_id: int
-    name: str
-    formula: str
-    ic: float
-    icir: float
-    max_corr: float
-    batch_number: int
-    timestamp: str = ""
-
-    def __post_init__(self) -> None:
-        if not self.timestamp:
-            self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-
-@dataclass
-class BatchRecord:
-    """Record of a single mining batch."""
-
-    batch_num: int
-    candidates: int = 0
-    ic_passed: int = 0
-    corr_passed: int = 0
-    admitted: int = 0
-    replaced: int = 0
-    rejection_reasons: List[str] = field(default_factory=list)
-    admitted_factors: List[FactorAdmissionRecord] = field(default_factory=list)
-    library_size: int = 0
-    elapsed_seconds: float = 0.0
-    timestamp: str = ""
-
-    def __post_init__(self) -> None:
-        if not self.timestamp:
-            self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    @property
-    def rejected(self) -> int:
-        return self.candidates - self.admitted - self.replaced
-
-    @property
-    def yield_rate(self) -> float:
-        if self.candidates == 0:
-            return 0.0
-        return self.admitted / self.candidates
-
-    @property
-    def rejection_rate(self) -> float:
-        if self.candidates == 0:
-            return 0.0
-        return self.rejected / self.candidates
-
-    def to_dict(self) -> dict:
-        d = asdict(self)
-        d["rejected"] = self.rejected
-        d["yield_rate"] = self.yield_rate
-        d["rejection_rate"] = self.rejection_rate
-        return d
-
-
-# ---------------------------------------------------------------------------
-# MiningReporter
-# ---------------------------------------------------------------------------
-
-class MiningReporter:
-    """Track and report mining session progress.
-
-    Collects batch-level and factor-level logs, generates text reports,
-    JSON exports, and progress visualisations.
-
-    Parameters
-    ----------
-    output_dir : str
-        Directory for saving reports and plots.
-    """
-
-    def __init__(self, output_dir: str) -> None:
-        self.output_dir = Path(output_dir)
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-        self.batches: List[BatchRecord] = []
-        self.factor_admissions: List[FactorAdmissionRecord] = []
-        self._session_start: float = time.time()
-
-    # ------------------------------------------------------------------
-    # Logging
-    # ------------------------------------------------------------------
-
-    def log_batch(
-        self,
-        batch_num: int,
-        candidates: int,
-        ic_passed: int,
-        corr_passed: int,
-        admitted: int,
-        replaced: int,
-        rejection_reasons: List[str],
-        library_size: int = 0,
-        elapsed_seconds: float = 0.0,
-    ) -> None:
-        """Log a batch's results.
-
-        Parameters
-        ----------
-        batch_num : int
-            Sequential batch number.
-        candidates : int
-            Number of candidates generated.
-        ic_passed : int
-            Number passing IC screening.
-        corr_passed : int
-            Number passing correlation screening.
-        admitted : int
-            Number admitted to the library.
-        replaced : int
-            Number that replaced existing library factors.
-        rejection_reasons : List[str]
-            List of rejection reason strings for this batch.
-        library_size : int
-            Current library size after this batch.
-        elapsed_seconds : float
-            Time taken for this batch.
-        """
-        record = BatchRecord(
-            batch_num=batch_num,
-            candidates=candidates,
-            ic_passed=ic_passed,
-            corr_passed=corr_passed,
-            admitted=admitted,
-            replaced=replaced,
-            rejection_reasons=rejection_reasons,
-            library_size=library_size,
-            elapsed_seconds=elapsed_seconds,
-        )
-        self.batches.append(record)
-
-    def log_factor_admission(
-        self,
-        factor_id: int,
-        name: str,
-        formula: str,
-        ic: float,
-        icir: float,
-        max_corr: float,
-    ) -> None:
-        """Log an individual factor admission.
-
-        Parameters
-        ----------
-        factor_id : int
-            Unique factor identifier.
-        name : str
-            Human-readable factor name.
-        formula : str
-            DSL expression string.
-        ic : float
-            Mean IC of the admitted factor.
-        icir : float
-            ICIR of the admitted factor.
-        max_corr : float
-            Maximum pairwise correlation at admission time.
-        """
-        batch_num = self.batches[-1].batch_num if self.batches else 0
-        record = FactorAdmissionRecord(
-            factor_id=factor_id,
-            name=name,
-            formula=formula,
-            ic=ic,
-            icir=icir,
-            max_corr=max_corr,
-            batch_number=batch_num,
-        )
-        self.factor_admissions.append(record)
-        if self.batches:
-            self.batches[-1].admitted_factors.append(record)
-
-    # ------------------------------------------------------------------
-    # Text reports
-    # ------------------------------------------------------------------
-
-    def generate_batch_report(self, batch_num: int) -> str:
-        """Generate text report for a specific batch.
-
-        Parameters
-        ----------
-        batch_num : int
-            The batch number to report on.
-
-        Returns
-        -------
-        str
-            Formatted text report.
-        """
-        batch = None
-        for b in self.batches:
-            if b.batch_num == batch_num:
-                batch = b
-                break
-
-        if batch is None:
-            return f"Batch {batch_num} not found."
-
-        lines = [
-            f"{'=' * 60}",
-            f"  BATCH REPORT: Iteration {batch.batch_num}",
-            f"  Timestamp: {batch.timestamp}",
-            f"{'=' * 60}",
-            "",
-            f"  Candidates generated:    {batch.candidates:>6}",
-            f"  IC screen passed:        {batch.ic_passed:>6}",
-            f"  Correlation passed:      {batch.corr_passed:>6}",
-            f"  Admitted to library:     {batch.admitted:>6}",
-            f"  Replaced in library:     {batch.replaced:>6}",
-            f"  Rejected:                {batch.rejected:>6}",
-            "",
-            f"  Yield rate:              {batch.yield_rate:>6.1%}",
-            f"  Rejection rate:          {batch.rejection_rate:>6.1%}",
-            f"  Library size (after):    {batch.library_size:>6}",
-            f"  Elapsed:                 {batch.elapsed_seconds:>6.1f}s",
-        ]
-
-        # Admitted factors detail
-        if batch.admitted_factors:
-            lines.append("")
-            lines.append("  Admitted Factors:")
-            lines.append(f"  {'ID':>4}  {'IC':>8}  {'ICIR':>8}  {'MaxCorr':>8}  Name")
-            lines.append(f"  {'-'*4}  {'-'*8}  {'-'*8}  {'-'*8}  {'-'*20}")
-            for f in batch.admitted_factors:
-                lines.append(
-                    f"  {f.factor_id:>4}  {f.ic:>8.4f}  {f.icir:>8.3f}  "
-                    f"{f.max_corr:>8.4f}  {f.name[:30]}"
-                )
-
-        # Rejection breakdown
-        if batch.rejection_reasons:
-            reason_counts: Dict[str, int] = defaultdict(int)
-            for reason in batch.rejection_reasons:
-                # Normalise to short category
-                if "IC" in reason.upper() or "ic" in reason.lower():
-                    reason_counts["IC below threshold"] += 1
-                elif "corr" in reason.lower():
-                    reason_counts["Correlation too high"] += 1
-                elif "parse" in reason.lower() or "invalid" in reason.lower():
-                    reason_counts["Parse / invalid"] += 1
-                else:
-                    reason_counts["Other"] += 1
-
-            lines.append("")
-            lines.append("  Rejection Breakdown:")
-            for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]):
-                lines.append(f"    {reason:<30} {count:>5}")
-
-        lines.append(f"\n{'=' * 60}")
-        return "\n".join(lines)
-
-    def generate_session_report(self) -> str:
-        """Generate full session report with cumulative statistics.
-
-        Returns
-        -------
-        str
-            Formatted text report covering the entire mining session.
-        """
-        elapsed = time.time() - self._session_start
-
-        total_candidates = sum(b.candidates for b in self.batches)
-        total_ic_passed = sum(b.ic_passed for b in self.batches)
-        total_corr_passed = sum(b.corr_passed for b in self.batches)
-        total_admitted = sum(b.admitted for b in self.batches)
-        total_replaced = sum(b.replaced for b in self.batches)
-        total_rejected = total_candidates - total_admitted - total_replaced
-
-        overall_yield = total_admitted / total_candidates if total_candidates > 0 else 0.0
-        final_lib_size = self.batches[-1].library_size if self.batches else 0
-
-        lines = [
-            f"{'#' * 60}",
-            f"  FACTORMINER SESSION REPORT",
-            f"  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
-            f"{'#' * 60}",
-            "",
-            f"  Total batches:           {len(self.batches):>6}",
-            f"  Total elapsed:           {elapsed:>6.0f}s ({elapsed/60:.1f}m)",
-            "",
-            "  --- Cumulative Pipeline ---",
-            f"  Candidates generated:    {total_candidates:>6}",
-            f"  IC screen passed:        {total_ic_passed:>6} ({total_ic_passed/total_candidates:.1%})" if total_candidates > 0 else f"  IC screen passed:        {total_ic_passed:>6}",
-            f"  Correlation passed:      {total_corr_passed:>6} ({total_corr_passed/total_candidates:.1%})" if total_candidates > 0 else f"  Correlation passed:      {total_corr_passed:>6}",
-            f"  Admitted:                {total_admitted:>6} ({overall_yield:.1%})",
-            f"  Replaced:                {total_replaced:>6}",
-            f"  Rejected:                {total_rejected:>6}",
-            "",
-            f"  Final library size:      {final_lib_size:>6}",
-            f"  Overall yield rate:      {overall_yield:>6.1%}",
-        ]
-
-        # Per-batch summary table
-        if self.batches:
-            lines.append("")
-            lines.append("  --- Per-Batch Summary ---")
-            lines.append(
-                f"  {'Batch':>5}  {'Cand':>5}  {'IC':>4}  {'Corr':>4}  "
-                f"{'Adm':>4}  {'Rep':>4}  {'Lib':>4}  {'Yield':>6}  {'Time':>6}"
-            )
-            lines.append(f"  {'-'*5}  {'-'*5}  {'-'*4}  {'-'*4}  {'-'*4}  {'-'*4}  {'-'*4}  {'-'*6}  {'-'*6}")
-            for b in self.batches:
-                lines.append(
-                    f"  {b.batch_num:>5}  {b.candidates:>5}  {b.ic_passed:>4}  "
-                    f"{b.corr_passed:>4}  {b.admitted:>4}  {b.replaced:>4}  "
-                    f"{b.library_size:>4}  {b.yield_rate:>5.1%}  {b.elapsed_seconds:>5.1f}s"
-                )
-
-        # Top admitted factors
-        if self.factor_admissions:
-            top_factors = sorted(self.factor_admissions, key=lambda f: f.ic, reverse=True)[:10]
-            lines.append("")
-            lines.append("  --- Top 10 Factors by IC ---")
-            lines.append(
-                f"  {'ID':>4}  {'IC':>8}  {'ICIR':>8}  {'MaxCorr':>8}  Name"
-            )
-            lines.append(f"  {'-'*4}  {'-'*8}  {'-'*8}  {'-'*8}  {'-'*30}")
-            for f in top_factors:
-                lines.append(
-                    f"  {f.factor_id:>4}  {f.ic:>8.4f}  {f.icir:>8.3f}  "
-                    f"{f.max_corr:>8.4f}  {f.name[:30]}"
-                )
-
-        lines.append(f"\n{'#' * 60}")
-        return "\n".join(lines)
-
-    # ------------------------------------------------------------------
-    # Export
-    # ------------------------------------------------------------------
-
-    def export_to_json(self, path: str) -> None:
-        """Export all mining logs to JSON.
-
-        Parameters
-        ----------
-        path : str
-            File path for the JSON output.
-        """
-        payload = {
-            "session": {
-                "start_time": datetime.fromtimestamp(self._session_start).strftime(
-                    "%Y-%m-%d %H:%M:%S"
-                ),
-                "elapsed_seconds": time.time() - self._session_start,
-                "total_batches": len(self.batches),
-                "total_admissions": len(self.factor_admissions),
-            },
-            "batches": [b.to_dict() for b in self.batches],
-            "factor_admissions": [asdict(f) for f in self.factor_admissions],
-            "summary": self._compute_summary(),
-        }
-
-        out_path = Path(path)
-        out_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(out_path, "w") as f:
-            json.dump(payload, f, indent=2, default=str)
-
-    def save_session_report(self, filename: str = "session_report.txt") -> str:
-        """Save the session report to a text file.
-
-        Returns the path to the saved file.
-        """
-        report = self.generate_session_report()
-        path = self.output_dir / filename
-        with open(path, "w") as f:
-            f.write(report)
-        return str(path)
-
-    # ------------------------------------------------------------------
-    # Visualization
-    # ------------------------------------------------------------------
-
-    def plot_mining_progress(self, save_path: Optional[str] = None) -> None:
-        """Plot library growth, yield rate, and rejection rate over batches.
-
-        Produces a 3-panel figure:
-            1. Library size growth
-            2. Yield rate per batch
-            3. Rejection breakdown stacked area
-
-        Parameters
-        ----------
-        save_path : Optional[str]
-            If provided, saves the figure to this path.
-        """
-        if not self.batches:
-            return
-
-        plt.rcParams.update({
-            "figure.facecolor": "white",
-            "axes.facecolor": "white",
-            "axes.grid": True,
-            "grid.alpha": 0.3,
-            "grid.linestyle": "--",
-            "figure.dpi": 150,
-        })
-
-        batch_nums = [b.batch_num for b in self.batches]
-        lib_sizes = [b.library_size for b in self.batches]
-        yield_rates = [b.yield_rate * 100 for b in self.batches]
-        admitted_counts = [b.admitted for b in self.batches]
-        replaced_counts = [b.replaced for b in self.batches]
-        rejected_counts = [b.rejected for b in self.batches]
-
-        fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 10),
-                                             sharex=True,
-                                             gridspec_kw={"hspace": 0.15})
-
-        # Panel 1: Library size growth
-        ax1.plot(batch_nums, lib_sizes, color="#1565C0", linewidth=2.0,
-                 marker="o", markersize=3)
-        ax1.fill_between(batch_nums, lib_sizes, alpha=0.15, color="#1565C0")
-        ax1.set_ylabel("Library Size")
-        ax1.set_title("Mining Progress", fontsize=13, fontweight="bold")
-        if lib_sizes:
-            ax1.text(batch_nums[-1], lib_sizes[-1],
-                     f"  {lib_sizes[-1]}", va="center", fontsize=9, color="#1565C0")
-
-        # Panel 2: Yield rate
-        ax2.bar(batch_nums, yield_rates, color="#43A047", alpha=0.7,
-                edgecolor="white", linewidth=0.5)
-        if yield_rates:
-            avg_yield = sum(yield_rates) / len(yield_rates)
-            ax2.axhline(y=avg_yield, color="#FF6F00", linestyle="--", linewidth=1.0,
-                        label=f"Avg = {avg_yield:.1f}%")
-            ax2.legend(fontsize=8, loc="upper right")
-        ax2.set_ylabel("Yield Rate (%)")
-        ax2.set_ylim(bottom=0)
-
-        # Panel 3: Stacked bar of admitted / replaced / rejected
-        ax3.bar(batch_nums, admitted_counts, label="Admitted",
-                color="#43A047", edgecolor="white", linewidth=0.5)
-        ax3.bar(batch_nums, replaced_counts, bottom=admitted_counts,
-                label="Replaced", color="#FF8F00", edgecolor="white", linewidth=0.5)
-        bottoms = [a + r for a, r in zip(admitted_counts, replaced_counts)]
-        ax3.bar(batch_nums, rejected_counts, bottom=bottoms,
-                label="Rejected", color="#E53935", alpha=0.6,
-                edgecolor="white", linewidth=0.5)
-        ax3.set_ylabel("Candidates")
-        ax3.set_xlabel("Batch Number")
-        ax3.legend(loc="upper right", fontsize=8)
-
-        fig.tight_layout()
-
-        if save_path is not None:
-            fig.savefig(save_path, bbox_inches="tight", facecolor="white", dpi=200)
-            plt.close(fig)
-        else:
-            plt.show()
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    def _compute_summary(self) -> dict:
-        """Compute cumulative summary statistics."""
-        total_candidates = sum(b.candidates for b in self.batches)
-        total_admitted = sum(b.admitted for b in self.batches)
-        total_replaced = sum(b.replaced for b in self.batches)
-
-        return {
-            "total_candidates": total_candidates,
-            "total_admitted": total_admitted,
-            "total_replaced": total_replaced,
-            "total_rejected": total_candidates - total_admitted - total_replaced,
-            "overall_yield_rate": total_admitted / total_candidates if total_candidates > 0 else 0.0,
-            "final_library_size": self.batches[-1].library_size if self.batches else 0,
-            "total_elapsed_seconds": time.time() - self._session_start,
-        }
diff --git a/src/factorminer/factorminer/utils/tearsheet.py b/src/factorminer/factorminer/utils/tearsheet.py
deleted file mode 100644
index 8371d85..0000000
--- a/src/factorminer/factorminer/utils/tearsheet.py
+++ /dev/null
@@ -1,399 +0,0 @@
-"""Factor tear sheet generation for FactorMiner.
-
-Produces comprehensive, multi-panel evaluation reports for individual
-factors, following the style of Appendix O / Figure 10 from the paper.
-Also provides summary table generation for the full factor library.
-"""
-
-from __future__ import annotations
-
-from typing import Dict, List, Optional
-
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-import numpy as np
-import pandas as pd
-from scipy.stats import rankdata
-
-from src.factorminer.factorminer.evaluation.metrics import (
-    compute_ic,
-    compute_icir,
-    compute_ic_mean,
-    compute_ic_win_rate,
-    compute_quintile_returns,
-    compute_turnover,
-)
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-def _rolling_mean(arr: np.ndarray, window: int) -> np.ndarray:
-    """Compute rolling mean with edge handling."""
-    out = np.full_like(arr, np.nan, dtype=np.float64)
-    clean = np.where(np.isnan(arr), 0.0, arr)
-    kernel = np.ones(window) / window
-    conv = np.convolve(clean, kernel, mode="same")
-    # Fix edges using expanding window
-    for i in range(window // 2):
-        w = i + window // 2 + 1
-        conv[i] = np.mean(clean[:w])
-        conv[-(i + 1)] = np.mean(clean[-w:])
-    return conv
-
-
-def _compute_daily_turnover(signals: np.ndarray) -> np.ndarray:
-    """Compute total daily turnover as fraction of positions changing.
-
-    Parameters
-    ----------
-    signals : np.ndarray, shape (M, T)
-
-    Returns
-    -------
-    np.ndarray, shape (T-1,)
-        Turnover for each period transition.
-    """
-    M, T = signals.shape
-    turnovers = np.full(T - 1, np.nan, dtype=np.float64)
-
-    for t in range(1, T):
-        prev = signals[:, t - 1]
-        curr = signals[:, t]
-        valid = ~(np.isnan(prev) | np.isnan(curr))
-        n = valid.sum()
-        if n < 5:
-            continue
-        # Rank-based positions
-        prev_ranks = rankdata(prev[valid]) / n
-        curr_ranks = rankdata(curr[valid]) / n
-        turnovers[t - 1] = float(np.mean(np.abs(curr_ranks - prev_ranks)))
-
-    return turnovers
-
-
-# ---------------------------------------------------------------------------
-# FactorTearSheet
-# ---------------------------------------------------------------------------
-
-class FactorTearSheet:
-    """Generate comprehensive evaluation report for a single factor.
-
-    Produces an 8-panel figure plus a metrics summary table.
-    """
-
-    # Panel colours
-    IC_BAR_POS = "#4CAF50"
-    IC_BAR_NEG = "#F44336"
-    ROLLING_COLOR = "#1565C0"
-    CUMULATIVE_COLOR = "#0D47A1"
-    QUINTILE_CMAP = "RdYlGn"
-    TURNOVER_COLOR = "#FF8F00"
-
-    def generate(
-        self,
-        factor_id: int,
-        factor_name: str,
-        formula: str,
-        signals: np.ndarray,
-        returns: np.ndarray,
-        dates: List[str],
-        save_path: Optional[str] = None,
-    ) -> Dict[str, float]:
-        """Generate a multi-panel tear sheet.
-
-        Panels:
-            (a) IC time-series analysis -- daily mean rank IC with mean line
-            (b) Rank IC distribution -- histogram with statistics
-            (c) 21-day rolling IC -- aggregated daily rolling window
-            (d) Cumulative IC composition
-            (e) Quintile returns -- bar chart with Q1-Q5
-            (f) Cumulative returns -- line chart for quintiles
-            (g) Factor value distribution -- histogram
-            (h) Turnover analysis -- daily total turnover
-
-        Parameters
-        ----------
-        factor_id : int
-            Unique identifier for the factor.
-        factor_name : str
-            Human-readable factor name.
-        formula : str
-            DSL expression string.
-        signals : np.ndarray, shape (M, T)
-            Factor signal values.
-        returns : np.ndarray, shape (M, T)
-            Forward returns.
-        dates : List[str]
-            Date strings of length T.
-        save_path : Optional[str]
-            If provided, saves the figure to this path.
-
-        Returns
-        -------
-        Dict[str, float]
-            Dictionary of computed metrics.
-        """
-        plt.rcParams.update({
-            "figure.facecolor": "white",
-            "axes.facecolor": "white",
-            "axes.grid": True,
-            "grid.alpha": 0.3,
-            "grid.linestyle": "--",
-            "figure.dpi": 150,
-        })
-
-        M, T = signals.shape
-        ic_series = compute_ic(signals, returns)
-        ic_clean = np.where(np.isnan(ic_series), 0.0, ic_series)
-        valid_ic = ic_series[~np.isnan(ic_series)]
-
-        # Compute all metrics
-        ic_mean = float(np.mean(valid_ic)) if len(valid_ic) > 0 else 0.0
-        ic_abs_mean = compute_ic_mean(ic_series)
-        icir = compute_icir(ic_series)
-        win_rate = compute_ic_win_rate(ic_series)
-        quintile = compute_quintile_returns(signals, returns)
-        turnover = compute_turnover(signals)
-        daily_turnover = _compute_daily_turnover(signals)
-
-        metrics = {
-            "ic_mean": ic_mean,
-            "ic_abs_mean": ic_abs_mean,
-            "icir": icir,
-            "ic_win_rate": win_rate,
-            "Q1_return": quintile.get("Q1", 0.0),
-            "Q5_return": quintile.get("Q5", 0.0),
-            "long_short": quintile.get("long_short", 0.0),
-            "monotonicity": quintile.get("monotonicity", 0.0),
-            "avg_turnover": turnover,
-        }
-
-        # Cumulative IC
-        cumulative_ic = np.nancumsum(ic_clean)
-
-        # Rolling IC (21-day)
-        rolling_ic = _rolling_mean(ic_clean, 21)
-
-        # Quintile cumulative returns (compute per-period Q returns)
-        n_quantiles = 5
-        quintile_ts = {q: [] for q in range(1, n_quantiles + 1)}
-        for t in range(T):
-            s = signals[:, t]
-            r = returns[:, t]
-            valid_mask = ~(np.isnan(s) | np.isnan(r))
-            n = valid_mask.sum()
-            if n < n_quantiles:
-                for q in range(1, n_quantiles + 1):
-                    quintile_ts[q].append(0.0)
-                continue
-            ranks = rankdata(s[valid_mask])
-            q_labels = np.clip(
-                np.ceil(ranks / n * n_quantiles).astype(int), 1, n_quantiles
-            )
-            r_valid = r[valid_mask]
-            for q in range(1, n_quantiles + 1):
-                mask_q = q_labels == q
-                quintile_ts[q].append(float(np.mean(r_valid[mask_q])) if mask_q.any() else 0.0)
-
-        quintile_cumulative = {}
-        for q in range(1, n_quantiles + 1):
-            quintile_cumulative[f"Q{q}"] = np.cumsum(quintile_ts[q])
-
-        # ---- Build 4x2 panel figure ----
-        fig = plt.figure(figsize=(16, 18))
-        gs = gridspec.GridSpec(4, 2, hspace=0.35, wspace=0.3)
-
-        # Suptitle
-        fig.suptitle(
-            f"Factor #{factor_id}: {factor_name}\n{formula[:100]}{'...' if len(formula) > 100 else ''}",
-            fontsize=13, fontweight="bold", y=0.98,
-        )
-
-        # (a) IC time-series
-        ax_a = fig.add_subplot(gs[0, 0])
-        x = np.arange(T)
-        colors_ic = np.where(ic_clean >= 0, self.IC_BAR_POS, self.IC_BAR_NEG)
-        ax_a.bar(x, ic_clean, color=colors_ic, alpha=0.5, width=1.0, edgecolor="none")
-        ax_a.axhline(y=ic_mean, color="#FF6F00", linestyle="--", linewidth=1.0,
-                     label=f"Mean = {ic_mean:.4f}")
-        ax_a.axhline(y=0, color="black", linewidth=0.4)
-        ax_a.set_title("(a) Daily Rank IC", fontsize=10)
-        ax_a.set_ylabel("IC")
-        ax_a.legend(fontsize=8, loc="upper left")
-        self._set_date_ticks(ax_a, dates, T)
-
-        # (b) IC distribution
-        ax_b = fig.add_subplot(gs[0, 1])
-        if len(valid_ic) > 0:
-            ax_b.hist(valid_ic, bins=50, color=self.ROLLING_COLOR, alpha=0.7,
-                      edgecolor="white", linewidth=0.5, density=True)
-            ax_b.axvline(x=ic_mean, color="#FF6F00", linestyle="--", linewidth=1.2,
-                         label=f"Mean = {ic_mean:.4f}")
-            ax_b.axvline(x=0, color="black", linewidth=0.4)
-        ax_b.set_title("(b) Rank IC Distribution", fontsize=10)
-        ax_b.set_xlabel("IC")
-        ax_b.set_ylabel("Density")
-        stats_text = f"Mean={ic_mean:.4f}\nICIR={icir:.3f}\nWin={win_rate:.1%}"
-        ax_b.text(0.97, 0.97, stats_text, transform=ax_b.transAxes,
-                  ha="right", va="top", fontsize=8,
-                  bbox=dict(boxstyle="round,pad=0.3", facecolor="wheat", alpha=0.5))
-        ax_b.legend(fontsize=8, loc="upper left")
-
-        # (c) 21-day rolling IC
-        ax_c = fig.add_subplot(gs[1, 0])
-        ax_c.plot(x, rolling_ic, color=self.ROLLING_COLOR, linewidth=1.0)
-        ax_c.fill_between(x, rolling_ic, alpha=0.15, color=self.ROLLING_COLOR)
-        ax_c.axhline(y=0, color="black", linewidth=0.4)
-        ax_c.axhline(y=ic_mean, color="#FF6F00", linestyle="--", linewidth=0.8,
-                     label=f"Mean = {ic_mean:.4f}")
-        ax_c.set_title("(c) 21-Day Rolling IC", fontsize=10)
-        ax_c.set_ylabel("Rolling IC")
-        ax_c.legend(fontsize=8, loc="upper left")
-        self._set_date_ticks(ax_c, dates, T)
-
-        # (d) Cumulative IC
-        ax_d = fig.add_subplot(gs[1, 1])
-        ax_d.fill_between(x, cumulative_ic, alpha=0.25, color=self.CUMULATIVE_COLOR)
-        ax_d.plot(x, cumulative_ic, color=self.CUMULATIVE_COLOR, linewidth=1.0)
-        ax_d.axhline(y=0, color="black", linewidth=0.4)
-        ax_d.set_title("(d) Cumulative IC", fontsize=10)
-        ax_d.set_ylabel("Cumulative IC")
-        self._set_date_ticks(ax_d, dates, T)
-
-        # (e) Quintile returns bar chart
-        ax_e = fig.add_subplot(gs[2, 0])
-        q_labels_list = [f"Q{q}" for q in range(1, n_quantiles + 1)]
-        q_vals = [quintile.get(f"Q{q}", 0.0) for q in range(1, n_quantiles + 1)]
-        cmap = plt.cm.RdYlGn
-        q_colors = [cmap(i / max(n_quantiles - 1, 1)) for i in range(n_quantiles)]
-        bars = ax_e.bar(q_labels_list, q_vals, color=q_colors, edgecolor="white", linewidth=0.8)
-        for bar, val in zip(bars, q_vals):
-            y_pos = bar.get_height()
-            ax_e.text(bar.get_x() + bar.get_width() / 2, y_pos,
-                      f"{val:.4f}", ha="center",
-                      va="bottom" if y_pos >= 0 else "top", fontsize=8)
-        ax_e.axhline(y=0, color="black", linewidth=0.4)
-        ls = quintile.get("long_short", 0.0)
-        mono = quintile.get("monotonicity", 0.0)
-        ax_e.set_title(f"(e) Quintile Returns  |  L-S={ls:.4f}  Mono={mono:.2f}", fontsize=10)
-        ax_e.set_ylabel("Mean Return")
-
-        # (f) Cumulative quintile returns
-        ax_f = fig.add_subplot(gs[2, 1])
-        q_palette = plt.cm.RdYlGn(np.linspace(0.1, 0.9, n_quantiles))
-        for i, q in enumerate(range(1, n_quantiles + 1)):
-            key = f"Q{q}"
-            ax_f.plot(quintile_cumulative[key], color=q_palette[i],
-                      linewidth=1.1, label=key)
-        ax_f.axhline(y=0, color="black", linewidth=0.4)
-        ax_f.set_title("(f) Cumulative Quintile Returns", fontsize=10)
-        ax_f.set_ylabel("Cumulative Return")
-        ax_f.legend(loc="upper left", fontsize=8, ncol=n_quantiles, framealpha=0.9)
-        self._set_date_ticks(ax_f, dates, T)
-
-        # (g) Factor value distribution
-        ax_g = fig.add_subplot(gs[3, 0])
-        # Sample from signals for histogram (last period or flattened sample)
-        flat_signals = signals[~np.isnan(signals)]
-        if len(flat_signals) > 50000:
-            rng = np.random.default_rng(42)
-            flat_signals = rng.choice(flat_signals, 50000, replace=False)
-        if len(flat_signals) > 0:
-            # Clip to 1st/99th percentile for cleaner visualization
-            lo, hi = np.percentile(flat_signals, [1, 99])
-            clipped = flat_signals[(flat_signals >= lo) & (flat_signals <= hi)]
-            ax_g.hist(clipped, bins=80, color="#7E57C2", alpha=0.7,
-                      edgecolor="white", linewidth=0.3, density=True)
-            mean_sig = float(np.mean(flat_signals))
-            std_sig = float(np.std(flat_signals))
-            ax_g.axvline(x=mean_sig, color="#FF6F00", linestyle="--", linewidth=1.0,
-                         label=f"Mean={mean_sig:.4f}")
-            stats_text_g = f"Std={std_sig:.4f}\nN={len(flat_signals):,}"
-            ax_g.text(0.97, 0.97, stats_text_g, transform=ax_g.transAxes,
-                      ha="right", va="top", fontsize=8,
-                      bbox=dict(boxstyle="round,pad=0.3", facecolor="wheat", alpha=0.5))
-        ax_g.set_title("(g) Factor Value Distribution", fontsize=10)
-        ax_g.set_xlabel("Factor Value")
-        ax_g.set_ylabel("Density")
-        ax_g.legend(fontsize=8, loc="upper left")
-
-        # (h) Turnover analysis
-        ax_h = fig.add_subplot(gs[3, 1])
-        valid_turnover = daily_turnover[~np.isnan(daily_turnover)]
-        if len(valid_turnover) > 0:
-            t_x = np.arange(len(daily_turnover))
-            ax_h.bar(t_x, np.where(np.isnan(daily_turnover), 0, daily_turnover),
-                     color=self.TURNOVER_COLOR, alpha=0.5, width=1.0, edgecolor="none")
-            avg_to = float(np.mean(valid_turnover))
-            ax_h.axhline(y=avg_to, color="#D32F2F", linestyle="--", linewidth=1.0,
-                         label=f"Avg = {avg_to:.4f}")
-            ax_h.legend(fontsize=8, loc="upper right")
-        ax_h.set_title("(h) Daily Turnover", fontsize=10)
-        ax_h.set_ylabel("Turnover")
-        ax_h.set_xlabel("Period")
-
-        # Metrics table at the bottom
-        metrics_ls_cum = float(np.sum([quintile_ts[n_quantiles][t] - quintile_ts[1][t]
-                                       for t in range(T)]))
-        metrics["long_short_cumulative"] = metrics_ls_cum
-
-        fig.tight_layout(rect=[0, 0, 1, 0.96])
-
-        if save_path is not None:
-            fig.savefig(save_path, bbox_inches="tight", facecolor="white", dpi=200)
-            plt.close(fig)
-        else:
-            plt.show()
-
-        return metrics
-
-    def generate_summary_table(self, factors: List[dict]) -> pd.DataFrame:
-        """Generate summary table for all factors in the library.
-
-        Parameters
-        ----------
-        factors : List[dict]
-            Each dict should contain keys: 'id', 'name', 'formula',
-            'ic_mean', 'icir', 'ic_win_rate', 'Q1_return', 'Q5_return',
-            'long_short', 'monotonicity', 'avg_turnover'.
-
-        Returns
-        -------
-        pd.DataFrame
-            Summary table sorted by IC mean (descending).
-        """
-        if not factors:
-            return pd.DataFrame()
-
-        rows = []
-        for f in factors:
-            rows.append({
-                "ID": f.get("id", ""),
-                "Name": f.get("name", ""),
-                "Formula": str(f.get("formula", ""))[:60],
-                "IC Mean": f.get("ic_mean", 0.0),
-                "ICIR": f.get("icir", 0.0),
-                "IC Win Rate": f.get("ic_win_rate", 0.0),
-                "Q1 Return": f.get("Q1_return", 0.0),
-                "Q5 Return": f.get("Q5_return", 0.0),
-                "L-S Return": f.get("long_short", 0.0),
-                "Monotonicity": f.get("monotonicity", 0.0),
-                "Avg Turnover": f.get("avg_turnover", 0.0),
-            })
-
-        df = pd.DataFrame(rows)
-        df = df.sort_values("IC Mean", ascending=False).reset_index(drop=True)
-        return df
-
-    @staticmethod
-    def _set_date_ticks(ax: plt.Axes, dates: List[str], T: int, n_ticks: int = 8) -> None:
-        """Set evenly spaced date tick labels on the x-axis."""
-        if T == 0:
-            return
-        n_ticks = min(n_ticks, T)
-        step = max(1, T // n_ticks)
-        positions = list(range(0, T, step))
-        ax.set_xticks(positions)
-        ax.set_xticklabels([dates[i] for i in positions], rotation=45, ha="right", fontsize=7)
diff --git a/src/factorminer/factorminer/utils/visualization.py b/src/factorminer/factorminer/utils/visualization.py
deleted file mode 100644
index 434cdde..0000000
--- a/src/factorminer/factorminer/utils/visualization.py
+++ /dev/null
@@ -1,564 +0,0 @@
-"""Core visualization functions for FactorMiner.
-
-Provides publication-quality plots for factor analysis, mining diagnostics,
-and performance reporting. Uses matplotlib and seaborn with a consistent
-style inspired by the FactorMiner paper figures.
-"""
-
-from __future__ import annotations
-
-from typing import Dict, List, Optional, Tuple
-
-import matplotlib.pyplot as plt
-import matplotlib.ticker as mticker
-import numpy as np
-import seaborn as sns
-
-# ---------------------------------------------------------------------------
-# Global style
-# ---------------------------------------------------------------------------
-
-_STYLE_APPLIED = False
-
-
-def _apply_style() -> None:
-    """Apply a clean, publication-quality matplotlib style once."""
-    global _STYLE_APPLIED
-    if _STYLE_APPLIED:
-        return
-    plt.rcParams.update({
-        "figure.facecolor": "white",
-        "axes.facecolor": "white",
-        "axes.edgecolor": "#333333",
-        "axes.labelcolor": "#333333",
-        "axes.grid": True,
-        "grid.alpha": 0.3,
-        "grid.linestyle": "--",
-        "xtick.color": "#333333",
-        "ytick.color": "#333333",
-        "font.size": 10,
-        "axes.titlesize": 12,
-        "axes.labelsize": 10,
-        "legend.fontsize": 9,
-        "figure.dpi": 150,
-        "savefig.dpi": 200,
-        "savefig.bbox": "tight",
-    })
-    _STYLE_APPLIED = True
-
-
-def _save_or_show(fig: plt.Figure, save_path: Optional[str]) -> None:
-    """Save figure to disk or display interactively."""
-    if save_path is not None:
-        fig.savefig(save_path, bbox_inches="tight", facecolor="white")
-        plt.close(fig)
-    else:
-        plt.show()
-
-
-# ---------------------------------------------------------------------------
-# Correlation heatmap (Figure 2)
-# ---------------------------------------------------------------------------
-
-def plot_correlation_heatmap(
-    correlation_matrix: np.ndarray,
-    factor_names: List[str],
-    title: str = "Factor Library Correlation Heatmap",
-    save_path: Optional[str] = None,
-) -> None:
-    """Generate pairwise Spearman correlation heatmap.
-
-    Displays the average off-diagonal |rho| in the title and uses a
-    diverging colormap centred at zero.
-
-    Parameters
-    ----------
-    correlation_matrix : np.ndarray, shape (N, N)
-        Symmetric matrix of pairwise |rho| values.
-    factor_names : List[str]
-        Labels for each factor (length N).
-    title : str
-        Base title for the plot.
-    save_path : Optional[str]
-        If provided, saves the figure to this path instead of displaying.
-    """
-    _apply_style()
-    n = correlation_matrix.shape[0]
-
-    # Compute average off-diagonal correlation
-    if n > 1:
-        triu_idx = np.triu_indices(n, k=1)
-        off_diag = correlation_matrix[triu_idx]
-        avg_corr = float(np.nanmean(np.abs(off_diag)))
-    else:
-        avg_corr = 0.0
-
-    # Scale figure size based on number of factors
-    size = max(6, min(n * 0.35 + 2, 20))
-    fig, ax = plt.subplots(figsize=(size, size * 0.85))
-
-    mask = np.zeros_like(correlation_matrix, dtype=bool)
-    np.fill_diagonal(mask, True)
-
-    sns.heatmap(
-        correlation_matrix,
-        mask=mask,
-        xticklabels=factor_names,
-        yticklabels=factor_names,
-        cmap="RdBu_r",
-        center=0,
-        vmin=-1,
-        vmax=1,
-        square=True,
-        linewidths=0.5,
-        linecolor="white",
-        cbar_kws={"shrink": 0.7, "label": "Spearman |rho|"},
-        ax=ax,
-    )
-
-    ax.set_title(f"{title}\nAvg off-diagonal |rho| = {avg_corr:.4f}", fontsize=12)
-    ax.tick_params(axis="x", rotation=45, labelsize=max(5, 10 - n // 20))
-    ax.tick_params(axis="y", rotation=0, labelsize=max(5, 10 - n // 20))
-
-    fig.tight_layout()
-    _save_or_show(fig, save_path)
-
-
-# ---------------------------------------------------------------------------
-# IC time series (Figure 5)
-# ---------------------------------------------------------------------------
-
-def plot_ic_timeseries(
-    ic_series: np.ndarray,
-    dates: List[str],
-    rolling_window: int = 21,
-    title: str = "Daily Mean Rank IC",
-    save_path: Optional[str] = None,
-) -> None:
-    """Plot IC time series with rolling average and cumulative IC.
-
-    Creates a two-panel figure: top panel shows daily IC bars with a
-    rolling mean line; bottom panel shows cumulative IC.
-
-    Parameters
-    ----------
-    ic_series : np.ndarray, shape (T,)
-        Daily IC values (may contain NaN).
-    dates : List[str]
-        Date labels of length T.
-    rolling_window : int
-        Window for rolling mean (default 21 trading days).
-    title : str
-        Title for the figure.
-    save_path : Optional[str]
-        If provided, saves the figure to this path.
-    """
-    _apply_style()
-    T = len(ic_series)
-    x = np.arange(T)
-
-    # Replace NaN with 0 for plotting
-    ic_clean = np.where(np.isnan(ic_series), 0.0, ic_series)
-
-    # Rolling mean
-    kernel = np.ones(rolling_window) / rolling_window
-    rolling_ic = np.convolve(ic_clean, kernel, mode="same")
-    # Fix edges
-    for i in range(rolling_window // 2):
-        w = i + rolling_window // 2 + 1
-        rolling_ic[i] = np.mean(ic_clean[:w])
-        rolling_ic[-(i + 1)] = np.mean(ic_clean[-w:])
-
-    # Cumulative IC
-    cumulative_ic = np.nancumsum(ic_clean)
-
-    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 7), height_ratios=[2, 1],
-                                    sharex=True, gridspec_kw={"hspace": 0.08})
-
-    # Top: daily IC bars + rolling mean
-    colors = np.where(ic_clean >= 0, "#4CAF50", "#F44336")
-    ax1.bar(x, ic_clean, color=colors, alpha=0.5, width=1.0, edgecolor="none")
-    ax1.plot(x, rolling_ic, color="#1565C0", linewidth=1.5,
-             label=f"{rolling_window}-day Rolling Mean")
-
-    ic_mean = float(np.nanmean(ic_series))
-    ax1.axhline(y=ic_mean, color="#FF6F00", linestyle="--", linewidth=1.0,
-                label=f"Mean IC = {ic_mean:.4f}")
-    ax1.axhline(y=0, color="black", linewidth=0.5)
-
-    ax1.set_ylabel("Rank IC")
-    ax1.set_title(title, fontsize=13, fontweight="bold")
-    ax1.legend(loc="upper left", framealpha=0.9)
-
-    # Bottom: cumulative IC
-    ax2.fill_between(x, cumulative_ic, alpha=0.3, color="#1565C0")
-    ax2.plot(x, cumulative_ic, color="#1565C0", linewidth=1.2)
-    ax2.set_ylabel("Cumulative IC")
-    ax2.set_xlabel("Date")
-    ax2.axhline(y=0, color="black", linewidth=0.5)
-
-    # X-axis tick labels
-    if T > 0:
-        n_ticks = min(10, T)
-        step = max(1, T // n_ticks)
-        tick_positions = list(range(0, T, step))
-        ax2.set_xticks(tick_positions)
-        ax2.set_xticklabels([dates[i] for i in tick_positions], rotation=45, ha="right")
-
-    fig.tight_layout()
-    _save_or_show(fig, save_path)
-
-
-# ---------------------------------------------------------------------------
-# Quintile returns (Figure 6)
-# ---------------------------------------------------------------------------
-
-def plot_quintile_returns(
-    quintile_returns: dict,
-    title: str = "Quintile Returns",
-    save_path: Optional[str] = None,
-) -> None:
-    """Plot Q1-Q5 quintile bar chart and cumulative returns.
-
-    Parameters
-    ----------
-    quintile_returns : dict
-        Dictionary with keys Q1..Q5 (mean returns) and optionally
-        'quintile_cumulative' mapping Qx -> array of cumulative returns.
-        Also may contain 'long_short' and 'monotonicity'.
-    title : str
-        Title for the figure.
-    save_path : Optional[str]
-        If provided, saves the figure to this path.
-    """
-    _apply_style()
-
-    # Extract quintile mean returns
-    q_labels = [k for k in sorted(quintile_returns.keys()) if k.startswith("Q")]
-    q_means = [quintile_returns[k] for k in q_labels]
-    n_q = len(q_labels)
-
-    has_cumulative = "quintile_cumulative" in quintile_returns
-    n_panels = 2 if has_cumulative else 1
-    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5))
-    if n_panels == 1:
-        axes = [axes]
-
-    # Bar chart
-    ax = axes[0]
-    cmap = plt.cm.RdYlGn
-    colors = [cmap(i / max(n_q - 1, 1)) for i in range(n_q)]
-    bars = ax.bar(q_labels, q_means, color=colors, edgecolor="white", linewidth=0.8)
-
-    # Value labels on bars
-    for bar, val in zip(bars, q_means):
-        y = bar.get_height()
-        ax.text(bar.get_x() + bar.get_width() / 2, y,
-                f"{val:.4f}", ha="center",
-                va="bottom" if y >= 0 else "top", fontsize=9)
-
-    ax.axhline(y=0, color="black", linewidth=0.5)
-    ax.set_ylabel("Mean Return")
-    ax.set_xlabel("Quintile")
-
-    # Subtitle with L-S return and monotonicity
-    subtitle_parts = []
-    if "long_short" in quintile_returns:
-        subtitle_parts.append(f"L-S = {quintile_returns['long_short']:.4f}")
-    if "monotonicity" in quintile_returns:
-        subtitle_parts.append(f"Mono = {quintile_returns['monotonicity']:.2f}")
-    subtitle = " | ".join(subtitle_parts) if subtitle_parts else ""
-    ax.set_title(f"{title}\n{subtitle}" if subtitle else title, fontsize=12)
-
-    # Cumulative returns panel
-    if has_cumulative:
-        ax2 = axes[1]
-        cum_data = quintile_returns["quintile_cumulative"]
-        for q_label in q_labels:
-            if q_label in cum_data:
-                ax2.plot(cum_data[q_label], label=q_label, linewidth=1.2)
-        ax2.set_title("Cumulative Quintile Returns", fontsize=12)
-        ax2.set_ylabel("Cumulative Return")
-        ax2.set_xlabel("Period")
-        ax2.legend(loc="upper left", framealpha=0.9)
-        ax2.axhline(y=0, color="black", linewidth=0.5)
-
-    fig.tight_layout()
-    _save_or_show(fig, save_path)
-
-
-# ---------------------------------------------------------------------------
-# Ablation comparison (Figure 3)
-# ---------------------------------------------------------------------------
-
-def plot_ablation_comparison(
-    with_memory: dict,
-    without_memory: dict,
-    save_path: Optional[str] = None,
-) -> None:
-    """Bar charts comparing Have Memory vs No Memory ablation.
-
-    Shows side-by-side bars for: high-quality count, rejected count,
-    admitted count, yield rate, and rejection rate.
-
-    Parameters
-    ----------
-    with_memory : dict
-        Keys: 'high_quality', 'rejected', 'admitted', 'yield_rate', 'rejection_rate'.
-    without_memory : dict
-        Same keys as with_memory.
-    save_path : Optional[str]
-        If provided, saves the figure to this path.
-    """
-    _apply_style()
-
-    # Count metrics (left axis) and rate metrics (right axis)
-    count_keys = ["high_quality", "rejected", "admitted"]
-    rate_keys = ["yield_rate", "rejection_rate"]
-
-    count_labels = ["High Quality", "Rejected", "Admitted"]
-    rate_labels = ["Yield Rate", "Rejection Rate"]
-
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
-
-    # Panel 1: Count metrics
-    x = np.arange(len(count_keys))
-    w = 0.35
-    vals_with = [with_memory.get(k, 0) for k in count_keys]
-    vals_without = [without_memory.get(k, 0) for k in count_keys]
-
-    bars1 = ax1.bar(x - w / 2, vals_with, w, label="With Memory",
-                    color="#1565C0", edgecolor="white")
-    bars2 = ax1.bar(x + w / 2, vals_without, w, label="Without Memory",
-                    color="#E53935", edgecolor="white")
-
-    for bars in [bars1, bars2]:
-        for bar in bars:
-            h = bar.get_height()
-            ax1.text(bar.get_x() + bar.get_width() / 2, h,
-                     f"{int(h)}", ha="center", va="bottom", fontsize=9)
-
-    ax1.set_xticks(x)
-    ax1.set_xticklabels(count_labels)
-    ax1.set_ylabel("Count")
-    ax1.set_title("Factor Counts: Memory Ablation", fontsize=12)
-    ax1.legend(loc="upper right")
-
-    # Panel 2: Rate metrics
-    x2 = np.arange(len(rate_keys))
-    vals_with_r = [with_memory.get(k, 0) * 100 for k in rate_keys]
-    vals_without_r = [without_memory.get(k, 0) * 100 for k in rate_keys]
-
-    bars3 = ax2.bar(x2 - w / 2, vals_with_r, w, label="With Memory",
-                    color="#1565C0", edgecolor="white")
-    bars4 = ax2.bar(x2 + w / 2, vals_without_r, w, label="Without Memory",
-                    color="#E53935", edgecolor="white")
-
-    for bars in [bars3, bars4]:
-        for bar in bars:
-            h = bar.get_height()
-            ax2.text(bar.get_x() + bar.get_width() / 2, h,
-                     f"{h:.1f}%", ha="center", va="bottom", fontsize=9)
-
-    ax2.set_xticks(x2)
-    ax2.set_xticklabels(rate_labels)
-    ax2.set_ylabel("Rate (%)")
-    ax2.set_title("Yield / Rejection Rates: Memory Ablation", fontsize=12)
-    ax2.legend(loc="upper right")
-
-    fig.tight_layout()
-    _save_or_show(fig, save_path)
-
-
-# ---------------------------------------------------------------------------
-# Efficiency benchmark (Figure 4)
-# ---------------------------------------------------------------------------
-
-def plot_efficiency_benchmark(
-    benchmarks: Dict[str, Dict[str, float]],
-    save_path: Optional[str] = None,
-) -> None:
-    """Grouped bar chart on log scale for computation time.
-
-    Compares Python/C/GPU backends at operator and factor levels.
-
-    Parameters
-    ----------
-    benchmarks : Dict[str, Dict[str, float]]
-        Outer keys: backend names (e.g. "Python", "C", "GPU").
-        Inner keys: operation names (e.g. "operator_eval", "factor_eval").
-        Values: time in seconds.
-    save_path : Optional[str]
-        If provided, saves the figure to this path.
-    """
-    _apply_style()
-
-    backends = list(benchmarks.keys())
-    operations = sorted(
-        {op for bm in benchmarks.values() for op in bm.keys()}
-    )
-    n_backends = len(backends)
-    n_ops = len(operations)
-
-    fig, ax = plt.subplots(figsize=(max(8, n_ops * 2), 5))
-
-    x = np.arange(n_ops)
-    total_width = 0.7
-    w = total_width / n_backends
-
-    palette = ["#1565C0", "#FF8F00", "#43A047", "#8E24AA", "#E53935"]
-
-    for i, backend in enumerate(backends):
-        vals = [benchmarks[backend].get(op, 0) for op in operations]
-        offset = (i - (n_backends - 1) / 2) * w
-        bars = ax.bar(x + offset, vals, w, label=backend,
-                      color=palette[i % len(palette)], edgecolor="white")
-
-        for bar, val in zip(bars, vals):
-            if val > 0:
-                ax.text(bar.get_x() + bar.get_width() / 2,
-                        bar.get_height(),
-                        f"{val:.3g}s", ha="center", va="bottom", fontsize=7)
-
-    ax.set_yscale("log")
-    ax.set_ylabel("Time (seconds, log scale)")
-    ax.set_xticks(x)
-    ax.set_xticklabels(operations, rotation=30, ha="right")
-    ax.set_title("Computation Efficiency by Backend", fontsize=12)
-    ax.legend(loc="upper right")
-    ax.yaxis.set_major_formatter(mticker.ScalarFormatter())
-
-    fig.tight_layout()
-    _save_or_show(fig, save_path)
-
-
-# ---------------------------------------------------------------------------
-# Cost pressure (Figure 9)
-# ---------------------------------------------------------------------------
-
-def plot_cost_pressure(
-    results: Dict[float, dict],
-    save_path: Optional[str] = None,
-) -> None:
-    """Cumulative return plots under different transaction cost settings.
-
-    Shows both linear and log-scale y-axis panels.
-
-    Parameters
-    ----------
-    results : Dict[float, dict]
-        Keys: transaction cost levels (e.g. 0.0, 0.001, 0.003).
-        Values: dict with 'cumulative_returns' (np.ndarray) and
-        optionally 'dates' (List[str]).
-    save_path : Optional[str]
-        If provided, saves the figure to this path.
-    """
-    _apply_style()
-
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
-
-    cost_levels = sorted(results.keys())
-    palette = plt.cm.viridis(np.linspace(0.15, 0.85, len(cost_levels)))
-
-    for color, cost in zip(palette, cost_levels):
-        data = results[cost]
-        cum_ret = np.asarray(data["cumulative_returns"])
-        label = f"TC = {cost*100:.1f}%" if cost > 0 else "No TC"
-
-        ax1.plot(cum_ret, color=color, linewidth=1.3, label=label)
-        # For log scale, shift to always positive
-        shifted = cum_ret - cum_ret.min() + 1.0
-        ax2.plot(shifted, color=color, linewidth=1.3, label=label)
-
-    ax1.set_title("Cumulative Returns (Linear)", fontsize=12)
-    ax1.set_ylabel("Cumulative Return")
-    ax1.set_xlabel("Period")
-    ax1.legend(loc="upper left", fontsize=8, framealpha=0.9)
-    ax1.axhline(y=0, color="black", linewidth=0.5)
-
-    ax2.set_yscale("log")
-    ax2.set_title("Cumulative Returns (Log Scale)", fontsize=12)
-    ax2.set_ylabel("Shifted Cumulative Return (log)")
-    ax2.set_xlabel("Period")
-    ax2.legend(loc="upper left", fontsize=8, framealpha=0.9)
-
-    fig.tight_layout()
-    _save_or_show(fig, save_path)
-
-
-# ---------------------------------------------------------------------------
-# Mining funnel chart
-# ---------------------------------------------------------------------------
-
-def plot_mining_funnel(
-    batch_stats: dict,
-    save_path: Optional[str] = None,
-) -> None:
-    """Funnel chart showing Stage 1 -> 2 -> 3 -> 4 filtering.
-
-    Parameters
-    ----------
-    batch_stats : dict
-        Keys: 'generated', 'ic_passed', 'corr_passed', 'admitted'.
-        Each is an int count at the corresponding stage.
-    save_path : Optional[str]
-        If provided, saves the figure to this path.
-    """
-    _apply_style()
-
-    stages = [
-        ("Generated", batch_stats.get("generated", 0)),
-        ("IC Screen Passed", batch_stats.get("ic_passed", 0)),
-        ("Correlation Passed", batch_stats.get("corr_passed", 0)),
-        ("Admitted", batch_stats.get("admitted", 0)),
-    ]
-
-    labels = [s[0] for s in stages]
-    values = [s[1] for s in stages]
-    max_val = max(values) if values else 1
-
-    fig, ax = plt.subplots(figsize=(8, 5))
-
-    # Draw horizontal funnel bars centred on the y-axis
-    y_positions = list(range(len(stages) - 1, -1, -1))
-    bar_colors = ["#42A5F5", "#66BB6A", "#FFA726", "#EF5350"]
-
-    for i, (y, val, label, color) in enumerate(
-        zip(y_positions, values, labels, bar_colors)
-    ):
-        width = val / max_val if max_val > 0 else 0
-        bar = ax.barh(y, width, height=0.6, color=color, edgecolor="white",
-                      linewidth=1.5, left=(1 - width) / 2)
-        # Label inside the bar
-        ax.text(0.5, y, f"{label}\n{val:,}", ha="center", va="center",
-                fontsize=10, fontweight="bold", color="white" if width > 0.3 else "#333")
-
-    # Draw connecting trapezoids
-    for i in range(len(stages) - 1):
-        y_top = y_positions[i]
-        y_bot = y_positions[i + 1]
-        w_top = values[i] / max_val if max_val > 0 else 0
-        w_bot = values[i + 1] / max_val if max_val > 0 else 0
-
-        left_top = (1 - w_top) / 2
-        right_top = (1 + w_top) / 2
-        left_bot = (1 - w_bot) / 2
-        right_bot = (1 + w_bot) / 2
-
-        # Drop rate annotation
-        if values[i] > 0:
-            drop = (1 - values[i + 1] / values[i]) * 100
-            mid_y = (y_top + y_bot) / 2
-            ax.text(1.02, mid_y, f"-{drop:.0f}%", ha="left", va="center",
-                    fontsize=9, color="#E53935", fontweight="bold",
-                    transform=ax.get_yaxis_transform())
-
-    ax.set_xlim(-0.05, 1.15)
-    ax.set_ylim(-0.5, len(stages) - 0.5)
-    ax.set_yticks([])
-    ax.set_xticks([])
-    ax.set_title("Mining Pipeline Funnel", fontsize=13, fontweight="bold", pad=15)
-    ax.spines[:].set_visible(False)
-
-    fig.tight_layout()
-    _save_or_show(fig, save_path)