2026-03-02 22:29:18 +08:00
|
|
|
|
"""计算引擎。
|
|
|
|
|
|
|
|
|
|
|
|
执行并行运算,负责将执行计划应用到数据上。
|
2026-03-14 01:24:52 +08:00
|
|
|
|
|
|
|
|
|
|
利用 Polars 底层 Rust 引擎的原生并行能力,通过 BFS 分层执行策略
|
|
|
|
|
|
避免 Python 层面的多进程/多线程开销。
|
2026-03-02 22:29:18 +08:00
|
|
|
|
"""
|
|
|
|
|
|
|
2026-03-14 01:24:52 +08:00
|
|
|
|
from typing import Dict, List, Set
|
2026-03-02 22:29:18 +08:00
|
|
|
|
|
|
|
|
|
|
import polars as pl
|
|
|
|
|
|
|
|
|
|
|
|
from src.factors.engine.data_spec import ExecutionPlan
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ComputeEngine:
    """Compute engine that applies execution plans to a Polars DataFrame.

    Plans can be run one at a time (``execute`` / ``execute_batch``) or in
    dependency layers (``execute_parallel``):

    1. Start from the set of columns already present in the data.
    2. Repeatedly collect every pending plan whose dependencies are all
       available; those plans form one layer.
    3. Submit the whole layer as a list of expressions in a single
       ``with_columns`` call.
    4. Mark the layer's output names as available and continue with the
       plans that are still pending.

    The design intent (per the original author's notes) is to let Polars'
    native engine parallelise each layer instead of using Python-level
    process/thread pools.
    """

    def __init__(self) -> None:
        """Initialise the compute engine (no state is kept)."""

    def execute(
        self,
        plan: ExecutionPlan,
        data: pl.DataFrame,
    ) -> pl.DataFrame:
        """Run a single execution plan.

        Args:
            plan: The execution plan to apply.
            data: Input data (the core wide table).

        Returns:
            A DataFrame that includes the column produced by ``plan``.

        Raises:
            ValueError: If ``data`` lacks any column listed in
                ``plan.dependencies``.
        """
        # Fail early when a required input column is absent.
        missing_cols = plan.dependencies - set(data.columns)
        if missing_cols:
            raise ValueError(f"数据缺少必要的字段: {missing_cols}")

        named_expr = plan.polars_expr.alias(plan.output_name)
        return data.with_columns([named_expr])

    def execute_batch(
        self,
        plans: List[ExecutionPlan],
        data: pl.DataFrame,
    ) -> pl.DataFrame:
        """Run several execution plans sequentially, in list order.

        Each plan sees the output of the plans before it.

        Args:
            plans: Execution plans to apply.
            data: Input data.

        Returns:
            A DataFrame that includes the columns produced by all plans.
        """
        frame = data
        for step in plans:
            frame = self.execute(step, frame)
        return frame

    def execute_parallel(
        self,
        plans: List[ExecutionPlan],
        data: pl.DataFrame,
    ) -> pl.DataFrame:
        """Run execution plans layer by layer.

        In each round, every pending plan whose dependencies are already
        available is gathered into one layer, and the layer is submitted to
        Polars through a single ``with_columns`` call. No Python process or
        thread pool is used; any parallelism is left to Polars.

        Args:
            plans: Execution plans to apply.
            data: Input data.

        Returns:
            A DataFrame that includes the columns produced by all plans.

        Raises:
            RuntimeError: If a round finds no runnable plan, which means
                there is a dependency cycle or a base column is missing.
        """
        if not plans:
            return data

        frame = data
        available_cols: Set[str] = set(frame.columns)

        # Work on a copy of the plan list.
        pending = plans.copy()

        while pending:
            # Split pending plans into those runnable now and those that
            # must wait for a later layer.
            ready: List[ExecutionPlan] = []
            blocked: List[ExecutionPlan] = []
            for candidate in pending:
                runnable = candidate.dependencies <= available_cols
                (ready if runnable else blocked).append(candidate)

            # Safety net: nothing runnable means a cycle or missing input.
            if not ready:
                stuck = pending[0]
                missing = stuck.dependencies - available_cols
                raise RuntimeError(
                    f"计算发生死锁或缺少基础依赖字段!\n"
                    f"因子 '{stuck.output_name}' 缺少: {missing}"
                )

            # Hand the whole layer to Polars in one call.
            # NOTE(review): two plans in the same layer sharing an
            # output_name are passed through as-is — confirm how Polars
            # treats duplicate output names here.
            layer_exprs = [p.polars_expr.alias(p.output_name) for p in ready]
            frame = frame.with_columns(layer_exprs)

            # The layer's outputs are now usable by later layers.
            available_cols.update(p.output_name for p in ready)
            pending = blocked

        return frame

    def compute(
        self,
        plans: List[ExecutionPlan],
        data: pl.DataFrame,
        parallel: bool = True,
    ) -> pl.DataFrame:
        """Entry point that selects the execution mode.

        Args:
            plans: Execution plans to apply.
            data: Input data.
            parallel: When true, use the layered ``execute_parallel`` path;
                otherwise run the plans sequentially via ``execute_batch``.

        Returns:
            A DataFrame that includes the columns produced by all plans.
        """
        runner = self.execute_parallel if parallel else self.execute_batch
        return runner(plans, data)
|