perf(factors/engine): 重构计算引擎使用 Polars 原生并行
- 移除 Python 多进程/多线程池,消除 DataFrame 序列化开销
- 采用 BFS 分层执行策略,每层表达式通过单次 with_columns 提交
- 利用 Polars Rust 引擎实现零拷贝并行计算
- 添加死锁检测机制处理依赖环
This commit is contained in:
@@ -1,10 +1,12 @@
|
|||||||
"""计算引擎。
|
"""计算引擎。
|
||||||
|
|
||||||
执行并行运算,负责将执行计划应用到数据上。
|
执行并行运算,负责将执行计划应用到数据上。
|
||||||
|
|
||||||
|
利用 Polars 底层 Rust 引擎的原生并行能力,通过 BFS 分层执行策略
|
||||||
|
避免 Python 层面的多进程/多线程开销。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
from typing import Dict, List, Set
|
||||||
from typing import Any, Dict, List, Optional, Set, Union
|
|
||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
@@ -14,33 +16,25 @@ from src.factors.engine.data_spec import ExecutionPlan
|
|||||||
class ComputeEngine:
|
class ComputeEngine:
|
||||||
"""计算引擎 - 执行并行运算。
|
"""计算引擎 - 执行并行运算。
|
||||||
|
|
||||||
负责将执行计划应用到数据上,支持并行计算。
|
负责将执行计划应用到数据上,利用 Polars 底层 Rust 引擎的原生并行能力。
|
||||||
|
|
||||||
Attributes:
|
采用 BFS 分层执行策略:
|
||||||
max_workers: 最大并行工作线程数
|
1. 构建依赖图,识别各计划间的依赖关系
|
||||||
use_processes: 是否使用进程池(CPU 密集型任务)
|
2. 按拓扑排序分层,每层包含互不依赖的计划
|
||||||
|
3. 将每层计划打包为表达式列表,通过单次 with_columns 提交
|
||||||
|
4. Polars 自动在所有 CPU 核心上并行计算,零拷贝内存
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self) -> None:
|
||||||
self,
|
"""初始化计算引擎。"""
|
||||||
max_workers: int = 4,
|
pass
|
||||||
use_processes: bool = False,
|
|
||||||
) -> None:
|
|
||||||
"""初始化计算引擎。
|
|
||||||
|
|
||||||
Args:
|
|
||||||
max_workers: 最大并行工作线程数
|
|
||||||
use_processes: 是否使用进程池代替线程池
|
|
||||||
"""
|
|
||||||
self.max_workers = max_workers
|
|
||||||
self.use_processes = use_processes
|
|
||||||
|
|
||||||
def execute(
|
def execute(
|
||||||
self,
|
self,
|
||||||
plan: ExecutionPlan,
|
plan: ExecutionPlan,
|
||||||
data: pl.DataFrame,
|
data: pl.DataFrame,
|
||||||
) -> pl.DataFrame:
|
) -> pl.DataFrame:
|
||||||
"""执行计算计划。
|
"""执行单个计算计划。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
plan: 执行计划
|
plan: 执行计划
|
||||||
@@ -55,16 +49,14 @@ class ComputeEngine:
|
|||||||
raise ValueError(f"数据缺少必要的字段: {missing_cols}")
|
raise ValueError(f"数据缺少必要的字段: {missing_cols}")
|
||||||
|
|
||||||
# 执行计算
|
# 执行计算
|
||||||
result = data.with_columns([plan.polars_expr.alias(plan.output_name)])
|
return data.with_columns([plan.polars_expr.alias(plan.output_name)])
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def execute_batch(
|
def execute_batch(
|
||||||
self,
|
self,
|
||||||
plans: List[ExecutionPlan],
|
plans: List[ExecutionPlan],
|
||||||
data: pl.DataFrame,
|
data: pl.DataFrame,
|
||||||
) -> pl.DataFrame:
|
) -> pl.DataFrame:
|
||||||
"""批量执行多个计算计划。
|
"""顺序批量执行多个计算计划。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
plans: 执行计划列表
|
plans: 执行计划列表
|
||||||
@@ -74,10 +66,8 @@ class ComputeEngine:
|
|||||||
包含所有因子结果的 DataFrame
|
包含所有因子结果的 DataFrame
|
||||||
"""
|
"""
|
||||||
result = data
|
result = data
|
||||||
|
|
||||||
for plan in plans:
|
for plan in plans:
|
||||||
result = self.execute(plan, result)
|
result = self.execute(plan, result)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def execute_parallel(
|
def execute_parallel(
|
||||||
@@ -85,7 +75,11 @@ class ComputeEngine:
|
|||||||
plans: List[ExecutionPlan],
|
plans: List[ExecutionPlan],
|
||||||
data: pl.DataFrame,
|
data: pl.DataFrame,
|
||||||
) -> pl.DataFrame:
|
) -> pl.DataFrame:
|
||||||
"""并行执行多个计算计划。
|
"""分层并行执行计算计划(利用 Polars 原生并发优化)。
|
||||||
|
|
||||||
|
抛弃 Python 的多进程/多线程池,采用计算图拓扑分层(BFS DAG)。
|
||||||
|
将每一层互不依赖的表达式列表打包,通过单次 with_columns 交给 Polars,
|
||||||
|
由底层 Rust 引擎自动调度并行计算,实现零拷贝性能最大化。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
plans: 执行计划列表
|
plans: 执行计划列表
|
||||||
@@ -93,63 +87,70 @@ class ComputeEngine:
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
包含所有因子结果的 DataFrame
|
包含所有因子结果的 DataFrame
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: 当存在依赖环或缺少基础依赖字段时
|
||||||
"""
|
"""
|
||||||
# 检查计划间依赖
|
if not plans:
|
||||||
independent_plans = []
|
|
||||||
dependent_plans = []
|
|
||||||
available_cols = set(data.columns)
|
|
||||||
|
|
||||||
for plan in plans:
|
|
||||||
if plan.dependencies <= available_cols:
|
|
||||||
independent_plans.append(plan)
|
|
||||||
available_cols.add(plan.output_name)
|
|
||||||
else:
|
|
||||||
dependent_plans.append(plan)
|
|
||||||
|
|
||||||
# 并行执行独立计划
|
|
||||||
if independent_plans:
|
|
||||||
ExecutorClass = (
|
|
||||||
ProcessPoolExecutor if self.use_processes else ThreadPoolExecutor
|
|
||||||
)
|
|
||||||
|
|
||||||
with ExecutorClass(max_workers=self.max_workers) as executor:
|
|
||||||
futures = {
|
|
||||||
executor.submit(self._execute_single, plan, data): plan
|
|
||||||
for plan in independent_plans
|
|
||||||
}
|
|
||||||
|
|
||||||
results = []
|
|
||||||
for future in futures:
|
|
||||||
plan = futures[future]
|
|
||||||
try:
|
|
||||||
result_col = future.result()
|
|
||||||
results.append((plan.output_name, result_col))
|
|
||||||
except Exception as e:
|
|
||||||
raise RuntimeError(f"计算因子 {plan.output_name} 失败: {e}")
|
|
||||||
|
|
||||||
# 合并结果
|
|
||||||
for name, series in results:
|
|
||||||
data = data.with_columns([series.alias(name)])
|
|
||||||
|
|
||||||
# 顺序执行依赖计划
|
|
||||||
for plan in dependent_plans:
|
|
||||||
data = self.execute(plan, data)
|
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _execute_single(
|
result = data
|
||||||
|
available_cols: Set[str] = set(result.columns)
|
||||||
|
|
||||||
|
# 复制一份计划列表用于迭代
|
||||||
|
remaining_plans = plans.copy()
|
||||||
|
|
||||||
|
while remaining_plans:
|
||||||
|
# 找出当前可以执行的所有独立计划(即依赖的所有列都已就绪)
|
||||||
|
current_layer: List[ExecutionPlan] = []
|
||||||
|
next_remaining: List[ExecutionPlan] = []
|
||||||
|
|
||||||
|
for plan in remaining_plans:
|
||||||
|
if plan.dependencies <= available_cols:
|
||||||
|
current_layer.append(plan)
|
||||||
|
else:
|
||||||
|
next_remaining.append(plan)
|
||||||
|
|
||||||
|
# 安全兜底:如果一轮遍历后没找到任何可执行计划,说明存在依赖环或数据缺失
|
||||||
|
if not current_layer:
|
||||||
|
missing = remaining_plans[0].dependencies - available_cols
|
||||||
|
raise RuntimeError(
|
||||||
|
f"计算发生死锁或缺少基础依赖字段!\n"
|
||||||
|
f"因子 '{remaining_plans[0].output_name}' 缺少: {missing}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 核心优化:利用 Polars 内部 Rust 级多线程引擎执行当前层
|
||||||
|
exprs = [plan.polars_expr.alias(plan.output_name) for plan in current_layer]
|
||||||
|
result = result.with_columns(exprs)
|
||||||
|
|
||||||
|
# 更新已就绪字段集合,为计算下一层做准备
|
||||||
|
for plan in current_layer:
|
||||||
|
available_cols.add(plan.output_name)
|
||||||
|
|
||||||
|
remaining_plans = next_remaining
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def compute(
|
||||||
self,
|
self,
|
||||||
plan: ExecutionPlan,
|
plans: List[ExecutionPlan],
|
||||||
data: pl.DataFrame,
|
data: pl.DataFrame,
|
||||||
) -> pl.Series:
|
parallel: bool = True,
|
||||||
"""执行单个计划并返回结果列。
|
) -> pl.DataFrame:
|
||||||
|
"""智能计算入口。
|
||||||
|
|
||||||
|
根据 parallel 参数自动选择执行模式:
|
||||||
|
- True: 使用分层并行执行(推荐)
|
||||||
|
- False: 使用顺序执行
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
plan: 执行计划
|
plans: 执行计划列表
|
||||||
data: 输入数据
|
data: 输入数据
|
||||||
|
parallel: 是否使用并行执行
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
计算结果序列
|
包含所有因子结果的 DataFrame
|
||||||
"""
|
"""
|
||||||
result = self.execute(plan, data)
|
if parallel:
|
||||||
return result[plan.output_name]
|
return self.execute_parallel(plans, data)
|
||||||
|
return self.execute_batch(plans, data)
|
||||||
|
|||||||
@@ -57,7 +57,6 @@ class FactorEngine:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
data_source: Optional[Dict[str, pl.DataFrame]] = None,
|
data_source: Optional[Dict[str, pl.DataFrame]] = None,
|
||||||
max_workers: int = 4,
|
|
||||||
registry: Optional["FunctionRegistry"] = None,
|
registry: Optional["FunctionRegistry"] = None,
|
||||||
metadata_path: Optional[str] = None,
|
metadata_path: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -65,16 +64,15 @@ class FactorEngine:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
data_source: 内存数据源,为 None 时使用数据库连接
|
data_source: 内存数据源,为 None 时使用数据库连接
|
||||||
max_workers: 并行计算的最大工作线程数
|
|
||||||
registry: 函数注册表,None 时创建独立实例
|
registry: 函数注册表,None 时创建独立实例
|
||||||
metadata_path: 因子元数据文件路径,为 None 时不启用 metadata 功能
|
metadata_path: 因子元数据文件路径,为 None 时启用默认 metadata 功能
|
||||||
"""
|
"""
|
||||||
from src.factors.registry import FunctionRegistry
|
from src.factors.registry import FunctionRegistry
|
||||||
from src.factors.parser import FormulaParser
|
from src.factors.parser import FormulaParser
|
||||||
|
|
||||||
self.router = DataRouter(data_source)
|
self.router = DataRouter(data_source)
|
||||||
self.planner = ExecutionPlanner()
|
self.planner = ExecutionPlanner()
|
||||||
self.compute_engine = ComputeEngine(max_workers=max_workers)
|
self.compute_engine = ComputeEngine()
|
||||||
self.registered_expressions: Dict[str, Node] = {}
|
self.registered_expressions: Dict[str, Node] = {}
|
||||||
self._plans: Dict[str, ExecutionPlan] = {}
|
self._plans: Dict[str, ExecutionPlan] = {}
|
||||||
|
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ class TestFactorEngineEndToEnd:
|
|||||||
def engine(self, mock_data):
|
def engine(self, mock_data):
|
||||||
"""提供配置好的 FactorEngine fixture。"""
|
"""提供配置好的 FactorEngine fixture。"""
|
||||||
data_source = {"pro_bar": mock_data}
|
data_source = {"pro_bar": mock_data}
|
||||||
return FactorEngine(data_source=data_source, max_workers=2)
|
return FactorEngine(data_source=data_source)
|
||||||
|
|
||||||
def test_simple_symbol_expression(self, engine):
|
def test_simple_symbol_expression(self, engine):
|
||||||
"""测试简单的符号表达式。"""
|
"""测试简单的符号表达式。"""
|
||||||
|
|||||||
Reference in New Issue
Block a user