"""Built-in machine learning models.

Provides wrappers with a unified interface for LightGBM and CatBoost.
"""

from typing import Optional, Dict, Any

import polars as pl
import numpy as np

from src.pipeline.core import BaseModel, TaskType
from src.pipeline.registry import PluginRegistry


@PluginRegistry.register_model("lightgbm")
class LightGBMModel(BaseModel):
    """LightGBM model wrapper.

    Maps the task types "classification", "regression" and "ranking" to a
    LightGBM objective/metric pair; any other task type falls back to the
    regression settings.
    """

    def __init__(
        self,
        task_type: TaskType,
        params: Optional[Dict[str, Any]] = None,
        name: Optional[str] = None,
    ):
        super().__init__(task_type, params, name)
        # Trained booster; stays None until fit() has been called.
        self._model = None

    def fit(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        X_val: Optional[pl.DataFrame] = None,
        y_val: Optional[pl.Series] = None,
        **fit_params,
    ) -> "LightGBMModel":
        """Train the model.

        Args:
            X: Training features.
            y: Training labels.
            X_val: Optional validation features.
            y_val: Optional validation labels. The validation set is only
                used when both ``X_val`` and ``y_val`` are given.
            **fit_params: Optional training controls:
                ``num_boost_round`` (default 100) and
                ``early_stopping_rounds`` (default 10, only applied when a
                validation set is present).

        Returns:
            This instance, so calls can be chained.

        Raises:
            ImportError: If ``lightgbm`` is not installed.
        """
        try:
            import lightgbm as lgb
        except ImportError as err:
            raise ImportError(
                "lightgbm is required. Install with: uv pip install lightgbm"
            ) from err

        # NOTE(review): no query-group information is passed to the Dataset,
        # which the "lambdarank" objective presumably needs - confirm how
        # ranking tasks are expected to supply it.
        train_data = lgb.Dataset(X.to_numpy(), label=y.to_numpy())
        valid_sets = [train_data]
        valid_names = ["train"]

        has_validation = X_val is not None and y_val is not None
        if has_validation:
            valid_sets.append(
                lgb.Dataset(X_val.to_numpy(), label=y_val.to_numpy())
            )
            valid_names.append("valid")

        train_params = {
            "objective": self._get_objective(),
            "metric": self._get_metric(),
            "boosting_type": "gbdt",
            "num_leaves": 31,
            "learning_rate": 0.05,
            "feature_fraction": 0.9,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "verbose": -1,
        }
        # User-supplied params override the defaults. ``or {}`` guards the
        # case where the base class stores ``params=None`` unchanged
        # (BaseModel is not visible from this module).
        train_params.update(self.params or {})

        callbacks = []
        if has_validation:
            # Same key and default as CatBoostModel.fit, so both wrappers
            # can be tuned through one fit parameter.
            callbacks.append(
                lgb.early_stopping(
                    stopping_rounds=fit_params.get("early_stopping_rounds", 10),
                    verbose=False,
                )
            )

        self._model = lgb.train(
            train_params,
            train_data,
            num_boost_round=fit_params.get("num_boost_round", 100),
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=callbacks,
        )
        self._is_fitted = True
        return self

    def predict(self, X: pl.DataFrame) -> np.ndarray:
        """Return the booster's predictions for ``X``.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if not self._is_fitted:
            raise RuntimeError("Model not fitted yet")
        return self._model.predict(X.to_numpy())

    def predict_proba(self, X: pl.DataFrame) -> np.ndarray:
        """Return class probabilities (classification tasks only).

        A 1-D prediction ``p`` is expanded into two columns ``[1 - p, p]``;
        a prediction that is already multi-dimensional is returned as is.

        Raises:
            ValueError: If the task type is not "classification".
            RuntimeError: If the model has not been fitted yet.
        """
        if self.task_type != "classification":
            raise ValueError("predict_proba only for classification")

        probs = self.predict(X)
        if probs.ndim == 1:
            return np.vstack([1 - probs, probs]).T
        return probs

    def get_feature_importance(self) -> Optional[pl.DataFrame]:
        """Return gain-based feature importance, most important first.

        Returns:
            A DataFrame with columns ``feature`` and ``importance`` sorted
            by importance in descending order, or ``None`` when no model
            has been trained.
        """
        if self._model is None:
            return None

        importance = self._model.feature_importance(importance_type="gain")
        # Fall back to positional names if the model object does not expose
        # a ``feature_name`` method.
        feature_names = getattr(
            self._model,
            "feature_name",
            lambda: [f"feature_{i}" for i in range(len(importance))],
        )()

        return pl.DataFrame(
            {"feature": feature_names, "importance": importance}
        ).sort("importance", descending=True)

    def _get_objective(self) -> str:
        """Return the LightGBM objective for the task type."""
        objectives = {
            "classification": "binary",
            "regression": "regression",
            "ranking": "lambdarank",
        }
        return objectives.get(self.task_type, "regression")

    def _get_metric(self) -> str:
        """Return the LightGBM evaluation metric for the task type."""
        metrics = {"classification": "auc", "regression": "rmse", "ranking": "ndcg"}
        return metrics.get(self.task_type, "rmse")


@PluginRegistry.register_model("catboost")
class CatBoostModel(BaseModel):
    """CatBoost model wrapper."""

    def __init__(
        self,
        task_type: TaskType,
        params: Optional[Dict[str, Any]] = None,
        name: Optional[str] = None,
    ):
        super().__init__(task_type, params, name)
        # Trained estimator; stays None until fit() has been called.
        self._model = None

    def fit(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        X_val: Optional[pl.DataFrame] = None,
        y_val: Optional[pl.Series] = None,
        **fit_params,
    ) -> "CatBoostModel":
        """Train the model.

        Args:
            X: Training features.
            y: Training labels.
            X_val: Optional validation features.
            y_val: Optional validation labels. The evaluation set is only
                used when both ``X_val`` and ``y_val`` are given.
            **fit_params: Optional training controls:
                ``early_stopping_rounds`` (default 10).

        Returns:
            This instance, so calls can be chained.

        Raises:
            ImportError: If ``catboost`` is not installed.
        """
        try:
            from catboost import CatBoostClassifier, CatBoostRegressor
        except ImportError as err:
            raise ImportError(
                "catboost is required. Install with: uv pip install catboost"
            ) from err

        if self.task_type == "classification":
            model_class = CatBoostClassifier
            model_params = {"loss_function": "Logloss", "eval_metric": "AUC"}
        elif self.task_type == "regression":
            model_class = CatBoostRegressor
            model_params = {"loss_function": "RMSE"}
        else:
            # NOTE(review): any other task type (e.g. ranking) uses the
            # regressor with QueryRMSE, but no group ids are passed to
            # fit() - confirm this works for the intended ranking data.
            model_class = CatBoostRegressor
            model_params = {"loss_function": "QueryRMSE"}

        # User-supplied params override the defaults. ``or {}`` guards the
        # case where the base class stores ``params=None`` unchanged
        # (BaseModel is not visible from this module).
        model_params.update(self.params or {})
        # Applied after the user params, so training output is always
        # silenced regardless of what the caller configured.
        model_params["verbose"] = False

        self._model = model_class(**model_params)

        eval_set = None
        if X_val is not None and y_val is not None:
            eval_set = (X_val.to_pandas(), y_val.to_pandas())

        self._model.fit(
            X.to_pandas(),
            y.to_pandas(),
            eval_set=eval_set,
            early_stopping_rounds=fit_params.get("early_stopping_rounds", 10),
            verbose=False,
        )
        self._is_fitted = True
        return self

    def predict(self, X: pl.DataFrame) -> np.ndarray:
        """Return the estimator's predictions for ``X``.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if not self._is_fitted:
            raise RuntimeError("Model not fitted yet")
        return self._model.predict(X.to_pandas())

    def predict_proba(self, X: pl.DataFrame) -> np.ndarray:
        """Return class probabilities (classification tasks only).

        Raises:
            ValueError: If the task type is not "classification".
        """
        if self.task_type != "classification":
            raise ValueError("predict_proba only for classification")
        return self._model.predict_proba(X.to_pandas())

    def get_feature_importance(self) -> Optional[pl.DataFrame]:
        """Return feature importance, most important first.

        Returns:
            A DataFrame with columns ``feature`` and ``importance`` sorted
            by importance in descending order, or ``None`` when no model
            has been trained.
        """
        if self._model is None:
            return None

        return pl.DataFrame(
            {
                "feature": self._model.feature_names_,
                "importance": self._model.feature_importances_,
            }
        ).sort("importance", descending=True)


__all__ = ["LightGBMModel", "CatBoostModel"]