feat(training): 实现 LightGBM 模型
- 新增 LightGBMModel:LightGBM 回归模型实现
- 支持自定义参数(objective, num_leaves, learning_rate, n_estimators 等)
- 使用 LightGBM 原生格式保存/加载模型(不依赖 pickle)
- 支持特征重要性提取
- 已注册到 ModelRegistry(@register_model("lightgbm"))
This commit is contained in:
@@ -22,6 +22,9 @@ from src.training.components.processors import (
|
|||||||
Winsorizer,
|
Winsorizer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 模型
|
||||||
|
from src.training.components.models import LightGBMModel
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"BaseModel",
|
"BaseModel",
|
||||||
"BaseProcessor",
|
"BaseProcessor",
|
||||||
@@ -31,4 +34,5 @@ __all__ = [
|
|||||||
"StandardScaler",
|
"StandardScaler",
|
||||||
"CrossSectionalStandardScaler",
|
"CrossSectionalStandardScaler",
|
||||||
"Winsorizer",
|
"Winsorizer",
|
||||||
|
"LightGBMModel",
|
||||||
]
|
]
|
||||||
|
|||||||
8
src/training/components/models/__init__.py
Normal file
8
src/training/components/models/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
"""模型子模块
|
||||||
|
|
||||||
|
包含各种机器学习模型的实现。
|
||||||
|
"""
|
||||||
|
|
||||||
|
from src.training.components.models.lightgbm import LightGBMModel
|
||||||
|
|
||||||
|
__all__ = ["LightGBMModel"]
|
||||||
194
src/training/components/models/lightgbm.py
Normal file
194
src/training/components/models/lightgbm.py
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
"""LightGBM 模型实现
|
||||||
|
|
||||||
|
提供 LightGBM 回归模型的实现,支持特征重要性和原生模型保存。
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import polars as pl
|
||||||
|
|
||||||
|
from src.training.components.base import BaseModel
|
||||||
|
from src.training.registry import register_model
|
||||||
|
|
||||||
|
|
||||||
|
@register_model("lightgbm")
class LightGBMModel(BaseModel):
    """LightGBM regression model.

    Gradient-boosted regression trees trained through ``lightgbm.train``.
    Supports custom parameters, gain-based feature importance and
    persistence in LightGBM's native model format (no pickle involved).

    Attributes:
        name: Model name, ``"lightgbm"``.
        params: Parameter dict handed to ``lightgbm.train``.
        n_estimators: Number of boosting rounds used by ``fit``.
        model: Trained LightGBM Booster, or ``None`` before ``fit``/``load``.
        feature_names_: Column names seen by ``fit`` (or restored by
            ``load``), or ``None`` when unknown.
    """

    name = "lightgbm"

    # Suffix appended to the model path to locate the JSON sidecar that
    # stores feature names and hyper-parameters.
    _META_SUFFIX = ".meta.json"

    def __init__(
        self,
        objective: str = "regression",
        metric: str = "rmse",
        num_leaves: int = 31,
        learning_rate: float = 0.05,
        n_estimators: int = 100,
        **kwargs,
    ):
        """Initialise the model configuration (no training happens here).

        Args:
            objective: Objective function, default ``"regression"``.
            metric: Evaluation metric, default ``"rmse"``.
            num_leaves: Number of leaves per tree, default 31.
            learning_rate: Learning rate, default 0.05.
            n_estimators: Number of boosting rounds, default 100.
            **kwargs: Any other LightGBM parameters. They are merged last,
                so they can override the ``verbose`` default set here.
        """
        self.params = {
            "objective": objective,
            "metric": metric,
            "num_leaves": num_leaves,
            "learning_rate": learning_rate,
            "verbose": -1,  # silence LightGBM's training output
            **kwargs,
        }
        self.n_estimators = n_estimators
        self.model = None
        self.feature_names_: Optional[list] = None

    @staticmethod
    def _import_lightgbm():
        """Import lightgbm lazily, raising a helpful error when it is missing."""
        try:
            import lightgbm as lgb
        except ImportError as err:
            raise ImportError(
                "使用 LightGBMModel 需要安装 lightgbm: pip install lightgbm"
            ) from err
        return lgb

    def fit(self, X: pl.DataFrame, y: pl.Series) -> "LightGBMModel":
        """Train the model.

        Args:
            X: Feature matrix (Polars DataFrame).
            y: Target values (Polars Series).

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ImportError: If lightgbm is not installed.
        """
        lgb = self._import_lightgbm()

        feature_names = list(X.columns)
        train_data = lgb.Dataset(X.to_numpy(), label=y.to_numpy())
        booster = lgb.train(
            self.params,
            train_data,
            num_boost_round=self.n_estimators,
        )

        # Publish the new state only after training succeeded, so a failed
        # fit never leaves feature names that do not match the held model.
        self.model = booster
        self.feature_names_ = feature_names
        return self

    def predict(self, X: pl.DataFrame) -> np.ndarray:
        """Predict targets for ``X``.

        When the training feature names are known and ``X`` contains all of
        them, the columns are selected in training order first, so reordered
        or extra columns cannot silently shift features. Otherwise the
        columns are used positionally, exactly as given.

        Args:
            X: Feature matrix (Polars DataFrame).

        Returns:
            Predictions as a numpy ndarray.

        Raises:
            RuntimeError: If the model has not been trained or loaded.
        """
        if self.model is None:
            raise RuntimeError("模型尚未训练,请先调用 fit()")

        names = self.feature_names_
        if names and X.columns != names and set(names) <= set(X.columns):
            X = X.select(names)

        return self.model.predict(X.to_numpy())

    def feature_importance(self) -> Optional[pd.Series]:
        """Return gain-based feature importance.

        Returns:
            A pandas Series indexed by feature name, or ``None`` when the
            model is untrained or the feature names are unknown.
        """
        if self.model is None or self.feature_names_ is None:
            return None

        importance = self.model.feature_importance(importance_type="gain")
        return pd.Series(importance, index=self.feature_names_)

    def save(self, path: str) -> None:
        """Save the model in LightGBM's native format plus a JSON sidecar.

        The booster is written to ``path``. Feature names, ``params`` and
        ``n_estimators`` are written to ``path + ".meta.json"``.

        Args:
            path: Destination of the model file (``str`` or path-like).

        Raises:
            RuntimeError: If the model has not been trained or loaded.
            TypeError: If the metadata cannot be serialised to JSON; in that
                case nothing is written to disk.
        """
        if self.model is None:
            raise RuntimeError("模型尚未训练,无法保存")

        import json
        import os

        model_path = os.fspath(path)

        # Serialise first: a non-JSON-serialisable value in params must not
        # leave a model file next to a missing or truncated sidecar.
        meta_text = json.dumps(
            {
                "feature_names": self.feature_names_,
                "params": self.params,
                "n_estimators": self.n_estimators,
            }
        )

        self.model.save_model(model_path)
        with open(model_path + self._META_SUFFIX, "w", encoding="utf-8") as f:
            f.write(meta_text)

    @classmethod
    def load(cls, path: str) -> "LightGBMModel":
        """Load a model previously written by ``save``.

        The JSON sidecar is optional: without it the booster is still loaded
        and ``feature_names_`` stays ``None`` while ``params`` and
        ``n_estimators`` keep their constructor defaults.

        Args:
            path: Location of the model file (``str`` or path-like).

        Returns:
            The loaded ``LightGBMModel`` instance.

        Raises:
            ImportError: If lightgbm is not installed.
        """
        import json
        import os

        lgb = cls._import_lightgbm()
        model_path = os.fspath(path)

        instance = cls()
        instance.model = lgb.Booster(model_file=model_path)

        try:
            with open(model_path + cls._META_SUFFIX, "r", encoding="utf-8") as f:
                meta = json.load(f)
        except FileNotFoundError:
            # No sidecar: keep the defaults set by cls().
            return instance

        instance.feature_names_ = meta.get("feature_names")
        instance.params = meta.get("params", instance.params)
        instance.n_estimators = meta.get("n_estimators", instance.n_estimators)
        return instance
|
||||||
226
tests/training/test_lightgbm_model.py
Normal file
226
tests/training/test_lightgbm_model.py
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
"""测试 LightGBM 模型
|
||||||
|
|
||||||
|
验证 LightGBMModel 的训练、预测、保存和加载功能。
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import polars as pl
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.training.components.models.lightgbm import LightGBMModel
|
||||||
|
|
||||||
|
|
||||||
|
class TestLightGBMModel:
    """Tests for LightGBMModel: init, fit, predict, importance, save/load."""

    def test_init_default(self):
        """Default construction stores the documented default parameters."""
        model = LightGBMModel()
        assert model.name == "lightgbm"
        assert model.params["objective"] == "regression"
        assert model.params["metric"] == "rmse"
        assert model.params["num_leaves"] == 31
        assert model.params["learning_rate"] == 0.05
        assert model.n_estimators == 100
        assert model.model is None

    def test_init_custom(self):
        """Custom constructor arguments end up in params / n_estimators."""
        model = LightGBMModel(
            objective="huber",
            metric="mae",
            num_leaves=50,
            learning_rate=0.1,
            n_estimators=200,
        )
        assert model.params["objective"] == "huber"
        assert model.params["metric"] == "mae"
        assert model.params["num_leaves"] == 50
        assert model.params["learning_rate"] == 0.1
        assert model.n_estimators == 200

    def test_fit_success(self):
        """fit() trains a booster, records feature names and returns self."""
        # Small, simple regression data set.
        X = pl.DataFrame(
            {
                "feature1": [1.0, 2.0, 3.0, 4.0, 5.0],
                "feature2": [2.0, 4.0, 6.0, 8.0, 10.0],
            }
        )
        y = pl.Series("target", [1.5, 3.0, 4.5, 6.0, 7.5])

        model = LightGBMModel(n_estimators=10)
        result = model.fit(X, y)

        # fit() returns self so calls can be chained.
        assert result is model
        # A booster is now attached.
        assert model.model is not None
        # Feature names were recorded.
        assert model.feature_names_ == ["feature1", "feature2"]

    def test_predict_before_fit(self):
        """predict() on an untrained model raises RuntimeError."""
        X = pl.DataFrame(
            {
                "feature1": [1.0, 2.0],
                "feature2": [2.0, 4.0],
            }
        )
        model = LightGBMModel()

        with pytest.raises(RuntimeError, match="模型尚未训练"):
            model.predict(X)

    def test_predict_success(self):
        """predict() returns finite values that follow the training signal."""
        # Seeded so the generated regression data is reproducible.
        np.random.seed(42)
        n_samples = 100
        X_train = pl.DataFrame(
            {
                "feature1": np.random.randn(n_samples),
                "feature2": np.random.randn(n_samples),
            }
        )
        # y = 2*feature1 + 3*feature2 + noise
        y_train = pl.Series(
            "target",
            2 * X_train["feature1"]
            + 3 * X_train["feature2"]
            + np.random.randn(n_samples) * 0.1,
        )

        model = LightGBMModel(n_estimators=20, learning_rate=0.1)
        model.fit(X_train, y_train)

        # Two clearly separated test points.
        X_test = pl.DataFrame(
            {
                "feature1": [-2.0, 3.0],
                "feature2": [-1.0, 4.0],
            }
        )
        predictions = model.predict(X_test)

        # Output format.
        assert isinstance(predictions, np.ndarray)
        assert len(predictions) == 2
        # Values are real numbers.
        assert all(np.isfinite(predictions))
        # The second sample has larger feature values, so it should get the
        # larger prediction.
        assert predictions[1] > predictions[0]

    def test_feature_importance_before_fit(self):
        """feature_importance() returns None before training."""
        model = LightGBMModel()
        assert model.feature_importance() is None

    def test_feature_importance_after_fit(self):
        """feature_importance() ranks the dominant feature at least as high."""
        # Seeded: the ranking assertion below must not depend on the draw.
        np.random.seed(42)
        X = pl.DataFrame(
            {
                "feature1": np.random.randn(100),
                "feature2": np.random.randn(100),
            }
        )
        y = pl.Series("target", X["feature1"] * 2 + X["feature2"] * 0.1)

        model = LightGBMModel(n_estimators=10)
        model.fit(X, y)

        importance = model.feature_importance()

        # Output format.
        assert importance is not None
        assert len(importance) == 2
        assert "feature1" in importance.index
        assert "feature2" in importance.index
        # feature1 has the larger coefficient, so it should matter more.
        assert importance["feature1"] >= importance["feature2"]

    def test_save_before_fit(self):
        """save() on an untrained model raises RuntimeError."""
        model = LightGBMModel()

        with pytest.raises(RuntimeError, match="模型尚未训练"):
            model.save("dummy.txt")

    def test_save_and_load(self):
        """A saved and reloaded model predicts the same and keeps metadata."""
        X = pl.DataFrame(
            {
                "feature1": [1.0, 2.0, 3.0, 4.0, 5.0],
                "feature2": [2.0, 4.0, 6.0, 8.0, 10.0],
            }
        )
        y = pl.Series("target", [2.0, 4.0, 6.0, 8.0, 10.0])

        model = LightGBMModel(n_estimators=10, learning_rate=0.1)
        model.fit(X, y)

        # Reference prediction before saving.
        X_test = pl.DataFrame(
            {
                "feature1": [6.0],
                "feature2": [12.0],
            }
        )
        pred_before = model.predict(X_test)

        # Round-trip through a temporary directory.
        with tempfile.TemporaryDirectory() as tmpdir:
            save_path = os.path.join(tmpdir, "model.txt")
            model.save(save_path)

            loaded_model = LightGBMModel.load(save_path)

            # Predictions are unchanged after the round trip.
            pred_after = loaded_model.predict(X_test)
            assert pred_after[0] == pytest.approx(pred_before[0], rel=1e-5)

            # Metadata was restored.
            assert loaded_model.feature_names_ == ["feature1", "feature2"]
            assert loaded_model.n_estimators == 10

    def test_registration(self):
        """The model class is registered in ModelRegistry as "lightgbm"."""
        from src.training.registry import ModelRegistry

        model_class = ModelRegistry.get_model("lightgbm")
        assert model_class is LightGBMModel

    def test_fit_predict_consistency(self):
        """Repeated predict() calls on the same input give the same output."""
        # Seeded so a failure can be reproduced with identical data.
        np.random.seed(42)
        X = pl.DataFrame(
            {
                "feature1": np.random.randn(50),
                "feature2": np.random.randn(50),
            }
        )
        y = pl.Series("target", X["feature1"] + X["feature2"])

        model = LightGBMModel(n_estimators=10)
        model.fit(X, y)

        X_test = pl.DataFrame(
            {
                "feature1": [1.0, 2.0, 3.0],
                "feature2": [1.0, 2.0, 3.0],
            }
        )

        # Predicting twice must not change the result.
        pred1 = model.predict(X_test)
        pred2 = model.predict(X_test)
        np.testing.assert_array_almost_equal(pred1, pred2)
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Executed as a script: run only this module's tests, verbosely.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)
|
||||||
Reference in New Issue
Block a user