"""Tests for the LightGBM model wrapper.

Verifies training, prediction, saving and loading of ``LightGBMModel``.
"""

import importlib
import os
import tempfile

import numpy as np
import polars as pl
import pytest

from src.training.components.models.lightgbm import LightGBMModel

# Fixed seed for the synthetic data sets: every run trains on the same data,
# and no test touches NumPy's process-wide random state.
_SEED = 42


class TestLightGBMModel:
    """Test suite for ``LightGBMModel``."""

    def test_init_default(self) -> None:
        """A model built without arguments exposes the default settings."""
        model = LightGBMModel()

        assert model.name == "lightgbm"
        assert model.params["objective"] == "regression"
        assert model.params["metric"] == "rmse"
        assert model.params["num_leaves"] == 31
        assert model.params["learning_rate"] == 0.05
        assert model.n_estimators == 100
        assert model.model is None

    def test_init_custom(self) -> None:
        """Constructor keyword arguments override the defaults."""
        model = LightGBMModel(
            objective="huber",
            metric="mae",
            num_leaves=50,
            learning_rate=0.1,
            n_estimators=200,
        )

        assert model.params["objective"] == "huber"
        assert model.params["metric"] == "mae"
        assert model.params["num_leaves"] == 50
        assert model.params["learning_rate"] == 0.1
        assert model.n_estimators == 200

    def test_fit_success(self) -> None:
        """``fit`` trains the model, records feature names and returns self."""
        # Small, simple regression data set.
        X = pl.DataFrame(
            {
                "feature1": [1.0, 2.0, 3.0, 4.0, 5.0],
                "feature2": [2.0, 4.0, 6.0, 8.0, 10.0],
            }
        )
        y = pl.Series("target", [1.5, 3.0, 4.5, 6.0, 7.5])

        model = LightGBMModel(n_estimators=10)
        result = model.fit(X, y)

        # fit() returns the instance itself, which allows call chaining.
        assert result is model
        # The underlying model has been created.
        assert model.model is not None
        # Feature names are stored in column order.
        assert model.feature_names_ == ["feature1", "feature2"]

    def test_predict_before_fit(self) -> None:
        """Predicting with an unfitted model raises ``RuntimeError``."""
        X = pl.DataFrame(
            {
                "feature1": [1.0, 2.0],
                "feature2": [2.0, 4.0],
            }
        )

        model = LightGBMModel()
        with pytest.raises(RuntimeError, match="模型尚未训练"):
            model.predict(X)

    def test_predict_success(self) -> None:
        """A fitted model returns finite predictions ordered as expected."""
        # Local generator: reproducible without mutating the global RNG state.
        rng = np.random.default_rng(_SEED)
        n_samples = 100

        X_train = pl.DataFrame(
            {
                "feature1": rng.standard_normal(n_samples),
                "feature2": rng.standard_normal(n_samples),
            }
        )
        # y = 2*feature1 + 3*feature2 + noise
        y_train = pl.Series(
            "target",
            2 * X_train["feature1"]
            + 3 * X_train["feature2"]
            + rng.standard_normal(n_samples) * 0.1,
        )

        model = LightGBMModel(n_estimators=20, learning_rate=0.1)
        model.fit(X_train, y_train)

        # Predict on new data with clearly different values.
        X_test = pl.DataFrame(
            {
                "feature1": [-2.0, 3.0],
                "feature2": [-1.0, 4.0],
            }
        )
        predictions = model.predict(X_test)

        # Output format: one value per input row, as a NumPy array.
        assert isinstance(predictions, np.ndarray)
        assert len(predictions) == 2
        # Every prediction is a finite number.
        assert np.isfinite(predictions).all()
        # Monotonicity: the second sample has larger feature values, and both
        # coefficients are positive, so its prediction should be larger.
        assert predictions[1] > predictions[0]

    def test_feature_importance_before_fit(self) -> None:
        """``feature_importance`` returns ``None`` for an unfitted model."""
        model = LightGBMModel()
        assert model.feature_importance() is None

    def test_feature_importance_after_fit(self) -> None:
        """After fitting, importances are reported per feature name."""
        # Seeded so the importance ordering below is checked on fixed data.
        rng = np.random.default_rng(_SEED)
        X = pl.DataFrame(
            {
                "feature1": rng.standard_normal(100),
                "feature2": rng.standard_normal(100),
            }
        )
        y = pl.Series("target", X["feature1"] * 2 + X["feature2"] * 0.1)

        model = LightGBMModel(n_estimators=10)
        model.fit(X, y)

        importance = model.feature_importance()

        # Format: one entry per feature, indexed by feature name.
        assert importance is not None
        assert len(importance) == 2
        assert "feature1" in importance.index
        assert "feature2" in importance.index
        # feature1 has the larger coefficient, so it should be at least as
        # important as feature2.
        assert importance["feature1"] >= importance["feature2"]

    def test_save_before_fit(self) -> None:
        """Saving an unfitted model raises ``RuntimeError``."""
        model = LightGBMModel()

        # Target a temporary directory so that a regression in the "not
        # fitted" guard cannot leave a stray file in the working directory.
        with tempfile.TemporaryDirectory() as tmpdir:
            save_path = os.path.join(tmpdir, "dummy.txt")
            with pytest.raises(RuntimeError, match="模型尚未训练"):
                model.save(save_path)

    def test_save_and_load(self) -> None:
        """A saved model reloads with the same predictions and metadata."""
        # Train a model.
        X = pl.DataFrame(
            {
                "feature1": [1.0, 2.0, 3.0, 4.0, 5.0],
                "feature2": [2.0, 4.0, 6.0, 8.0, 10.0],
            }
        )
        y = pl.Series("target", [2.0, 4.0, 6.0, 8.0, 10.0])

        model = LightGBMModel(n_estimators=10, learning_rate=0.1)
        model.fit(X, y)

        # Reference prediction taken before saving.
        X_test = pl.DataFrame(
            {
                "feature1": [6.0],
                "feature2": [12.0],
            }
        )
        pred_before = model.predict(X_test)

        # Round-trip through a temporary file.
        with tempfile.TemporaryDirectory() as tmpdir:
            save_path = os.path.join(tmpdir, "model.txt")
            model.save(save_path)

            loaded_model = LightGBMModel.load(save_path)

            # The loaded model predicts the same value.
            pred_after = loaded_model.predict(X_test)
            assert pred_after[0] == pytest.approx(pred_before[0], rel=1e-5)

            # Metadata has been restored.
            assert loaded_model.feature_names_ == ["feature1", "feature2"]
            assert loaded_model.n_estimators == 10

    def test_registration(self) -> None:
        """The model class is registered in ``ModelRegistry`` as "lightgbm"."""
        from src.training.registry import ModelRegistry

        # Reload the model module to make sure it is registered, in case
        # another test cleared the registry.
        import src.training.components.models.lightgbm as lightgbm_module

        importlib.reload(lightgbm_module)

        # Reloading re-executes the module and so defines a new class object;
        # compare against that one, not the class imported at the top of
        # this file.
        from src.training.components.models.lightgbm import (
            LightGBMModel as ReloadedModel,
        )

        model_class = ModelRegistry.get_model("lightgbm")
        assert model_class is ReloadedModel

    def test_fit_predict_consistency(self) -> None:
        """Repeated predictions on the same input give the same result."""
        rng = np.random.default_rng(_SEED)
        X = pl.DataFrame(
            {
                "feature1": rng.standard_normal(50),
                "feature2": rng.standard_normal(50),
            }
        )
        y = pl.Series("target", X["feature1"] + X["feature2"])

        model = LightGBMModel(n_estimators=10)
        model.fit(X, y)

        X_test = pl.DataFrame(
            {
                "feature1": [1.0, 2.0, 3.0],
                "feature2": [1.0, 2.0, 3.0],
            }
        )

        # Predicting twice must return the same values.
        pred1 = model.predict(X_test)
        pred2 = model.predict(X_test)

        np.testing.assert_array_almost_equal(pred1, pred2)


if __name__ == "__main__":
    pytest.main([__file__, "-v"])