第一个模型 #
准备工作 #
导入必要的库 #
python
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_iris, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
import matplotlib.pyplot as plt
准备数据 #
使用 sklearn 自带的数据集:
python
# Load the breast-cancer dataset bundled with scikit-learn (binary labels).
data = load_breast_cancer()
# Wrap the feature matrix in a DataFrame so columns keep their names.
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
print(f"数据形状: {X.shape}")
print(f"特征数量: {X.shape[1]}")
print(f"类别分布: {np.bincount(y)}")
二分类模型 #
数据划分 #
python
# Hold out 20% as a test set; stratify=y keeps the class ratio identical
# in both splits, and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"训练集大小: {X_train.shape[0]}")
print(f"测试集大小: {X_test.shape[0]}")
创建 Dataset #
LightGBM 使用 Dataset 对象存储数据:
python
# Wrap the splits in LightGBM's Dataset container; reference= makes the
# validation set reuse the training set's feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
设置参数 #
python
# Binary-classification objective scored by AUC. feature_fraction and
# bagging_fraction/bagging_freq subsample columns and rows to regularize;
# verbose=-1 silences LightGBM's internal logging.
params = {
'objective': 'binary',
'metric': 'auc',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': -1
}
训练模型 #
python
# Train for up to 1000 rounds, logging metrics every 100 rounds and
# stopping early once the validation metric has not improved for 50 rounds.
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[train_data, valid_data],
valid_names=['train', 'valid'],
callbacks=[
lgb.log_evaluation(100),
lgb.early_stopping(stopping_rounds=50)
]
)
# best_score is keyed by the valid_names given above.
print(f"最佳迭代次数: {model.best_iteration}")
print(f"最佳 AUC: {model.best_score['valid']['auc']:.4f}")
预测 #
python
# predict() returns the positive-class probability; threshold at 0.5 for
# hard labels. num_iteration pins prediction to the best early-stopped round.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_class = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred_class)
auc = roc_auc_score(y_test, y_pred)
print(f"准确率: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
特征重要性 #
python
# Plot the 10 most important features of the trained booster.
lgb.plot_importance(model, max_num_features=10, figsize=(10, 6))
plt.title("特征重要性")
plt.tight_layout()
plt.show()
使用 Scikit-Learn API #
LightGBM 提供了兼容 scikit-learn 的 API:
LGBMClassifier #
python
# The same binary model through the scikit-learn-compatible estimator API.
from lightgbm import LGBMClassifier
clf = LGBMClassifier(
num_leaves=31,
learning_rate=0.05,
n_estimators=1000,
objective='binary',
random_state=42
)
# eval_set supplies validation data so the early_stopping callback can act.
clf.fit(
X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='auc',
callbacks=[
lgb.log_evaluation(100),
lgb.early_stopping(50)
]
)
# predict() gives hard labels; predict_proba()[:, 1] the positive-class probability.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
LGBMRegressor #
python
# Regression with the sklearn-style estimator on the diabetes dataset.
from lightgbm import LGBMRegressor
data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
data.data, data.target, test_size=0.2, random_state=42
)
reg = LGBMRegressor(
num_leaves=31,
learning_rate=0.05,
n_estimators=1000,
random_state=42
)
# Early-stop on validation RMSE, logging every 100 rounds.
reg.fit(
X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='rmse',
callbacks=[
lgb.log_evaluation(100),
lgb.early_stopping(50)
]
)
y_pred = reg.predict(X_test)
# RMSE = sqrt(MSE); lower is better.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.4f}")
多分类模型 #
准备数据 #
python
# Three-class iris dataset; the stratified split keeps per-class ratios.
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
data.data, data.target, test_size=0.2, random_state=42, stratify=data.target
)
print(f"类别数量: {len(np.unique(y_train))}")
print(f"类别分布: {np.bincount(y_train)}")
训练模型 #
python
# The multiclass objective requires num_class; multi_logloss is the
# validation metric used for early stopping.
params = {
'objective': 'multiclass',
'num_class': 3,
'metric': 'multi_logloss',
'num_leaves': 31,
'learning_rate': 0.05,
'verbose': -1
}
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[valid_data],
callbacks=[
lgb.log_evaluation(100),
lgb.early_stopping(50)
]
)
预测 #
python
# For multiclass, predict() returns an (n_samples, num_class) probability
# matrix; argmax over axis 1 picks the predicted class index.
y_pred_proba = model.predict(X_test, num_iteration=model.best_iteration)
y_pred = np.argmax(y_pred_proba, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print(f"准确率: {accuracy:.4f}")
回归模型 #
准备数据 #
python
# Diabetes dataset: the target is a continuous disease-progression score.
data = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
data.data, data.target, test_size=0.2, random_state=42
)
print(f"目标值范围: [{y_train.min():.2f}, {y_train.max():.2f}]")
print(f"目标值均值: {y_train.mean():.2f}")
训练模型 #
python
# 'regression' fits with L2 loss; RMSE on the validation set drives
# early stopping.
params = {
'objective': 'regression',
'metric': 'rmse',
'num_leaves': 31,
'learning_rate': 0.05,
'verbose': -1
}
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[valid_data],
callbacks=[
lgb.log_evaluation(100),
lgb.early_stopping(50)
]
)
预测和评估 #
python
# Evaluate on the held-out set: RMSE and mean absolute error.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = np.mean(np.abs(y_test - y_pred))
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
可视化预测结果 #
python
# Scatter of true vs. predicted values; the dashed red diagonal marks
# perfect predictions.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('真实值')
plt.ylabel('预测值')
plt.title('回归预测结果')
plt.tight_layout()
plt.show()
模型保存与加载 #
保存模型 #
python
# Save in LightGBM's native text format (reloadable via lgb.Booster).
model.save_model('lightgbm_model.txt')
# NOTE: Booster.save_model() always writes the native text format regardless
# of the file extension, so saving to '*.json' would NOT produce JSON.
# Use dump_model(), which returns a JSON-serializable dict, for a real JSON copy.
import json
with open('lightgbm_model.json', 'w', encoding='utf-8') as f:
    json.dump(model.dump_model(), f)
加载模型 #
python
# Reload the text-format model; predictions match the original booster.
loaded_model = lgb.Booster(model_file='lightgbm_model.txt')
y_pred = loaded_model.predict(X_test)
使用 joblib 保存 #
python
# joblib pickles the whole sklearn-style estimator, hyperparameters included.
import joblib
joblib.dump(clf, 'lgbm_classifier.pkl')
loaded_clf = joblib.load('lgbm_classifier.pkl')
完整示例 #
二分类完整流程 #
python
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt
# End-to-end binary example: stratified split, LightGBM Datasets, and
# parameters tracking both AUC and binary log-loss during training.
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
data.data, data.target, test_size=0.2, random_state=42, stratify=data.target
)
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
'objective': 'binary',
'metric': ['auc', 'binary_logloss'],
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': -1
}
print("开始训练...")
# record_evaluation captures per-round metric values into this dict so the
# learning curve can be plotted after training: lgb.plot_metric() accepts an
# evals_result dict (or an LGBMModel) — passing a raw Booster raises TypeError.
evals_result = {}
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.log_evaluation(100),
        lgb.early_stopping(50),
        lgb.record_evaluation(evals_result),
    ]
)
print(f"\n最佳迭代次数: {model.best_iteration}")
print(f"最佳 AUC: {model.best_score['valid']['auc']:.4f}")
# Evaluate on the held-out test set at the best early-stopped iteration.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_class = (y_pred > 0.5).astype(int)
print("\n测试集评估:")
print(f"准确率: {accuracy_score(y_test, y_pred_class):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_pred):.4f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred_class, target_names=data.target_names))
# Side-by-side figure: feature importances and the recorded AUC curve.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
lgb.plot_importance(model, max_num_features=10, ax=axes[0])
axes[0].set_title("特征重要性")
lgb.plot_metric(evals_result, metric='auc', ax=axes[1])
axes[1].set_title("AUC 变化曲线")
plt.tight_layout()
plt.show()
# Persist the trained booster in native text format for later reuse.
model.save_model('breast_cancer_model.txt')
print("\n模型已保存到 breast_cancer_model.txt")
学习要点 #
1. Dataset 对象 #
python
# Dataset is LightGBM's binned data container; reference= lets the
# validation set share the training set's feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
- Dataset 是 LightGBM 的核心数据结构
- reference 参数确保验证集使用与训练集相同的特征分箱(bin 边界)
2. 参数配置 #
python
# Minimal parameter set: objective, metric, tree size, and learning rate.
params = {
'objective': 'binary',
'metric': 'auc',
'num_leaves': 31,
'learning_rate': 0.05
}
- objective: 目标函数(binary, multiclass, regression 等)
- metric: 评估指标
- num_leaves: 叶子节点数量
- learning_rate: 学习率
3. 训练控制 #
python
# Training loop with early stopping and periodic metric logging.
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[valid_data],
callbacks=[
lgb.log_evaluation(100),
lgb.early_stopping(50)
]
)
- num_boost_round: 最大迭代次数
- early_stopping: 早停机制
- log_evaluation: 日志输出频率
4. 预测方法 #
python
# Predict using only the best early-stopped iteration.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
- predict 返回概率值(分类)或预测值(回归)
- num_iteration 指定使用的迭代次数
下一步 #
现在你已经学会了如何训练第一个 LightGBM 模型,接下来学习 核心概念,深入理解 LightGBM 的工作原理!
最后更新:2026-04-04