# Model Evaluation

## Evaluation Metrics

### Classification Metrics

#### AUC (Area Under Curve)

```python
from sklearn.metrics import roc_auc_score

# For a binary-objective Booster, predict() returns positive-class probabilities
y_pred_proba = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC: {auc:.4f}")
```
#### Accuracy

```python
from sklearn.metrics import accuracy_score

# Binarize probabilities at a 0.5 threshold
y_pred = (y_pred_proba > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
```
#### Precision and Recall

```python
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 score: {f1:.4f}")
```
#### Classification Report

```python
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))
```
#### Confusion Matrix

```python
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
```
#### ROC Curve

```python
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve')
plt.legend()
plt.grid(True)
plt.show()
```
### Regression Metrics

#### RMSE (Root Mean Squared Error)

```python
from sklearn.metrics import mean_squared_error
import numpy as np

y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.4f}")
```
#### MAE (Mean Absolute Error)

```python
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.4f}")
```
#### R² (Coefficient of Determination)

```python
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print(f"R²: {r2:.4f}")
```
#### MAPE (Mean Absolute Percentage Error)

```python
def mape(y_true, y_pred):
    # Note: undefined when y_true contains zeros
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_value = mape(y_test, y_pred)
print(f"MAPE: {mape_value:.2f}%")
```
### Multiclass Metrics

```python
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# For a multiclass Booster, predict() returns one probability column per class
y_pred_proba = model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
```
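The snippet above assumes `model` was trained with a multiclass objective. A minimal, hypothetical setup (the class count `num_class=3` is illustrative and must match your labels):

```python
# Hypothetical multiclass training setup; num_class must equal the number of label classes
params_multi = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'verbose': -1
}
model = lgb.train(params_multi, lgb.Dataset(X_train, label=y_train), num_boost_round=100)
```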
## Cross-Validation

### Using the cv Function

```python
import lightgbm as lgb

data = lgb.Dataset(X, label=y)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}
cv_results = lgb.cv(
    params,
    data,
    num_boost_round=1000,
    nfold=5,
    stratified=True,
    callbacks=[
        lgb.log_evaluation(100),
        lgb.early_stopping(50)
    ]
)
# Note: in LightGBM >= 4.0 the result keys include the dataset name,
# e.g. 'valid auc-mean' instead of 'auc-mean'
print(f"Best iteration: {len(cv_results['auc-mean'])}")
print(f"Best AUC: {max(cv_results['auc-mean']):.4f}")
```
### cv Parameters in Detail

```python
cv_results = lgb.cv(
    params,
    data,
    num_boost_round=1000,
    nfold=5,
    stratified=True,
    shuffle=True,
    metrics='auc',
    seed=42,
    callbacks=[
        lgb.log_evaluation(100),
        lgb.early_stopping(50)
    ]
)
```

| Parameter | Description |
|---|---|
| params | Parameter dictionary |
| train_set | Training data |
| num_boost_round | Maximum number of boosting rounds |
| nfold | Number of folds |
| stratified | Whether to use stratified sampling |
| shuffle | Whether to shuffle the data |
| metrics | Evaluation metric(s); overrides `metric` in `params` |
| seed | Random seed |
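`lgb.cv` returns a plain dict of per-iteration fold statistics. A minimal sketch for reading it, assuming LightGBM 3.x key names (4.x prefixes them with the dataset name, e.g. `valid auc-mean`):

```python
# '<metric>-mean' and '<metric>-stdv' each map to one value per boosting round
best_round = len(cv_results['auc-mean'])
print(f"Round {best_round}: "
      f"AUC = {cv_results['auc-mean'][-1]:.4f} "
      f"(+/- {cv_results['auc-stdv'][-1]:.4f})")
```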
### Cross-Validation with sklearn

```python
from sklearn.model_selection import cross_val_score, StratifiedKFold
from lightgbm import LGBMClassifier

clf = LGBMClassifier(
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100,
    random_state=42,
    verbose=-1
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
print(f"Cross-validation AUC: {scores.mean():.4f} (+/- {scores.std():.4f})")
```
### Group Cross-Validation

```python
from sklearn.model_selection import GroupKFold
import numpy as np

# Placeholder group labels; in practice use a real grouping column (e.g. user ID)
groups = np.random.randint(0, 10, len(X))
gkf = GroupKFold(n_splits=5)
for train_idx, val_idx in gkf.split(X, y, groups):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    train_data = lgb.Dataset(X_train, label=y_train)
    model = lgb.train(params, train_data, num_boost_round=100)
```
### Time-Series Cross-Validation

```python
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    train_data = lgb.Dataset(X_train, label=y_train)
    model = lgb.train(params, train_data, num_boost_round=100)
```
## Feature Importance

### Getting Feature Importance

```python
import pandas as pd

importance_split = model.feature_importance(importance_type='split')
importance_gain = model.feature_importance(importance_type='gain')
feature_names = model.feature_name()
importance_df = pd.DataFrame({
    'feature': feature_names,
    'split': importance_split,
    'gain': importance_gain
})
importance_df = importance_df.sort_values('gain', ascending=False)
print(importance_df.head(10))
```
### Visualizing Feature Importance

```python
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
lgb.plot_importance(model, importance_type='split', max_num_features=10, ax=axes[0])
axes[0].set_title('Feature Importance (Split)')
lgb.plot_importance(model, importance_type='gain', max_num_features=10, ax=axes[1])
axes[1].set_title('Feature Importance (Gain)')
plt.tight_layout()
plt.show()
```
### Importance Types

| Type | Meaning | When to Use |
|---|---|---|
| split | Number of times the feature is used in a split | Understanding how often a feature is used |
| gain | Total gain contributed by the feature's splits | Understanding how much a feature contributes |
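The two rankings often disagree: a feature split on frequently may still contribute little total gain. A small sketch that normalizes both columns of the `importance_df` built above so they can be compared directly:

```python
# Convert raw counts/gains to fractions of the total for a fair comparison
compare_df = importance_df.copy()
compare_df['split_pct'] = compare_df['split'] / compare_df['split'].sum()
compare_df['gain_pct'] = compare_df['gain'] / compare_df['gain'].sum()
print(compare_df.sort_values('gain_pct', ascending=False).head(10))
```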
### Feature Selection Based on Importance

```python
top_k = 10
top_features = importance_df.head(top_k)['feature'].tolist()
X_selected = pd.DataFrame(X, columns=feature_names)[top_features]
train_data = lgb.Dataset(X_selected, label=y)
model = lgb.train(params, train_data, num_boost_round=100)
```
## Model Diagnostics

### Learning Curves

```python
evals_result = {}
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=[lgb.record_evaluation(evals_result)]
)
plt.figure(figsize=(10, 6))
plt.plot(evals_result['train']['auc'], label='Train')
plt.plot(evals_result['valid']['auc'], label='Validation')
plt.xlabel('Iteration')
plt.ylabel('AUC')
plt.title('Learning Curve')
plt.legend()
plt.grid(True)
plt.show()
```
### Residual Analysis (Regression)

```python
y_pred = model.predict(X_test)
residuals = y_test - y_pred
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].scatter(y_pred, residuals, alpha=0.5)
axes[0].axhline(y=0, color='r', linestyle='--')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Residual')
axes[0].set_title('Residual Plot')
axes[1].hist(residuals, bins=30, edgecolor='black')
axes[1].set_xlabel('Residual')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Residual Distribution')
plt.tight_layout()
plt.show()
```
### Prediction Distribution

```python
y_pred = model.predict(X_test)
plt.figure(figsize=(10, 6))
plt.hist(y_pred[y_test == 0], bins=30, alpha=0.5, label='Class 0')
plt.hist(y_pred[y_test == 1], bins=30, alpha=0.5, label='Class 1')
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.title('Prediction Distribution')
plt.legend()
plt.show()
```
### Tree Visualization

```python
# Requires the graphviz package
lgb.plot_tree(model, tree_index=0, figsize=(20, 10))
plt.show()
```
## Model Comparison

### Comparing Parameter Settings

```python
results = []
param_sets = [
    {'num_leaves': 15},
    {'num_leaves': 31},
    {'num_leaves': 63}
]
for params_update in param_sets:
    params_copy = params.copy()
    params_copy.update(params_update)
    cv_results = lgb.cv(
        params_copy,
        data,
        num_boost_round=100,
        nfold=5,
        stratified=True,
        callbacks=[lgb.log_evaluation(0)]  # period=0 silences per-round logging
    )
    results.append({
        'params': params_update,
        'auc': max(cv_results['auc-mean'])
    })
results_df = pd.DataFrame(results)
print(results_df)
```
### Comparing Different Models

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier

models = {
    'LightGBM': LGBMClassifier(num_leaves=31, random_state=42, verbose=-1),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    # max_iter raised to avoid convergence warnings on unscaled data
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000)
}
results = []
for name, model in models.items():
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    results.append({
        'model': name,
        'mean_auc': cv_scores.mean(),
        'std_auc': cv_scores.std()
    })
results_df = pd.DataFrame(results)
print(results_df)
```
## Model Persistence

### Saving a Model

```python
model.save_model('model.txt')  # native text format

# save_model always writes the text format regardless of extension;
# for a JSON representation use dump_model()
import json
with open('model.json', 'w') as f:
    json.dump(model.dump_model(), f)

import joblib
joblib.dump(model, 'model.pkl')
```

### Loading a Model

```python
loaded_model = lgb.Booster(model_file='model.txt')

# Note: the JSON dump from dump_model() is for inspection only and cannot be reloaded
loaded_model = joblib.load('model.pkl')
```
### Model Information

```python
print(f"Number of features: {model.num_feature()}")
print(f"Number of trees: {model.num_trees()}")
print(f"Best iteration: {model.best_iteration}")
print(f"Model parameters: {model.params}")
```
## Complete Evaluation Example

```python
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, classification_report,
    confusion_matrix, roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Load data and make a stratified train/test split
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}

# Train with early stopping, recording the evaluation history
evals_result = {}
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.log_evaluation(100),
        lgb.early_stopping(50),
        lgb.record_evaluation(evals_result)
    ]
)

y_pred_proba = model.predict(X_test, num_iteration=model.best_iteration)
y_pred = (y_pred_proba > 0.5).astype(int)

print("\n" + "=" * 50)
print("Model Evaluation Report")
print("=" * 50)
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 score: {f1_score(y_test, y_pred):.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# Diagnostic plots: confusion matrix, ROC curve, learning curve, feature importance
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0])
axes[0, 0].set_xlabel('Predicted')
axes[0, 0].set_ylabel('Actual')
axes[0, 0].set_title('Confusion Matrix')

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
axes[0, 1].plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.4f}')
axes[0, 1].plot([0, 1], [0, 1], 'k--')
axes[0, 1].set_xlabel('False Positive Rate')
axes[0, 1].set_ylabel('True Positive Rate')
axes[0, 1].set_title('ROC Curve')
axes[0, 1].legend()
axes[0, 1].grid(True)

axes[1, 0].plot(evals_result['train']['auc'], label='Train')
axes[1, 0].plot(evals_result['valid']['auc'], label='Validation')
axes[1, 0].set_xlabel('Iteration')
axes[1, 0].set_ylabel('AUC')
axes[1, 0].set_title('Learning Curve')
axes[1, 0].legend()
axes[1, 0].grid(True)

lgb.plot_importance(model, max_num_features=10, ax=axes[1, 1])
axes[1, 1].set_title('Feature Importance')
plt.tight_layout()
plt.show()

# Cross-validated sanity check with the sklearn wrapper
clf = LGBMClassifier(num_leaves=31, random_state=42, verbose=-1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
print("\nCross-validation results:")
print(f"Mean AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
```
## Next Steps

Now that you have covered model evaluation in LightGBM, move on to GBDT fundamentals to dig into how gradient-boosted decision trees work!

Last updated: 2026-04-04