二分类问题 #
案例概述 #
本案例使用 scikit-learn 自带的乳腺癌数据集,演示 LightGBM 二分类任务的完整流程:数据加载、预处理、模型训练、模型评估与模型保存。首先导入依赖并加载数据:
python
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
accuracy_score, roc_auc_score, precision_score,
recall_score, f1_score, classification_report,
confusion_matrix, roc_curve
)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Load the breast cancer dataset from sklearn.datasets.
data = load_breast_cancer()

# Labels first, then the feature matrix wrapped in a DataFrame so that the
# columns carry the dataset's feature names.
y = data["target"]
X = pd.DataFrame(data["data"], columns=data["feature_names"])

# Quick summary: matrix shape, per-class sample counts, and class names.
print(f"数据形状: {X.shape}")
print(f"类别分布: {np.bincount(y)}")
print(f"类别名称: {data.target_names}")
数据预处理 #
python
# Hold out 20% of the samples as the test set. The split is stratified on the
# labels and uses a fixed seed so it is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"训练集大小: {X_train.shape[0]}")
print(f"测试集大小: {X_test.shape[0]}")
模型训练 #
python
# Carve a validation split out of the training data. Early stopping chooses
# the number of boosting rounds from the validation metric, so monitoring the
# test set here would leak it into model selection and make the test metrics
# reported afterwards optimistic.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# The scaled matrices no longer carry column names (StandardScaler returns
# plain arrays by default), so hand the names to LightGBM explicitly; otherwise
# the booster and its importance plot only know "Column_0", "Column_1", ...
# Spaces are replaced up front because LightGBM does not keep whitespace in
# feature names.
feature_names = [str(name).replace(' ', '_') for name in data.feature_names]

train_data = lgb.Dataset(X_fit, label=y_fit, feature_name=feature_names)
# `reference` makes the validation set reuse the training set's binning.
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

params = {
    'objective': 'binary',
    'metric': ['auc', 'binary_logloss'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,   # fraction of features sampled per tree
    'bagging_fraction': 0.8,   # fraction of rows sampled when bagging
    'bagging_freq': 5,         # re-sample the bag every 5 iterations
    'verbose': -1
}

# Filled in by lgb.record_evaluation with the per-iteration metric history
# for every entry in valid_sets.
evals_result = {}

model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    # The training set is included so its curve can be plotted next to the
    # validation curve.
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.log_evaluation(100),              # log metrics every 100 rounds
        lgb.early_stopping(50),               # stop after 50 rounds without improvement
        lgb.record_evaluation(evals_result)
    ]
)

print(f"\n最佳迭代: {model.best_iteration}")
print(f"最佳 AUC: {model.best_score['valid']['auc']:.4f}")
模型评估 #
python
# Score the test set using the trees up to the best iteration, then threshold
# the predicted probabilities at 0.5 to obtain hard class labels.
y_pred_proba = model.predict(X_test_scaled, num_iteration=model.best_iteration)
y_pred = (y_pred_proba > 0.5).astype(int)

# ---- Text report ---------------------------------------------------------
print("\n" + "="*50)
print("模型评估报告")
print("="*50)

print(f"\n准确率: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
print(f"精确率: {precision_score(y_test, y_pred):.4f}")
print(f"召回率: {recall_score(y_test, y_pred):.4f}")
print(f"F1 分数: {f1_score(y_test, y_pred):.4f}")

print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# ---- 2x2 diagnostic figure -----------------------------------------------
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 12))
(ax_cm, ax_roc), (ax_curve, ax_importance) = axes

# Top-left: confusion matrix, annotated with integer counts.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm,
    xticklabels=data.target_names, yticklabels=data.target_names,
)
ax_cm.set(xlabel='预测值', ylabel='真实值', title='混淆矩阵')

# Top-right: ROC curve, with the diagonal as the chance-level reference.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
ax_roc.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.4f}')
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set(xlabel='假正率', ylabel='真正率', title='ROC 曲线')
ax_roc.legend()
ax_roc.grid(True)

# Bottom-left: per-iteration AUC recorded during training for both sets.
for split_name, legend_label in (('train', '训练集'), ('valid', '验证集')):
    ax_curve.plot(evals_result[split_name]['auc'], label=legend_label)
ax_curve.set(xlabel='迭代次数', ylabel='AUC', title='学习曲线')
ax_curve.legend()
ax_curve.grid(True)

# Bottom-right: top 15 features by importance; the title is set afterwards
# so it replaces the one plot_importance puts on the axes.
lgb.plot_importance(model, max_num_features=15, ax=ax_importance)
ax_importance.set_title('特征重要性')

plt.tight_layout()
plt.show()
模型保存 #
python
# Persist the trained booster to a text file in the working directory.
model_path = 'binary_classification_model.txt'
model.save_model(model_path)
print("模型已保存到 binary_classification_model.txt")

# Round-trip check: reload the booster from disk and score the test set again.
loaded_model = lgb.Booster(model_file=model_path)
y_pred_loaded = loaded_model.predict(X_test_scaled)
print(f"加载模型 AUC: {roc_auc_score(y_test, y_pred_loaded):.4f}")
下一步 #
现在你已经完成了二分类实战。接下来请学习“多分类问题”,了解如何处理多分类任务!
最后更新:2026-04-04