Your First Model #
Overview #
This chapter walks you through training a complete XGBoost model from scratch, covering the following steps:
text
┌────────────────────────────────────────────────────────────┐
│                 XGBoost Training Workflow                  │
├────────────────────────────────────────────────────────────┤
│                                                            │
│  Data Prep → DMatrix → Parameters → Training → Evaluation  │
│                                                            │
└────────────────────────────────────────────────────────────┘
Complete Example: Binary Classification #
1. Import Libraries and Data #
python
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names
print(f"数据集形状: {X.shape}")
print(f"特征数量: {X.shape[1]}")
print(f"类别: {target_names}")
print(f"类别分布: {np.bincount(y)}")
2. Split the Data #
python
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
3. Create the DMatrix #
python
# Create DMatrix objects (XGBoost's core data structure)
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)
print("DMatrix created successfully")
print(f"Number of features: {dtrain.num_features()}")
print(f"Number of rows: {dtrain.num_row()}")
4. Configure Parameters #
python
# XGBoost parameter configuration
params = {
    # objective function
    'objective': 'binary:logistic',
    # evaluation metrics
    'eval_metric': ['logloss', 'auc', 'error'],
    # tree parameters
    'max_depth': 6,
    'min_child_weight': 1,
    # learning rate
    'eta': 0.1,
    # sampling parameters
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # regularization
    'lambda': 1,
    'alpha': 0,
    # misc
    'seed': 42,
    'verbosity': 1
}
print("Parameter configuration:")
for key, value in params.items():
    print(f"  {key}: {value}")
5. Train the Model #
python
# Train the model
evals_result = {}
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train'), (dtest, 'eval')],
    early_stopping_rounds=10,
    evals_result=evals_result,
    verbose_eval=10
)
print(f"\nBest iteration: {model.best_iteration}")
print(f"Best score: {model.best_score}")
6. Make Predictions #
python
# Predict probabilities
y_pred_proba = model.predict(dtest)
# Convert probabilities to class labels
y_pred = (y_pred_proba > 0.5).astype(int)
print("Sample predicted probabilities:")
print(y_pred_proba[:5])
print("\nSample predicted classes:")
print(y_pred[:5])
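When early stopping is enabled, recent XGBoost versions already predict with the best iteration by default, but you can make that explicit with iteration_range. A sketch; the exact default behaviour depends on your XGBoost version:
python
# Predict using only the trees up to the best iteration found by early stopping.
y_pred_proba_best = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
y_pred_best = (y_pred_proba_best > 0.5).astype(int)
print(y_pred_best[:5])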
7. Evaluate the Model #
python
# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Detailed classification report
print("\nClassification report:")
print(classification_report(y_test, y_pred, target_names=target_names))
# Confusion matrix
print("\nConfusion matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)
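Accuracy and the confusion matrix use the thresholded labels; the predicted probabilities can also be scored directly, for example with ROC AUC:
python
from sklearn.metrics import roc_auc_score

# AUC is computed from the predicted probabilities, not the hard labels.
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Test AUC: {auc:.4f}")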
8. Visualization #
python
# Plot the training curves
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# Log Loss
axes[0].plot(evals_result['train']['logloss'], label='Train')
axes[0].plot(evals_result['eval']['logloss'], label='Eval')
axes[0].set_xlabel('Round')
axes[0].set_ylabel('Log Loss')
axes[0].set_title('Log Loss')
axes[0].legend()
axes[0].grid(True)
# AUC
axes[1].plot(evals_result['train']['auc'], label='Train')
axes[1].plot(evals_result['eval']['auc'], label='Eval')
axes[1].set_xlabel('Round')
axes[1].set_ylabel('AUC')
axes[1].set_title('AUC')
axes[1].legend()
axes[1].grid(True)
# Error
axes[2].plot(evals_result['train']['error'], label='Train')
axes[2].plot(evals_result['eval']['error'], label='Eval')
axes[2].set_xlabel('Round')
axes[2].set_ylabel('Error')
axes[2].set_title('Error Rate')
axes[2].legend()
axes[2].grid(True)
plt.tight_layout()
plt.show()
# Feature importance (pass the axes explicitly so figsize takes effect)
fig, ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(model, max_num_features=15, importance_type='gain', ax=ax)
ax.set_title('Feature Importance (Gain)')
plt.tight_layout()
plt.show()
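If you also want to inspect individual trees, xgb.plot_tree can render a single tree of the ensemble. A sketch; it requires the graphviz package to be installed:
python
# Optional: visualize the first tree in the ensemble (requires graphviz).
fig, ax = plt.subplots(figsize=(30, 15))
xgb.plot_tree(model, num_trees=0, ax=ax)
plt.show()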
Using the Scikit-Learn API #
XGBoost also provides an API that is compatible with scikit-learn:
Classifier #
python
from xgboost import XGBClassifier
# Create the classifier
clf = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
    early_stopping_rounds=10,
    random_state=42
)
# Train
clf.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=10
)
# Predict
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
Regressor #
python
from xgboost import XGBRegressor
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error, r2_score
# Load a regression dataset
data = load_diabetes()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create the regressor
reg = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='reg:squarederror',
    eval_metric='rmse',
    early_stopping_rounds=10,
    random_state=42
)
# Train
reg.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=10
)
# Predict
y_pred = reg.predict(X_test)
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")
Multi-Class Example #
python
from sklearn.datasets import load_iris
# Load the iris dataset
data = load_iris()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create DMatrix objects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Multi-class parameters
params = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'max_depth': 4,
    'eta': 0.1
}
# Train
model = xgb.train(params, dtrain, num_boost_round=100)
# Predict (returns one probability per class)
y_pred_proba = model.predict(dtest)
y_pred = np.argmax(y_pred_proba, axis=1)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
Saving and Loading Models #
Save a Model #
python
# Save in JSON format (recommended)
model.save_model('xgboost_model.json')
# Save in the legacy binary format (deprecated in recent XGBoost versions)
model.save_model('xgboost_model.bin')
# Save feature importances
importance = model.get_score(importance_type='gain')
pd.Series(importance).to_csv('feature_importance.csv')
Load a Model #
python
# Load the model
loaded_model = xgb.Booster()
loaded_model.load_model('xgboost_model.json')
# Predict with the loaded model
y_pred = loaded_model.predict(dtest)
print("Prediction with the loaded model succeeded")
Using joblib #
python
import joblib
# Save
joblib.dump(clf, 'xgboost_classifier.joblib')
# Load
loaded_clf = joblib.load('xgboost_classifier.joblib')
y_pred = loaded_clf.predict(X_test)
Complete Code Template #
python
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_breast_cancer


def train_xgboost_model():
    # 1. Load the data
    data = load_breast_cancer()
    X, y = data.data, data.target
    feature_names = data.feature_names
    # 2. Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # 3. Create DMatrix objects
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)
    # 4. Configure parameters
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 6,
        'eta': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': 42
    }
    # 5. Train the model
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=100,
        evals=[(dtrain, 'train'), (dtest, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=20
    )
    # 6. Predict
    y_pred_proba = model.predict(dtest)
    y_pred = (y_pred_proba > 0.5).astype(int)
    # 7. Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {accuracy:.4f}")
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
    # 8. Save the model
    model.save_model('model.json')
    return model


if __name__ == "__main__":
    model = train_xgboost_model()
Key Concepts Recap #
DMatrix #
python
# DMatrix is XGBoost's core data structure
dtrain = xgb.DMatrix(
    data=X,               # feature matrix
    label=y,              # labels
    weight=None,          # sample weights
    base_margin=None,     # initial prediction values
    missing=np.nan,       # value that marks missing entries
    silent=False,         # suppress construction messages
    feature_names=None,   # feature names
    feature_types=None    # feature types
)
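DMatrix also accepts a pandas DataFrame directly, in which case the column names are picked up as feature names automatically. A minimal sketch, reusing X, y, and feature_names from the binary classification example:
python
# Build a DMatrix from a pandas DataFrame; column names become feature names.
df = pd.DataFrame(X, columns=feature_names)
dmat = xgb.DMatrix(df, label=y)
print(dmat.feature_names[:5])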
Core Parameters #
| Parameter | Description | Typical Values |
|---|---|---|
| objective | Objective function | binary:logistic, multi:softprob, reg:squarederror |
| eval_metric | Evaluation metric | logloss, auc, error, rmse |
| max_depth | Maximum tree depth | 3-10 |
| eta | Learning rate | 0.01-0.3 |
| n_estimators | Number of boosting rounds (num_boost_round in the native API) | 100-1000 |
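Note that the native API and the scikit-learn wrapper spell several of these parameters differently. The snippet below shows a few common equivalences; the values are illustrative only:
python
from xgboost import XGBClassifier

# Native API name  -> sklearn wrapper name
#   eta            -> learning_rate
#   lambda         -> reg_lambda
#   alpha          -> reg_alpha
#   num_boost_round (xgb.train argument) -> n_estimators
native_params = {'eta': 0.1, 'lambda': 1, 'alpha': 0, 'max_depth': 6}
sk_clf = XGBClassifier(learning_rate=0.1, reg_lambda=1, reg_alpha=0,
                       max_depth=6, n_estimators=100)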
Training Workflow #
python
# Standard training call
model = xgb.train(
    params,                        # parameter dict
    dtrain,                        # training data
    num_boost_round=100,           # number of boosting rounds
    evals=[(dtrain, 'train')],     # evaluation data
    early_stopping_rounds=10,      # early-stopping patience
    verbose_eval=10                # logging frequency
)
Next Steps #
Now that you have trained your first XGBoost model, continue with Gradient Boosting Principles (梯度提升原理) to understand how XGBoost works under the hood!
Last updated: 2026-04-04