训练与评估 #
模型训练 #
基本训练 #
python
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer binary classification dataset
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)
# Wrap the arrays in XGBoost's optimized DMatrix container
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Training parameters: logistic objective for binary classification,
# log-loss as the evaluation metric, depth-6 trees, learning rate 0.1
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.1
}
# Train for 100 boosting rounds
model = xgb.train(params, dtrain, num_boost_round=100)
带验证集的训练 #
python
# Training with train/validation metrics monitored every round
evals_result = {}
model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[
        (dtrain, 'train'),
        (dtest, 'eval')
    ],
    evals_result=evals_result,  # per-round metric history is written here
    verbose_eval=10  # print metrics every 10 rounds
)
# Inspect the first few recorded log-loss values
print(evals_result['train']['logloss'][:5])
print(evals_result['eval']['logloss'][:5])
早停策略 #
python
# Use early stopping to guard against overfitting; the LAST entry in
# `evals` ('eval' here) is the set that drives the stopping decision
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'eval')],
    early_stopping_rounds=50,  # stop after 50 rounds without improvement
    verbose_eval=10
)
print(f"最佳迭代次数: {model.best_iteration}")
print(f"最佳评分: {model.best_score}")
使用 Scikit-Learn API #
python
from xgboost import XGBClassifier, XGBRegressor
# Classifier via the scikit-learn style API.
# NOTE: early_stopping_rounds is passed to the constructor here (the
# modern XGBoost convention), not to fit().
clf = XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
    early_stopping_rounds=50,
    random_state=42
)
clf.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=10  # print metrics every 10 rounds
)
# Regressor with squared-error objective and RMSE monitoring
reg = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    objective='reg:squarederror',
    eval_metric='rmse',
    early_stopping_rounds=50
)
reg.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=10
)
模型预测 #
预测方法 #
python
# Predicted probabilities (binary classification: one float per row)
y_pred_proba = model.predict(dtest)
# Threshold probabilities into hard class labels
y_pred = (y_pred_proba > 0.5).astype(int)
# Equivalent predictions through the scikit-learn API
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)
# Multi-class prediction.
# NOTE(review): this reuses the binary-labeled dtrain from above; it only
# runs because labels {0, 1} are < num_class. For a real 3-class task,
# build a DMatrix whose labels cover {0, 1, 2}.
params = {
    'objective': 'multi:softprob',
    'num_class': 3
}
model = xgb.train(params, dtrain, num_boost_round=100)
# softprob returns one probability per class (shape: n_samples x num_class)
y_pred_proba = model.predict(dtest)
y_pred = np.argmax(y_pred_proba, axis=1)
预测选项 #
python
# Index of the leaf each sample lands in, per tree
leaf_indices = model.predict(dtest, pred_leaf=True)
print(f"叶子索引形状: {leaf_indices.shape}")
# Per-feature contribution values (SHAP); the last column is the bias term
contributions = model.predict(dtest, pred_contribs=True)
print(f"贡献值形状: {contributions.shape}")
# Raw margin scores (predictions before the objective's transform,
# e.g. before the sigmoid for binary:logistic)
margin = model.predict(dtest, output_margin=True)
评估指标 #
分类评估 #
python
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, classification_report, confusion_matrix
)
# Predict probabilities, then threshold into class labels
y_pred_proba = model.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)
# Core classification metrics; note AUC and log-loss are computed from
# probabilities, the others from hard labels
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
print(f"精确率: {precision_score(y_test, y_pred):.4f}")
print(f"召回率: {recall_score(y_test, y_pred):.4f}")
print(f"F1 分数: {f1_score(y_test, y_pred):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
print(f"Log Loss: {log_loss(y_test, y_pred_proba):.4f}")
# Per-class precision/recall/F1 summary
print("\n分类报告:")
print(classification_report(y_test, y_pred))
# Confusion matrix (rows: true labels, columns: predicted labels)
print("\n混淆矩阵:")
print(confusion_matrix(y_test, y_pred))
回归评估 #
python
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error
)
# Predict on the test set.
# NOTE(review): assumes `model` was trained with a regression objective
# (e.g. reg:squarederror) — confirm before reusing this snippet as-is.
y_pred = model.predict(dtest)
# Standard regression metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")
print(f"MAPE: {mape:.4f}")
多分类评估 #
python
from sklearn.metrics import (
    accuracy_score, log_loss, classification_report
)
# Predict per-class probabilities, then take the argmax class
y_pred_proba = model.predict(dtest)
y_pred = np.argmax(y_pred_proba, axis=1)
# Evaluate: accuracy on hard labels, log-loss on probabilities
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
print(f"多分类 Log Loss: {log_loss(y_test, y_pred_proba):.4f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred))
自定义评估函数 #
python
def custom_eval(preds, dtrain):
    """
    Custom evaluation metric: F1 score of 0.5-thresholded predictions.

    Parameters:
    - preds: model predictions. Passed via `custom_metric` with a
      built-in objective, these are transformed outputs (probabilities
      for binary:logistic), so a 0.5 threshold is meaningful.
    - dtrain: the DMatrix being evaluated (provides the labels)

    Returns:
    - (metric_name, metric_value) tuple
    """
    labels = dtrain.get_label()
    # Threshold probabilities into hard labels, then score with F1
    preds_binary = (preds > 0.5).astype(int)
    f1 = f1_score(labels, preds_binary)
    return 'f1', f1
# Use the custom metric. NOTE: `custom_metric` (not the deprecated
# `feval`) is required here — with `feval`, binary:logistic predictions
# arrive as raw margin scores, so thresholding them at 0.5 would be wrong.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtest, 'eval')],
    custom_metric=custom_eval
)
训练可视化 #
训练曲线 #
python
import matplotlib.pyplot as plt
# Train silently while recording per-round metrics
evals_result = {}
model = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dtest, 'eval')],
    evals_result=evals_result,
    verbose_eval=False  # suppress per-round console output
)
# Plot the learning curves side by side
plt.figure(figsize=(12, 4))
# Left panel: log loss on train and eval sets
plt.subplot(1, 2, 1)
plt.plot(evals_result['train']['logloss'], label='Train')
plt.plot(evals_result['eval']['logloss'], label='Eval')
plt.xlabel('Round')
plt.ylabel('Log Loss')
plt.title('Training Log Loss')
plt.legend()
plt.grid(True)
# Right panel: AUC (only recorded if 'auc' was among the eval metrics)
plt.subplot(1, 2, 2)
if 'auc' in evals_result['train']:
    plt.plot(evals_result['train']['auc'], label='Train')
    plt.plot(evals_result['eval']['auc'], label='Eval')
    plt.xlabel('Round')
    plt.ylabel('AUC')
    plt.title('Training AUC')
    plt.legend()
    plt.grid(True)
plt.tight_layout()
plt.show()
特征重要性可视化 #
python
# Plot feature importance under three different definitions
fig, axes = plt.subplots(1, 3, figsize=(15, 6))
# 'weight': how many times a feature is used in a split
xgb.plot_importance(model, importance_type='weight', ax=axes[0], max_num_features=10)
axes[0].set_title('Feature Importance (Weight)')
# 'gain': average loss reduction gained from splits on the feature
xgb.plot_importance(model, importance_type='gain', ax=axes[1], max_num_features=10)
axes[1].set_title('Feature Importance (Gain)')
# 'cover': average number of samples affected by splits on the feature
xgb.plot_importance(model, importance_type='cover', ax=axes[2], max_num_features=10)
axes[2].set_title('Feature Importance (Cover)')
plt.tight_layout()
plt.show()
树可视化 #
python
# Draw a single tree, left-to-right layout.
# NOTE(review): xgb.plot_tree requires the graphviz package — confirm
# it is installed in the target environment.
plt.figure(figsize=(20, 10))
xgb.plot_tree(model, num_trees=0, rankdir='LR')
plt.show()
# Draw the first four trees in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
for i, ax in enumerate(axes.flat):
    xgb.plot_tree(model, num_trees=i, ax=ax, rankdir='LR')
    ax.set_title(f'Tree {i}')
plt.tight_layout()
plt.show()
模型保存与加载 #
保存模型 #
python
# Method 1: save in the JSON format (recommended; stable across versions)
model.save_model('model.json')
# Method 2: save in the internal binary format (deprecated in newer
# XGBoost releases — prefer JSON or UBJSON)
model.save_model('model.bin')
# Method 3: export a human-readable text dump of the trees.
# NOTE: save_model() has no text format; dump_model() writes the text
# dump. The dump is for inspection only and cannot be loaded back.
model.dump_model('model.txt')
# Method 4: persist the sklearn-API estimator with joblib
import joblib
joblib.dump(clf, 'xgboost_model.joblib')
# Method 5: pickle the Booster object
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
加载模型 #
python
# Method 1: load the JSON format into a fresh Booster
loaded_model = xgb.Booster()
loaded_model.load_model('model.json')
# Method 2: load the binary format
loaded_model = xgb.Booster()
loaded_model.load_model('model.bin')
# Method 3: restore a sklearn-API estimator with joblib
clf = joblib.load('xgboost_model.joblib')
# Method 4: unpickle a Booster.
# NOTE: only unpickle files from trusted sources — pickle can execute
# arbitrary code on load.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
保存训练状态 #
python
# Persist both the model and the recorded training history
import json
model.save_model('model.json')
with open('training_history.json', 'w') as f:
    json.dump(evals_result, f)
# Load both back
model = xgb.Booster()
model.load_model('model.json')
with open('training_history.json', 'r') as f:
    evals_result = json.load(f)
模型信息 #
获取模型信息 #
python
# Attributes set by training with early stopping
print(f"最佳迭代: {model.best_iteration}")
print(f"最佳评分: {model.best_score}")
# Feature importance under each definition (dict: feature name -> score)
importance_weight = model.get_score(importance_type='weight')
importance_gain = model.get_score(importance_type='gain')
importance_cover = model.get_score(importance_type='cover')
print("特征重要性 (Gain):")
# Sort features by descending gain
for feat, score in sorted(importance_gain.items(), key=lambda x: x[1], reverse=True):
    print(f" {feat}: {score:.4f}")
# Feature names stored on the booster (None if none were provided)
print(f"特征名称: {model.feature_names}")
# Number of trees in the ensemble (one dump string per tree)
print(f"树的数量: {len(model.get_dump())}")
导出模型 #
python
# Text dump: print the first three trees
dump = model.get_dump()
for i, tree in enumerate(dump[:3]):
    print(f"Tree {i}:")
    print(tree)
    print()
# Dump including per-split statistics (gain and cover)
dump_with_stats = model.get_dump(with_stats=True)
print(dump_with_stats[0])
# Booster configuration as a JSON string (hyperparameters and internal
# settings — this is not the serialized model itself)
config = model.save_config()
print(config)
训练回调 #
自定义回调 #
python
from xgboost.callback import TrainingCallback
class CustomCallback(TrainingCallback):
    """Track the best score of one metric on the 'eval' set and
    announce every improvement as training progresses."""
    def __init__(self, metric_name='logloss'):
        self.metric_name = metric_name
        self.best_score = float('inf')   # lower is better
        self.best_iteration = 0
    def after_iteration(self, model, epoch, evals_log):
        # Latest value of the tracked metric, if it was recorded.
        eval_history = evals_log.get('eval', {})
        if self.metric_name in eval_history:
            current_score = eval_history[self.metric_name][-1]
            if current_score < self.best_score:
                self.best_score = current_score
                self.best_iteration = epoch
                print(f"New best score: {current_score:.4f} at iteration {epoch}")
        return False  # returning True would stop training
# Attach the custom callback to a training run
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train'), (dtest, 'eval')],
    callbacks=[CustomCallback()]
)
内置回调 #
python
from xgboost.callback import (
    EarlyStopping,
    LearningRateScheduler,
    EvaluationMonitor
)
# Early-stopping callback: watch 'logloss' on the 'eval' set; keep the
# best model when training stops (save_best=True)
early_stop = EarlyStopping(
    rounds=50,
    metric_name='logloss',
    data_name='eval',
    save_best=True
)
# Learning-rate schedule: exponential decay starting from 0.1
def lr_schedule(epoch):
    return 0.1 * (0.99 ** epoch)
lr_scheduler = LearningRateScheduler(lr_schedule)
# Print evaluation results every 10 rounds (rank 0 only, for
# distributed runs)
eval_monitor = EvaluationMonitor(rank=0, period=10)
# Combine all three callbacks in a single training run
model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dtrain, 'train'), (dtest, 'eval')],
    callbacks=[early_stop, lr_scheduler, eval_monitor]
)
完整训练流程 #
python
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
def train_xgboost_model():
    """End-to-end workflow: load data, train with early stopping,
    evaluate, visualize, and persist the model.

    Returns:
        (model, evals_result): the trained Booster and the recorded
        per-round evaluation history.
    """
    # 1. Load the binary classification dataset and split 80/20
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )
    # 2. Build DMatrix containers; naming features improves plots/dumps
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=data.feature_names)
    dtest = xgb.DMatrix(X_test, label=y_test, feature_names=data.feature_names)
    # 3. Parameters: logistic objective, two metrics, row/column
    #    subsampling, L2 (lambda) and L1 (alpha) regularization
    params = {
        'objective': 'binary:logistic',
        'eval_metric': ['logloss', 'auc'],
        'max_depth': 6,
        'eta': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'lambda': 1,
        'alpha': 0,
        'seed': 42
    }
    # 4. Train with early stopping while recording the metric history
    evals_result = {}
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=[(dtrain, 'train'), (dtest, 'eval')],
        early_stopping_rounds=50,
        evals_result=evals_result,
        verbose_eval=10
    )
    # 5. Predict probabilities, threshold at 0.5 for hard labels
    y_pred_proba = model.predict(dtest)
    y_pred = (y_pred_proba > 0.5).astype(int)
    # 6. Report evaluation metrics
    print(f"\n准确率: {accuracy_score(y_test, y_pred):.4f}")
    print(f"AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))
    # 7. Visualize: loss curves (left) and feature importance (right)
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].plot(evals_result['train']['logloss'], label='Train')
    axes[0].plot(evals_result['eval']['logloss'], label='Eval')
    axes[0].set_xlabel('Round')
    axes[0].set_ylabel('Log Loss')
    axes[0].legend()
    axes[0].grid(True)
    xgb.plot_importance(model, max_num_features=10, ax=axes[1])
    plt.tight_layout()
    plt.show()
    # 8. Persist the trained model in the portable JSON format
    model.save_model('model.json')
    return model, evals_result
model, evals_result = train_xgboost_model()
下一步 #
现在你已经掌握了训练与评估,接下来学习 特征工程 进一步提升模型性能!
最后更新:2026-04-04