交叉验证 #
交叉验证概述 #
为什么需要交叉验证? #
text
┌─────────────────────────────────────────────────────────────┐
│ 交叉验证的作用 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 单次划分的问题: │
│ - 结果依赖于划分方式 │
│ - 可能高估或低估模型性能 │
│ - 无法评估模型稳定性 │
│ │
│ 交叉验证的优势: │
│ - 更可靠的性能估计 │
│ - 充分利用数据 │
│ - 评估模型稳定性 │
│ - 减少过拟合风险 │
│ │
└─────────────────────────────────────────────────────────────┘
交叉验证类型 #
text
┌─────────────────────────────────────────────────────────────┐
│ 交叉验证类型 │
├─────────────────────────────────────────────────────────────┤
│ │
│ K-Fold CV │
│ ├── 将数据分成 K 份 │
│ ├── 每份轮流作为验证集 │
│ └── 适用于一般场景 │
│ │
│ Stratified K-Fold CV │
│ ├── 保持每折的类别比例 │
│ ├── 适用于类别不平衡 │
│ └── 分类任务首选 │
│ │
│ Time Series CV │
│ ├── 保持时间顺序 │
│ ├── 训练集在验证集之前 │
│ └── 适用于时间序列数据 │
│ │
│ Group K-Fold │
│ ├── 同一组样本不在训练和验证集中同时出现 │
│ └── 适用于分组数据 │
│ │
└─────────────────────────────────────────────────────────────┘
XGBoost 内置交叉验证 #
xgb.cv 函数 #
python
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score
# Load the binary-classification demo dataset (breast cancer).
data = load_breast_cancer()
X, y = data.data, data.target
dtrain = xgb.DMatrix(X, label=y)

# Parameters: logistic objective evaluated with log loss.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.1
}

# 5-fold stratified cross-validation with early stopping:
# boosting stops once test log loss fails to improve for 10 rounds.
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=100,
    nfold=5,
    stratified=True,
    early_stopping_rounds=10,
    verbose_eval=10,
    seed=42
)

# With early stopping, cv_results is truncated at the best round,
# so its length equals the best number of boosting iterations.
print(f"\n最佳迭代次数: {len(cv_results)}")
print(f"平均 Log Loss: {cv_results['test-logloss-mean'].min():.4f}")
xgb.cv 参数详解 #
python
# Full xgb.cv call, with every parameter annotated.
cv_results = xgb.cv(
    params,                      # parameter dict
    dtrain,                      # training data (DMatrix)
    num_boost_round=100,         # number of boosting rounds
    nfold=5,                     # number of folds
    stratified=True,             # stratified sampling
    folds=None,                  # custom fold splits
    metrics=(),                  # evaluation metrics
    early_stopping_rounds=None,  # early-stopping patience
    verbose_eval=None,           # print frequency
    show_stdv=True,              # show standard deviation
    seed=42,                     # random seed
    shuffle=True,                # shuffle before splitting
    callbacks=None               # callback functions
)
自定义折划分 #
python
from sklearn.model_selection import StratifiedKFold

# Stratified K-fold splitter: keeps the class ratio in every fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Pass custom folds to xgb.cv. `folds` expects a list of
# (train_indices, test_indices) pairs; StratifiedKFold.split() returns a
# one-shot generator, so materialize it with list() before handing it over.
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=100,
    folds=list(skf.split(X, y)),
    early_stopping_rounds=10,
    verbose_eval=10
)
Scikit-Learn 交叉验证 #
cross_val_score #
python
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, cross_validate

# Build an sklearn-API XGBoost classifier.
clf = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='binary:logistic',
    random_state=42
)

# Single-metric cross-validation: 5 folds scored with ROC AUC.
scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
print(f"AUC: {scores.mean():.4f} (+/- {scores.std():.4f})")

# Multi-metric cross-validation; also records training-set scores.
scoring = ['roc_auc', 'accuracy', 'f1']
cv_results = cross_validate(clf, X, y, cv=5, scoring=scoring, return_train_score=True)
for metric in scoring:
    # cross_validate keys test scores as 'test_<metric>'.
    test_key = f'test_{metric}'
    print(f"{metric}: {cv_results[test_key].mean():.4f} (+/- {cv_results[test_key].std():.4f})")
K-Fold 交叉验证 #
python
from sklearn.model_selection import KFold

# Plain K-fold (does NOT preserve class ratios — see Stratified K-Fold below).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=kf, scoring='roc_auc')
print(f"K-Fold AUC: {scores.mean():.4f} (+/- {scores.std():.4f})")

# Manual K-fold loop using the native xgb.train API.
# roc_auc_score comes from sklearn.metrics (imported at the top of the file).
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    model = xgb.train(params, dtrain, num_boost_round=100)
    y_pred = model.predict(dval)
    score = roc_auc_score(y_val, y_pred)
    print(f"Fold {fold}: AUC = {score:.4f}")
分层 K-Fold 交叉验证 #
python
from sklearn.model_selection import StratifiedKFold

# Stratified K-fold: each fold keeps the overall class ratio.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=skf, scoring='roc_auc')
print(f"Stratified K-Fold AUC: {scores.mean():.4f} (+/- {scores.std():.4f})")

# Manual implementation with the sklearn-API classifier; note that
# skf.split needs y to stratify on the labels.
fold_scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    clf = XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
    clf.fit(X_train, y_train)
    # Use the positive-class probability for AUC.
    y_pred = clf.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, y_pred)
    fold_scores.append(score)
    print(f"Fold {fold}: AUC = {score:.4f}")
print(f"\n平均 AUC: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})")
重复分层 K-Fold #
python
from sklearn.model_selection import RepeatedStratifiedKFold

# Repeated stratified K-fold: 5 folds repeated 3 times (15 fits total)
# for a more stable estimate of the score's mean and spread.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
scores = cross_val_score(clf, X, y, cv=rskf, scoring='roc_auc')
print(f"Repeated Stratified K-Fold AUC: {scores.mean():.4f} (+/- {scores.std():.4f})")
时间序列交叉验证 #
TimeSeriesSplit #
python
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd

# Synthetic daily time-series data (features and labels are random,
# so the AUC here is only illustrative).
dates = pd.date_range('2024-01-01', periods=1000, freq='D')
X_ts = np.random.randn(1000, 10)
y_ts = np.random.randint(0, 2, 1000)

# Expanding-window splits: the training set always precedes the
# validation set in time and grows with each fold.
tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_ts)):
    print(f"Fold {fold}:")
    print(f" 训练集: {len(train_idx)} 样本")
    print(f" 验证集: {len(val_idx)} 样本")
    X_train, X_val = X_ts[train_idx], X_ts[val_idx]
    y_train, y_val = y_ts[train_idx], y_ts[val_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    model = xgb.train(params, dtrain, num_boost_round=100)
    y_pred = model.predict(dval)
    score = roc_auc_score(y_val, y_pred)
    print(f" AUC: {score:.4f}")
滚动窗口验证 #
python
def rolling_window_cv(X, y, window_size, step_size, n_splits=5):
    """Generate rolling-window train/validation index pairs.

    A fixed-size training window of ``window_size`` samples slides forward
    by ``step_size`` per split; the ``step_size`` samples immediately after
    the window form the validation slice.

    Args:
        X: feature array (only its length is used).
        y: label array (unused; kept for a splitter-like signature).
        window_size: number of samples in each training window.
        step_size: stride of the window and size of each validation slice.
        n_splits: maximum number of splits to yield.

    Yields:
        (train_idx, val_idx) pairs of numpy index arrays. Generation stops
        early once the validation slice would run past the end of the data.
    """
    total = len(X)
    split = 0
    while split < n_splits:
        start = split * step_size
        train_stop = start + window_size
        val_stop = train_stop + step_size
        if val_stop > total:
            break
        yield np.arange(start, train_stop), np.arange(train_stop, val_stop)
        split += 1
# Usage example: 1000 samples, 500-sample training window, stride 100.
for fold, (train_idx, val_idx) in enumerate(rolling_window_cv(X_ts, y_ts, 500, 100)):
    print(f"Fold {fold}: Train={len(train_idx)}, Val={len(val_idx)}")
分组交叉验证 #
GroupKFold #
python
from sklearn.model_selection import GroupKFold

# Grouped data: 4 distinct groups, each sample carries a group id.
groups = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 50)
X_group = np.random.randn(600, 10)
y_group = np.random.randint(0, 2, 600)

# Group K-fold: all samples of a group land in the same fold, so a group
# never appears in both the training and validation sets.
gkf = GroupKFold(n_splits=4)
for fold, (train_idx, val_idx) in enumerate(gkf.split(X_group, y_group, groups)):
    print(f"Fold {fold}:")
    print(f" 训练组: {np.unique(groups[train_idx])}")
    print(f" 验证组: {np.unique(groups[val_idx])}")
早停与交叉验证 #
正确使用早停 #
python
def cv_with_early_stopping(X, y, params, n_splits=5):
    """Stratified K-fold CV where early stopping runs inside each fold.

    Each fold trains against its own validation set for early stopping, so
    the stopping decision never leaks information across folds.

    Args:
        X: feature matrix.
        y: binary labels.
        params: xgb.train parameter dict.
        n_splits: number of stratified folds.

    Returns:
        (fold_scores, best_iterations): per-fold AUC values and the best
        boosting round found in each fold.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []
    best_iterations = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, y)):
        dtrain = xgb.DMatrix(X[train_idx], label=y[train_idx])
        dval = xgb.DMatrix(X[val_idx], label=y[val_idx])

        # Early stopping is applied per fold, against this fold's own
        # validation set.
        evals_result = {}
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, 'train'), (dval, 'eval')],
            early_stopping_rounds=50,
            evals_result=evals_result,
            verbose_eval=False
        )

        score = roc_auc_score(y[val_idx], model.predict(dval))
        fold_scores.append(score)
        best_iterations.append(model.best_iteration)
        print(f"Fold {fold}: AUC = {score:.4f}, Best Iteration = {model.best_iteration}")

    print(f"\n平均 AUC: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})")
    print(f"平均最佳迭代: {np.mean(best_iterations):.0f}")
    return fold_scores, best_iterations
# Usage example: logistic objective evaluated with log loss.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.1
}
fold_scores, best_iterations = cv_with_early_stopping(X, y, params)
嵌套交叉验证 #
python
from sklearn.model_selection import GridSearchCV, cross_val_score
def nested_cv(X, y, param_grid, outer_cv=5, inner_cv=3):
    """Nested cross-validation for an unbiased performance estimate.

    The outer loop estimates generalization performance; the inner loop
    (a grid search) selects hyper-parameters using only the outer training
    split, so the outer test fold never influences model selection.

    Args:
        X: feature matrix.
        y: binary labels.
        param_grid: grid of XGBClassifier hyper-parameters.
        outer_cv: number of outer folds (performance estimation).
        inner_cv: number of inner folds (hyper-parameter selection).

    Returns:
        List of per-outer-fold AUC scores.
    """
    outer_scores = []
    outer_splitter = StratifiedKFold(n_splits=outer_cv, shuffle=True, random_state=42)

    for fold, (train_idx, test_idx) in enumerate(outer_splitter.split(X, y)):
        X_tr, X_te = X[train_idx], X[test_idx]
        y_tr, y_te = y[train_idx], y[test_idx]

        # Inner CV: grid-search hyper-parameters on the training part only.
        search = GridSearchCV(
            XGBClassifier(objective='binary:logistic', random_state=42),
            param_grid,
            cv=StratifiedKFold(n_splits=inner_cv, shuffle=True, random_state=42),
            scoring='roc_auc',
            n_jobs=-1
        )
        search.fit(X_tr, y_tr)

        # Outer evaluation: score the tuned model on the held-out fold.
        proba = search.best_estimator_.predict_proba(X_te)[:, 1]
        auc = roc_auc_score(y_te, proba)
        outer_scores.append(auc)
        print(f"Outer Fold {fold}: AUC = {auc:.4f}")
        print(f" Best params: {search.best_params_}")

    print(f"\n嵌套 CV 平均 AUC: {np.mean(outer_scores):.4f} (+/- {np.std(outer_scores):.4f})")
    return outer_scores
# Usage example: a small grid over depth, learning rate and tree count.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200]
}
nested_scores = nested_cv(X, y, param_grid)
交叉验证可视化 #
python
import matplotlib.pyplot as plt
def plot_cv_results(cv_results, fold_scores=None):
    """Visualize cross-validation results.

    Left panel: mean train/test log loss per boosting round with a
    +/- 1 std band. Right panel: per-fold scores against their mean.

    Args:
        cv_results: DataFrame returned by xgb.cv, with
            '{train,test}-logloss-{mean,std}' columns.
        fold_scores: optional sequence of per-fold scores (e.g. AUC).
            The original version read a module-level global of this name,
            which silently coupled the plot to an unrelated section; it is
            now an explicit parameter. When omitted, the second panel is
            left empty.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Learning curves with a +/- 1 std shaded band per split.
    axes[0].plot(cv_results['train-logloss-mean'], label='Train')
    axes[0].plot(cv_results['test-logloss-mean'], label='Test')
    for split in ('train', 'test'):
        mean = cv_results[f'{split}-logloss-mean']
        std = cv_results[f'{split}-logloss-std']
        axes[0].fill_between(range(len(cv_results)), mean - std, mean + std, alpha=0.2)
    axes[0].set_xlabel('Round')
    axes[0].set_ylabel('Log Loss')
    axes[0].set_title('Cross-Validation Learning Curve')
    axes[0].legend()
    axes[0].grid(True)

    # Per-fold score distribution (only when fold scores are supplied).
    if fold_scores is not None:
        axes[1].bar(range(len(fold_scores)), fold_scores)
        axes[1].axhline(np.mean(fold_scores), color='r', linestyle='--', label=f'Mean: {np.mean(fold_scores):.4f}')
        axes[1].set_xlabel('Fold')
        axes[1].set_ylabel('AUC')
        axes[1].set_title('Fold Scores')
        axes[1].legend()
    axes[1].grid(True)

    plt.tight_layout()
    plt.show()
# Usage example: run 5-fold stratified CV and plot the learning curves.
cv_results = xgb.cv(params, dtrain, num_boost_round=100, nfold=5, stratified=True)
plot_cv_results(cv_results)
交叉验证最佳实践 #
python
def cross_validation_best_practices():
    """Print a checklist of cross-validation best practices.

    Items are grouped into four categories (data preparation, validation
    strategy, early-stopping usage, result reporting) and printed as a
    bulleted list, one category per section.
    """
    checklist = (
        ('数据准备', (
            '确保数据没有泄露',
            '正确处理类别不平衡',
            '在 CV 内部进行特征工程',
        )),
        ('验证策略', (
            '分类任务使用 Stratified K-Fold',
            '时间序列使用 TimeSeriesSplit',
            '分组数据使用 GroupKFold',
        )),
        ('早停使用', (
            '早停应在每一折内部使用',
            '使用验证集确定最佳迭代次数',
            '最终模型使用平均迭代次数',
        )),
        ('结果报告', (
            '报告均值和标准差',
            '使用嵌套 CV 进行无偏估计',
            '检查各折结果的一致性',
        )),
    )
    for heading, bullets in checklist:
        print(f"\n{heading}:")
        for bullet in bullets:
            print(f" • {bullet}")

# Print the checklist.
cross_validation_best_practices()
下一步 #
现在你已经掌握了交叉验证,接下来学习 分布式训练 了解如何处理大规模数据!
最后更新:2026-04-04