调参技巧 #
调参概述 #
参数调优的重要性 #
text
┌─────────────────────────────────────────────────────────────┐
│ 参数调优的重要性 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 默认参数 → 调优后参数 │
│ │
│ 准确率: 85% → 92% │
│ AUC: 0.88 → 0.95 │
│ 训练时间: 10min → 5min │
│ │
│ 调优可以显著提升模型性能! │
│ │
└─────────────────────────────────────────────────────────────┘
调参策略 #
text
Step 1: 固定学习率,调整树参数
- max_depth
- min_child_weight
Step 2: 调整 gamma
- gamma: [0, 0.1, 0.2, 0.5, 1]
Step 3: 调整采样参数
- subsample
- colsample_bytree
Step 4: 调整正则化参数
- lambda
- alpha
Step 5: 降低学习率,增加迭代次数
- eta: 0.01
- n_estimators: 5000
网格搜索 #
基本网格搜索 #
python
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset and hold out 20% as a test set.
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Candidate values for the three parameters being tuned.
param_grid = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'learning_rate': [0.01, 0.1, 0.3],
}

# Exhaustive search over the grid, scored by ROC AUC with 5-fold CV.
base_clf = XGBClassifier(n_estimators=100, objective='binary:logistic', random_state=42)
searcher = GridSearchCV(
    estimator=base_clf,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
searcher.fit(X_train, y_train)

print(f"最佳参数: {searcher.best_params_}")
print(f"最佳分数: {searcher.best_score_:.4f}")
分步网格搜索 #
python
def stepwise_grid_search(X_train, y_train):
    """
    Stepwise (coordinate-descent style) grid search for XGBClassifier.

    Tunes one group of hyper-parameters at a time, freezing the best
    values found so far before searching the next group:
      1. max_depth / min_child_weight
      2. gamma
      3. subsample / colsample_bytree
      4. reg_alpha / reg_lambda

    Parameters
    ----------
    X_train, y_train : training features and labels.

    Returns
    -------
    dict mapping every tuned parameter name to its best value.
    """
    # Parameter groups searched in order. The original code duplicated the
    # estimator construction four times; here each step reuses one loop and
    # feeds the winners found so far back into the estimator.
    steps = [
        {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5, 7]},
        {'gamma': [0, 0.1, 0.2, 0.5, 1]},
        {'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0]},
        {'reg_alpha': [0, 0.001, 0.01, 0.1, 1], 'reg_lambda': [0.1, 1, 10]},
    ]
    best_params = {}
    for step_no, param_grid in enumerate(steps, start=1):
        # Rebuild the estimator with every parameter fixed so far.
        clf = XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            objective='binary:logistic',
            random_state=42,
            **best_params,
        )
        search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        search.fit(X_train, y_train)
        best_params.update(search.best_params_)
        if step_no < len(steps):
            print(f"Step {step_no} 最佳参数: {best_params}")
    print(f"最终最佳参数: {best_params}")
    return best_params
best_params = stepwise_grid_search(X_train, y_train)
随机搜索 #
基本随机搜索 #
python
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Distributions to sample from. Note scipy's uniform(loc, scale) covers
# the interval [loc, loc + scale], e.g. uniform(0.6, 0.4) -> [0.6, 1.0].
search_space = {
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'gamma': uniform(0, 1),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'learning_rate': uniform(0.01, 0.3),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 10),
}

# Sample 100 random configurations, each scored with 5-fold CV ROC AUC.
estimator = XGBClassifier(n_estimators=100, objective='binary:logistic', random_state=42)
rand_search = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=search_space,
    n_iter=100,  # number of sampled configurations
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rand_search.fit(X_train, y_train)

print(f"最佳参数: {rand_search.best_params_}")
print(f"最佳分数: {rand_search.best_score_:.4f}")
贝叶斯优化 #
使用 Optuna #
python
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score


def objective(trial):
    """Optuna objective: mean 5-fold CV ROC AUC for one sampled config."""
    sampled_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
        'objective': 'binary:logistic',
        'random_state': 42,
        'n_jobs': -1,
    }
    candidate = XGBClassifier(**sampled_params)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='roc_auc')
    return fold_scores.mean()


# Run TPE optimization for 100 trials, maximizing CV AUC.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f"最佳参数: {study.best_params}")
print(f"最佳分数: {study.best_value:.4f}")

# Retrain on the full training set with the winning configuration.
best_clf = XGBClassifier(**study.best_params, objective='binary:logistic', random_state=42)
best_clf.fit(X_train, y_train)
Optuna 高级用法 #
python
import optuna
import xgboost as xgb  # was missing: block uses xgb.DMatrix / xgb.train
from optuna.integration import XGBoostPruningCallback
from sklearn.metrics import roc_auc_score  # was missing: used for the returned score


def objective_with_pruning(trial):
    """
    Optuna objective using the native xgboost API so unpromising trials
    can be pruned mid-training via XGBoostPruningCallback.

    Returns the test-set ROC AUC of the early-stopped booster.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'random_state': 42,
    }
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # Prune on the metric reported for the eval set named 'test' below.
    pruning_callback = XGBoostPruningCallback(trial, 'test-logloss')
    evals_result = {}
    model = xgb.train(
        params,
        dtrain,
        # Fix: the original put 'n_estimators' in `params`, which
        # xgb.train() ignores; the round count must be num_boost_round.
        num_boost_round=1000,
        evals=[(dtest, 'test')],
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
        evals_result=evals_result,
        verbose_eval=False,
    )
    y_pred = model.predict(dtest)
    return roc_auc_score(y_test, y_pred)


study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective_with_pruning, n_trials=100)
可视化优化过程 #
python
import optuna

# Inspect the finished study with Optuna's built-in plotting helpers.
history_fig = optuna.visualization.plot_optimization_history(study)   # score vs. trial
importance_fig = optuna.visualization.plot_param_importances(study)   # which params matter
slice_fig = optuna.visualization.plot_slice(study)                    # score vs. each param
parallel_fig = optuna.visualization.plot_parallel_coordinate(study)   # joint view of params
Hyperopt #
使用 Hyperopt #
python
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score


def objective(params):
    """Hyperopt objective: negated mean 5-fold CV AUC (hyperopt minimizes)."""
    # hp.quniform yields floats; cast the integer-valued parameters back.
    for key in ('max_depth', 'min_child_weight', 'n_estimators'):
        params[key] = int(params[key])
    model = XGBClassifier(
        **params,
        objective='binary:logistic',
        random_state=42,
        n_jobs=-1,
    )
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return {'loss': -cv_scores.mean(), 'status': STATUS_OK}


# Search space. loguniform bounds are in log space, e.g. e**-4 .. e**-1
# for learning_rate (~0.018 .. ~0.37).
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'gamma': hp.uniform('gamma', 0, 1),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'learning_rate': hp.loguniform('learning_rate', -4, -1),
    'reg_alpha': hp.loguniform('reg_alpha', -8, 2),
    'reg_lambda': hp.loguniform('reg_lambda', -8, 2),
}

# Run 100 evaluations of TPE search, keeping the trial history.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)
print(f"最佳参数: {best}")
调参最佳实践 #
参数搜索范围 #
python
# Recommended search ranges for the most commonly tuned XGBoost parameters.
param_ranges = {
    # Tree-structure parameters
    'max_depth': [3, 5, 7, 9, 12],
    'min_child_weight': [1, 3, 5, 7, 10],
    # Learning-rate / split-gain parameters
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    'gamma': [0, 0.1, 0.2, 0.5, 1, 2],
    # Row / column sampling parameters
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    # L1 / L2 regularization parameters
    'reg_alpha': [0, 0.001, 0.01, 0.1, 1, 10],
    'reg_lambda': [0.1, 1, 5, 10, 50, 100]
}
调参检查清单 #
python
def tuning_checklist():
    """Print a step-by-step checklist for tuning an XGBoost model."""
    # Ordered (section, tasks) pairs; list order is the display order.
    sections = [
        ('数据准备', [
            '检查数据质量',
            '处理缺失值',
            '特征编码',
            '划分训练/验证/测试集',
        ]),
        ('基准模型', [
            '使用默认参数训练',
            '记录基准性能',
            '确定优化目标',
        ]),
        ('参数调优', [
            '调整 max_depth 和 min_child_weight',
            '调整 gamma',
            '调整采样参数',
            '调整正则化参数',
            '降低学习率',
        ]),
        ('模型评估', [
            '交叉验证',
            '测试集评估',
            '检查过拟合',
        ]),
        ('最终检查', [
            '模型稳定性',
            '预测时间',
            '内存使用',
        ]),
    ]
    for section, tasks in sections:
        print(f"\n{section}:")
        for task in tasks:
            print(f" □ {task}")
自动化调参 #
python
def auto_tune_xgboost(X_train, y_train, X_test, y_test, n_trials=100):
    """
    Automatically tune an XGBClassifier with Optuna.

    Runs `n_trials` TPE trials maximizing test-set ROC AUC (each trial
    trains with early stopping on the test set), then refits a final
    model on the full training set with the best configuration.

    Parameters
    ----------
    X_train, y_train : training data.
    X_test, y_test   : held-out data used for early stopping and scoring.
                       NOTE(review): reusing the test set for tuning leaks
                       information; prefer a separate validation split.
    n_trials : number of Optuna trials (default 100).

    Returns
    -------
    (final_model, best_params, best_value)
    """
    import optuna
    from sklearn.metrics import roc_auc_score  # was missing in the original

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 0, 1),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'random_state': 42,
            'n_jobs': -1,
            # Fix: early_stopping_rounds is a constructor argument; passing
            # it to fit() was removed in xgboost >= 2.0 and raises TypeError.
            'early_stopping_rounds': 50,
        }
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        y_score = clf.predict_proba(X_test)[:, 1]
        return roc_auc_score(y_test, y_score)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # Refit on the training data with the winning hyper-parameters.
    # Copy first so the study's own best_params dict is not mutated.
    best_params = dict(study.best_params)
    best_params.update(objective='binary:logistic', random_state=42, n_jobs=-1)
    final_model = XGBClassifier(**best_params)
    final_model.fit(X_train, y_train)
    return final_model, study.best_params, study.best_value
# 使用示例
model, best_params, best_score = auto_tune_xgboost(X_train, y_train, X_test, y_test)
print(f"最佳参数: {best_params}")
print(f"最佳分数: {best_score:.4f}")
下一步 #
现在你已经掌握了调参技巧,接下来学习 交叉验证 了解如何正确评估模型性能!
最后更新:2026-04-04