调参技巧 #

调参概述 #

参数调优的重要性 #

text
┌─────────────────────────────────────────────────────────────┐
│                    参数调优的重要性                          │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  默认参数 → 调优后参数                                       │
│                                                              │
│  准确率: 85% → 92%                                           │
│  AUC: 0.88 → 0.95                                            │
│  训练时间: 10min → 5min                                      │
│                                                              │
│  调优可以显著提升模型性能!                                   │
│                                                              │
└─────────────────────────────────────────────────────────────┘

调参策略 #

text
Step 1: 固定学习率,调整树参数
        - max_depth
        - min_child_weight
        
Step 2: 调整 gamma
        - gamma: [0, 0.1, 0.2, 0.5, 1]
        
Step 3: 调整采样参数
        - subsample
        - colsample_bytree
        
Step 4: 调整正则化参数
        - lambda
        - alpha
        
Step 5: 降低学习率,增加迭代次数
        - eta: 0.01
        - n_estimators: 5000

网格搜索 #

基本网格搜索 #

python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier

# Load the breast-cancer dataset and hold out 20% as a test split.
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.2, random_state=42
)

# Candidate values for the three hyper-parameters being tuned.
search_space = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'learning_rate': [0.01, 0.1, 0.3]
}

# Exhaustively evaluate every combination with 5-fold CV on ROC-AUC.
base_model = XGBClassifier(n_estimators=100, objective='binary:logistic', random_state=42)

search = GridSearchCV(
    estimator=base_model,
    param_grid=search_space,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1
)
search.fit(X_train, y_train)

print(f"最佳参数: {search.best_params_}")
print(f"最佳分数: {search.best_score_:.4f}")

分步网格搜索 #

python
def stepwise_grid_search(X_train, y_train):
    """
    Stepwise grid search for XGBoost hyper-parameters.

    Tunes parameter groups one at a time (tree structure -> gamma ->
    sampling -> regularization), carrying each step's winners into the
    next step as fixed parameters.  Much cheaper than a single joint
    grid over all parameters, at the cost of ignoring interactions.

    Parameters
    ----------
    X_train, y_train : training features and labels.

    Returns
    -------
    dict : accumulated best parameters from all four steps.
    """

    def _search_step(param_grid, fixed_params):
        # One GridSearchCV round: `fixed_params` are held constant,
        # `param_grid` is searched; returns the winning values only.
        clf = XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            objective='binary:logistic',
            random_state=42,
            **fixed_params
        )
        search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        search.fit(X_train, y_train)
        return search.best_params_

    best_params = {}

    # Step 1: tree-structure parameters dominate model complexity.
    best_params.update(_search_step(
        {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5, 7]},
        best_params
    ))
    print(f"Step 1 最佳参数: {best_params}")

    # Step 2: gamma (minimum loss reduction required to make a split).
    best_params.update(_search_step(
        {'gamma': [0, 0.1, 0.2, 0.5, 1]},
        best_params
    ))
    print(f"Step 2 最佳参数: {best_params}")

    # Step 3: row/column subsampling adds randomness against overfitting.
    best_params.update(_search_step(
        {'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0]},
        best_params
    ))
    print(f"Step 3 最佳参数: {best_params}")

    # Step 4: L1 (reg_alpha) / L2 (reg_lambda) regularization strengths.
    best_params.update(_search_step(
        {'reg_alpha': [0, 0.001, 0.01, 0.1, 1], 'reg_lambda': [0.1, 1, 10]},
        best_params
    ))
    print(f"最终最佳参数: {best_params}")

    return best_params

best_params = stepwise_grid_search(X_train, y_train)

随机搜索 #

基本随机搜索 #

python
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

# Distributions to sample from.  NOTE: scipy's uniform(loc, scale)
# samples the interval [loc, loc + scale], so uniform(0.6, 0.4)
# covers [0.6, 1.0].
param_distributions = {
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'gamma': uniform(0, 1),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'learning_rate': uniform(0.01, 0.3),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 10)
}

# Sample 100 random configurations, scored by 5-fold CV ROC-AUC.
estimator = XGBClassifier(n_estimators=100, objective='binary:logistic', random_state=42)

sampler = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_distributions,
    n_iter=100,  # number of sampled configurations
    scoring='roc_auc',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=-1
)
sampler.fit(X_train, y_train)

print(f"最佳参数: {sampler.best_params_}")
print(f"最佳分数: {sampler.best_score_:.4f}")

贝叶斯优化 #

使用 Optuna #

python
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier


def objective(trial):
    """Optuna objective: mean 5-fold ROC-AUC for one sampled configuration."""
    # Sample every tunable parameter (log scale for rates/regularization,
    # where the effect is multiplicative rather than additive).
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
        'objective': 'binary:logistic',
        'random_state': 42,
        'n_jobs': -1
    }

    model = XGBClassifier(**sampled)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return cv_scores.mean()


# Maximize AUC over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f"最佳参数: {study.best_params}")
print(f"最佳分数: {study.best_value:.4f}")

# Retrain a model with the winning configuration.
best_clf = XGBClassifier(**study.best_params, objective='binary:logistic', random_state=42)
best_clf.fit(X_train, y_train)

Optuna 高级用法 #

python
import optuna
import xgboost as xgb  # bug fix: native API is used below but was never imported
from optuna.integration import XGBoostPruningCallback
from sklearn.metrics import roc_auc_score  # bug fix: was never imported


def objective_with_pruning(trial):
    """
    Optuna objective using the native xgboost API so that unpromising
    trials can be pruned mid-training via XGBoostPruningCallback.

    Returns the test-split ROC-AUC of the early-stopped model.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'random_state': 42
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Prunes a trial when its intermediate 'test-logloss' is worse than
    # the median of completed trials at the same boosting round.
    pruning_callback = XGBoostPruningCallback(trial, 'test-logloss')

    evals_result = {}
    model = xgb.train(
        params,
        dtrain,
        # bug fix: xgb.train ignores an 'n_estimators' entry in params;
        # the round budget must be passed as num_boost_round.
        num_boost_round=1000,
        evals=[(dtest, 'test')],
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
        evals_result=evals_result,
        verbose_eval=False
    )

    y_pred = model.predict(dtest)
    return roc_auc_score(y_test, y_pred)


study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective_with_pruning, n_trials=100)

可视化优化过程 #

python
import optuna

# NOTE(review): each plot_* call returns a plotly Figure object; outside a
# notebook the figure is discarded unless you call .show() on it — confirm
# the intended execution environment.

# Optimization history: best objective value vs. trial number
optuna.visualization.plot_optimization_history(study)

# Relative importance of each hyper-parameter to the objective
optuna.visualization.plot_param_importances(study)

# Slice plot: objective value against each parameter individually
optuna.visualization.plot_slice(study)

# Parallel-coordinate view of sampled parameter combinations
optuna.visualization.plot_parallel_coordinate(study)

Hyperopt #

使用 Hyperopt #

python
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import cross_val_score


def objective(params):
    """Hyperopt objective: negated mean CV ROC-AUC (hyperopt minimizes)."""
    # hp.quniform yields floats; XGBoost requires these three as ints.
    for key in ('max_depth', 'min_child_weight', 'n_estimators'):
        params[key] = int(params[key])

    model = XGBClassifier(
        **params,
        objective='binary:logistic',
        random_state=42,
        n_jobs=-1
    )
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return {'loss': -cv_scores.mean(), 'status': STATUS_OK}


# Search space.  hp.quniform(label, low, high, q) samples multiples of q;
# hp.loguniform(label, a, b) samples exp(U(a, b)), so (-4, -1) spans
# roughly 0.018 .. 0.37.
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'gamma': hp.uniform('gamma', 0, 1),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'learning_rate': hp.loguniform('learning_rate', -4, -1),
    'reg_alpha': hp.loguniform('reg_alpha', -8, 2),
    'reg_lambda': hp.loguniform('reg_lambda', -8, 2)
}

# Minimize the loss with the Tree-structured Parzen Estimator.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print(f"最佳参数: {best}")

调参最佳实践 #

参数搜索范围 #

python
# 推荐的参数搜索范围
param_ranges = {
    # 树参数
    'max_depth': [3, 5, 7, 9, 12],
    'min_child_weight': [1, 3, 5, 7, 10],
    
    # 学习参数
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    'gamma': [0, 0.1, 0.2, 0.5, 1, 2],
    
    # 采样参数
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    
    # 正则化参数
    'reg_alpha': [0, 0.001, 0.01, 0.1, 1, 10],
    'reg_lambda': [0.1, 1, 5, 10, 50, 100]
}

调参检查清单 #

python
def tuning_checklist():
    """Print a hyper-parameter tuning checklist, one category at a time.

    The Chinese category and item strings are the user-facing output and
    are reproduced verbatim.  Returns None.
    """
    checklist = {
        '数据准备': [
            '检查数据质量',
            '处理缺失值',
            '特征编码',
            '划分训练/验证/测试集'
        ],
        '基准模型': [
            '使用默认参数训练',
            '记录基准性能',
            '确定优化目标'
        ],
        '参数调优': [
            '调整 max_depth 和 min_child_weight',
            '调整 gamma',
            '调整采样参数',
            '调整正则化参数',
            '降低学习率'
        ],
        '模型评估': [
            '交叉验证',
            '测试集评估',
            '检查过拟合'
        ],
        '最终检查': [
            '模型稳定性',
            '预测时间',
            '内存使用'
        ]
    }

    # One header line per category, then every item as an unchecked box.
    for category, items in checklist.items():
        print(f"\n{category}:")
        print("\n".join(f"  □ {item}" for item in items))

自动化调参 #

python
def auto_tune_xgboost(X_train, y_train, X_test, y_test, n_trials=100):
    """
    Automated XGBoost tuning with Optuna.

    Runs `n_trials` trials maximizing test-split ROC-AUC (with early
    stopping on the same split), then refits a final model on the full
    training data using the best parameters found.

    Parameters
    ----------
    X_train, y_train : training split.
    X_test, y_test   : evaluation split used both for early stopping and
                       as the optimization target.  NOTE(review): this
                       leaks the test set into model selection — prefer a
                       separate validation split in production.
    n_trials : number of Optuna trials (default 100).

    Returns
    -------
    (final_model, best_params, best_value)
    """
    import optuna
    from sklearn.metrics import roc_auc_score  # bug fix: was never imported (NameError)

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 0, 1),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'random_state': 42,
            'n_jobs': -1,
            # bug fix: early_stopping_rounds was passed to fit(), a
            # keyword removed in xgboost >= 2.0; it belongs here.
            'early_stopping_rounds': 50
        }

        clf = XGBClassifier(**params)
        clf.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            verbose=False
        )

        y_pred = clf.predict_proba(X_test)[:, 1]
        return roc_auc_score(y_test, y_pred)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # Refit a final model with the winning configuration (best_params
    # holds only the suggested values, so re-add the fixed settings).
    best_params = study.best_params
    best_params['objective'] = 'binary:logistic'
    best_params['random_state'] = 42
    best_params['n_jobs'] = -1

    final_model = XGBClassifier(**best_params)
    final_model.fit(X_train, y_train)

    return final_model, study.best_params, study.best_value

# Usage example
model, best_params, best_score = auto_tune_xgboost(X_train, y_train, X_test, y_test)
print(f"最佳参数: {best_params}")
print(f"最佳分数: {best_score:.4f}")

下一步 #

现在你已经掌握了调参技巧,接下来学习 交叉验证 了解如何正确评估模型性能!

最后更新:2026-04-04