调参技巧 #
调参策略 #
调参顺序 #
text
┌─────────────────────────────────────────────────────────────┐
│ LightGBM 调参顺序 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 第一步:固定学习率和迭代次数 │
│ ├── learning_rate = 0.1 │
│ └── n_estimators = 1000 (配合早停) │
│ │
│ 第二步:调整树参数 │
│ ├── num_leaves (最重要) │
│ ├── min_data_in_leaf │
│ └── max_depth (可选) │
│ │
│ 第三步:调整采样参数 │
│ ├── feature_fraction │
│ ├── bagging_fraction │
│ └── bagging_freq │
│ │
│ 第四步:调整正则化参数 │
│ ├── lambda_l1 │
│ ├── lambda_l2 │
│ └── min_split_gain │
│ │
│ 第五步:降低学习率 │
│ └── learning_rate = 0.01 │
│ │
└─────────────────────────────────────────────────────────────┘
网格搜索 #
基本网格搜索 #
python
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer

# Demo dataset: small binary-classification problem.
data = load_breast_cancer()
X, y = data.data, data.target

# Base estimator; verbose=-1 silences LightGBM's per-iteration logging.
estimator = lgb.LGBMClassifier(random_state=42, verbose=-1)

# Search the three highest-impact parameters first.
param_grid = {
    'num_leaves': [15, 31, 63],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
}

# 5-fold CV, scored by ROC AUC, all cores in parallel.
grid_search = GridSearchCV(
    estimator, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1
)
grid_search.fit(X, y)

print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳分数: {grid_search.best_score_:.4f}")
详细网格搜索 #
python
def detailed_grid_search(X, y):
    """Exhaustive grid search over tree-complexity and sampling parameters.

    Evaluates every combination with 5-fold ``lgb.cv`` (metric: AUC) and
    returns a DataFrame of all results sorted by best CV AUC, descending.

    Parameters
    ----------
    X, y : array-like
        Features and binary labels.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with its best CV AUC.
    """
    from itertools import product
    import pandas as pd

    param_grid = {
        'num_leaves': [15, 31, 63, 127],
        'min_data_in_leaf': [10, 20, 50, 100],
        'feature_fraction': [0.6, 0.8, 1.0],
        'bagging_fraction': [0.6, 0.8, 1.0],
    }

    results = []
    for num_leaves, min_data, feat_frac, bag_frac in product(
        param_grid['num_leaves'],
        param_grid['min_data_in_leaf'],
        param_grid['feature_fraction'],
        param_grid['bagging_fraction'],
    ):
        params = {
            'num_leaves': num_leaves,
            'min_data_in_leaf': min_data,
            'feature_fraction': feat_frac,
            'bagging_fraction': bag_frac,
            'bagging_freq': 5,  # bagging only takes effect when freq > 0
            'verbose': -1,
        }
        train_data = lgb.Dataset(X, label=y)
        cv_results = lgb.cv(
            {**params, 'objective': 'binary', 'metric': 'auc'},
            train_data, num_boost_round=500, nfold=5,
            callbacks=[lgb.log_evaluation(0)],
        )
        # LightGBM >= 4.0 prefixes cv-result keys with the dataset name
        # ('valid auc-mean'); older versions use plain 'auc-mean'.
        auc_mean = cv_results.get('auc-mean', cv_results.get('valid auc-mean'))
        results.append({
            'num_leaves': num_leaves,
            'min_data_in_leaf': min_data,
            'feature_fraction': feat_frac,
            'bagging_fraction': bag_frac,
            'auc': max(auc_mean),
        })
    return pd.DataFrame(results).sort_values('auc', ascending=False)


results = detailed_grid_search(X, y)
print(results.head(10))
随机搜索 #
基本随机搜索 #
python
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Base estimator with LightGBM logging silenced.
estimator = lgb.LGBMClassifier(random_state=42, verbose=-1)

# scipy.stats distributions: uniform(loc, scale) samples from
# [loc, loc + scale], so uniform(0.5, 0.5) covers [0.5, 1.0].
param_distributions = {
    'num_leaves': randint(15, 127),
    'learning_rate': uniform(0.01, 0.3),
    'n_estimators': randint(100, 1000),
    'min_data_in_leaf': randint(10, 100),
    'feature_fraction': uniform(0.5, 0.5),
    'bagging_fraction': uniform(0.5, 0.5),
    'lambda_l1': uniform(0, 1),
    'lambda_l2': uniform(0, 1),
}

# 50 random draws, 5-fold CV, scored by ROC AUC.
random_search = RandomizedSearchCV(
    estimator, param_distributions, n_iter=50, cv=5,
    scoring='roc_auc', n_jobs=-1, random_state=42, verbose=1
)
random_search.fit(X, y)

print(f"最佳参数: {random_search.best_params_}")
print(f"最佳分数: {random_search.best_score_:.4f}")
自定义随机搜索 #
python
def custom_random_search(X, y, n_iter=50):
    """Hand-rolled random search over LightGBM parameters via 5-fold CV AUC.

    Samples ``n_iter`` random configurations, evaluates each with
    ``lgb.cv``, and returns a DataFrame sorted by AUC, descending.

    Parameters
    ----------
    X, y : array-like
        Features and binary labels.
    n_iter : int
        Number of random configurations to try.

    Returns
    -------
    pandas.DataFrame
        One row per trial: auc, best_iter, and the sampled parameters.
    """
    # Import here so the snippet works standalone (the original used `np`
    # without importing numpy).
    import numpy as np
    import pandas as pd

    results = []
    for i in range(n_iter):
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': np.random.randint(15, 127),
            'learning_rate': np.random.uniform(0.01, 0.3),
            'min_data_in_leaf': np.random.randint(10, 100),
            'feature_fraction': np.random.uniform(0.5, 1.0),
            'bagging_fraction': np.random.uniform(0.5, 1.0),
            'bagging_freq': 5,
            'lambda_l1': np.random.uniform(0, 1),
            'lambda_l2': np.random.uniform(0, 1),
            'verbose': -1,
        }
        train_data = lgb.Dataset(X, label=y)
        cv_results = lgb.cv(
            params, train_data, num_boost_round=500, nfold=5,
            callbacks=[lgb.log_evaluation(0)],
        )
        # LightGBM >= 4.0 uses 'valid auc-mean'; older versions 'auc-mean'.
        auc_mean = cv_results.get('auc-mean', cv_results.get('valid auc-mean'))
        results.append({
            'iteration': i,
            'auc': max(auc_mean),
            'best_iter': len(auc_mean),
            **{k: v for k, v in params.items()
               if k not in ['objective', 'metric', 'verbose']}
        })
        if (i + 1) % 10 == 0:
            print(f"完成 {i+1}/{n_iter} 次搜索")
    return pd.DataFrame(results).sort_values('auc', ascending=False)


results = custom_random_search(X, y, n_iter=30)
print(results.head(10))
贝叶斯优化 #
使用 Optuna #
python
import optuna


def objective(trial):
    """Optuna objective: maximize 5-fold CV AUC of a LightGBM model.

    Relies on module-level ``X`` and ``y`` defined by the earlier snippets.
    """
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': trial.suggest_int('num_leaves', 15, 127),
        # log=True samples on a log scale — appropriate for rates/penalties
        # that span orders of magnitude.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': 5,
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'verbose': -1,
    }
    train_data = lgb.Dataset(X, label=y)
    cv_results = lgb.cv(
        params, train_data, num_boost_round=500, nfold=5,
        callbacks=[lgb.log_evaluation(0)],
    )
    # LightGBM >= 4.0 uses 'valid auc-mean'; older versions 'auc-mean'.
    auc_mean = cv_results.get('auc-mean', cv_results.get('valid auc-mean'))
    return max(auc_mean)


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print(f"最佳参数: {study.best_params}")
print(f"最佳分数: {study.best_value:.4f}")
可视化优化过程 #
python
import matplotlib.pyplot as plt


def plot_optimization_history(study):
    """Plot each trial's score along with the running best score.

    Parameters
    ----------
    study : optuna.Study
        A completed (or in-progress) Optuna study.
    """
    trials = study.trials
    iterations = [t.number for t in trials]
    values = [t.value for t in trials]

    # Running maximum. Pruned/failed trials have value None — skip them in
    # the comparison instead of crashing (the original compared None > float).
    # float('-inf') also removes the original's undeclared numpy dependency.
    best_values = []
    current_best = float('-inf')
    for v in values:
        if v is not None and v > current_best:
            current_best = v
        best_values.append(current_best)

    plt.figure(figsize=(12, 6))
    plt.plot(iterations, values, 'o', alpha=0.5, label='尝试结果')
    plt.plot(iterations, best_values, 'r-', label='最佳结果')
    plt.xlabel('迭代次数')
    plt.ylabel('AUC')
    plt.title('贝叶斯优化历史')
    plt.legend()
    plt.grid(True)
    plt.show()


plot_optimization_history(study)
分步调参 #
第一步:调整 num_leaves #
python
def tune_num_leaves(X, y):
    """Step 1: sweep num_leaves with all other parameters fixed.

    Returns a DataFrame with one row per candidate and its best CV AUC.
    """
    import pandas as pd

    num_leaves_range = [15, 31, 63, 127, 255]
    results = []
    for num_leaves in num_leaves_range:
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': num_leaves,
            'learning_rate': 0.1,
            'verbose': -1,
        }
        train_data = lgb.Dataset(X, label=y)
        cv_results = lgb.cv(
            params, train_data, num_boost_round=500, nfold=5,
            callbacks=[lgb.log_evaluation(0)],
        )
        # LightGBM >= 4.0 uses 'valid auc-mean'; older versions 'auc-mean'.
        auc_mean = cv_results.get('auc-mean', cv_results.get('valid auc-mean'))
        results.append({'num_leaves': num_leaves, 'auc': max(auc_mean)})
    return pd.DataFrame(results)


results = tune_num_leaves(X, y)
print(results)

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(results['num_leaves'], results['auc'], 'o-')
plt.xlabel('num_leaves')
plt.ylabel('AUC')
plt.title('num_leaves vs AUC')
plt.grid(True)
plt.show()
第二步:调整 min_data_in_leaf #
python
def tune_min_data(X, y, best_num_leaves):
    """Step 2: sweep min_data_in_leaf with num_leaves fixed at the step-1 best.

    Returns a DataFrame with one row per candidate and its best CV AUC.
    """
    import pandas as pd

    min_data_range = [5, 10, 20, 50, 100, 200]
    results = []
    for min_data in min_data_range:
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': best_num_leaves,
            'min_data_in_leaf': min_data,
            'learning_rate': 0.1,
            'verbose': -1,
        }
        train_data = lgb.Dataset(X, label=y)
        cv_results = lgb.cv(
            params, train_data, num_boost_round=500, nfold=5,
            callbacks=[lgb.log_evaluation(0)],
        )
        # LightGBM >= 4.0 uses 'valid auc-mean'; older versions 'auc-mean'.
        auc_mean = cv_results.get('auc-mean', cv_results.get('valid auc-mean'))
        results.append({'min_data_in_leaf': min_data, 'auc': max(auc_mean)})
    return pd.DataFrame(results)


best_num_leaves = 31
results = tune_min_data(X, y, best_num_leaves)
print(results)
第三步:调整采样参数 #
python
def tune_sampling(X, y, best_num_leaves, best_min_data):
    """Step 3: grid over feature_fraction x bagging_fraction.

    num_leaves and min_data_in_leaf stay fixed at the best values found in
    the earlier steps. Returns a DataFrame sorted by CV AUC, descending.
    """
    import pandas as pd

    feature_fractions = [0.6, 0.7, 0.8, 0.9, 1.0]
    bagging_fractions = [0.6, 0.7, 0.8, 0.9, 1.0]
    results = []
    for feat_frac in feature_fractions:
        for bag_frac in bagging_fractions:
            params = {
                'objective': 'binary',
                'metric': 'auc',
                'num_leaves': best_num_leaves,
                'min_data_in_leaf': best_min_data,
                'feature_fraction': feat_frac,
                'bagging_fraction': bag_frac,
                'bagging_freq': 5,  # bagging only takes effect when freq > 0
                'learning_rate': 0.1,
                'verbose': -1,
            }
            train_data = lgb.Dataset(X, label=y)
            cv_results = lgb.cv(
                params, train_data, num_boost_round=500, nfold=5,
                callbacks=[lgb.log_evaluation(0)],
            )
            # LightGBM >= 4.0 uses 'valid auc-mean'; older: 'auc-mean'.
            auc_mean = cv_results.get('auc-mean',
                                      cv_results.get('valid auc-mean'))
            results.append({
                'feature_fraction': feat_frac,
                'bagging_fraction': bag_frac,
                'auc': max(auc_mean),
            })
    return pd.DataFrame(results).sort_values('auc', ascending=False)


results = tune_sampling(X, y, 31, 20)
print(results.head(10))
过拟合检测 #
学习曲线分析 #
python
def analyze_overfitting(X, y, params):
    """Diagnose overfitting via the train/validation AUC gap.

    Trains on an 80/20 split, plots both learning curves, then prints the
    final train AUC, validation AUC, and their gap; a gap above 0.05 is
    flagged as likely overfitting.

    Parameters
    ----------
    X, y : array-like
        Features and binary labels.
    params : dict
        LightGBM training parameters; must include ``'metric': 'auc'``
        (the curves index ``evals_result[...]['auc']``).
    """
    from sklearn.model_selection import train_test_split
    import matplotlib.pyplot as plt

    # Fixed seed so the diagnosis is reproducible run-to-run (the original
    # split was unseeded).
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    evals_result = {}
    model = lgb.train(
        params, train_data, num_boost_round=500,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.log_evaluation(50),
            lgb.record_evaluation(evals_result),
        ],
    )

    plt.figure(figsize=(12, 6))
    plt.plot(evals_result['train']['auc'], label='训练集')
    plt.plot(evals_result['valid']['auc'], label='验证集')
    plt.xlabel('迭代次数')
    plt.ylabel('AUC')
    plt.title('学习曲线')
    plt.legend()
    plt.grid(True)
    plt.show()

    train_score = evals_result['train']['auc'][-1]
    val_score = evals_result['valid']['auc'][-1]
    gap = train_score - val_score
    print(f"训练集 AUC: {train_score:.4f}")
    print(f"验证集 AUC: {val_score:.4f}")
    print(f"差距: {gap:.4f}")
    # Heuristic threshold: a gap > 0.05 suggests the model is memorizing.
    if gap > 0.05:
        print("⚠️ 可能过拟合,建议增加正则化")
    else:
        print("✅ 模型拟合良好")


params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'verbose': -1,
}
analyze_overfitting(X, y, params)
完整调参示例 #
python
import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# ---------------------------------------------------------------- data ----
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------------------------- step 1: num_leaves ----
print("第一步:调整 num_leaves")
num_leaves_range = [15, 31, 63, 127]
best_num_leaves = 31
best_score = 0
for num_leaves in num_leaves_range:
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': num_leaves,
        'learning_rate': 0.1,
        'verbose': -1,
    }
    train_data = lgb.Dataset(X_train, label=y_train)
    cv_results = lgb.cv(
        params, train_data, num_boost_round=500, nfold=5,
        callbacks=[lgb.log_evaluation(0)],
    )
    # LightGBM >= 4.0 uses 'valid auc-mean'; older versions 'auc-mean'.
    auc_mean = cv_results.get('auc-mean', cv_results.get('valid auc-mean'))
    score = max(auc_mean)
    print(f"num_leaves={num_leaves}: AUC={score:.4f}")
    if score > best_score:
        best_score = score
        best_num_leaves = num_leaves
print(f"\n最佳 num_leaves: {best_num_leaves}")

# -------------------------------------------- step 2: sampling params ----
print("\n第二步:调整采样参数")
best_feat_frac = 0.8
best_bag_frac = 0.8
best_score = 0  # reset: scores from step 1 are not comparable here
for feat_frac in [0.6, 0.8, 1.0]:
    for bag_frac in [0.6, 0.8, 1.0]:
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': best_num_leaves,
            'feature_fraction': feat_frac,
            'bagging_fraction': bag_frac,
            'bagging_freq': 5,
            'learning_rate': 0.1,
            'verbose': -1,
        }
        train_data = lgb.Dataset(X_train, label=y_train)
        cv_results = lgb.cv(
            params, train_data, num_boost_round=500, nfold=5,
            callbacks=[lgb.log_evaluation(0)],
        )
        auc_mean = cv_results.get('auc-mean',
                                  cv_results.get('valid auc-mean'))
        score = max(auc_mean)
        if score > best_score:
            best_score = score
            best_feat_frac = feat_frac
            best_bag_frac = bag_frac
print(f"最佳 feature_fraction: {best_feat_frac}")
print(f"最佳 bagging_fraction: {best_bag_frac}")

# ---------------------------------------------- step 3: final training ----
print("\n第三步:最终模型训练")
final_params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': best_num_leaves,
    'feature_fraction': best_feat_frac,
    'bagging_fraction': best_bag_frac,
    'bagging_freq': 5,
    'learning_rate': 0.05,  # lower rate for the final fit, per the recipe
    'verbose': -1,
}
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

evals_result = {}
final_model = lgb.train(
    final_params, train_data, num_boost_round=1000,
    valid_sets=[valid_data],
    callbacks=[
        lgb.log_evaluation(100),
        lgb.early_stopping(50),  # stop if validation AUC stalls for 50 rounds
        lgb.record_evaluation(evals_result),
    ],
)
print(f"\n最终模型 AUC: {final_model.best_score['valid_0']['auc']:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(evals_result['valid_0']['auc'])
plt.xlabel('迭代次数')
plt.ylabel('AUC')
plt.title('最终模型学习曲线')
plt.grid(True)
plt.show()
下一步 #
现在你已经掌握了 LightGBM 的调参技巧,接下来学习 类别特征处理,深入了解如何处理类别特征!
最后更新:2026-04-04