训练与预测 #
训练流程 #
基本训练 #
python
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a validation set.
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
data.data, data.target, test_size=0.2, random_state=42
)
# Wrap arrays in LightGBM Dataset objects; `reference` makes the
# validation set reuse the training set's feature bin boundaries.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
'objective': 'binary',  # binary classification task
'metric': 'auc',  # metric evaluated on valid_sets
'num_leaves': 31,  # max leaves per tree
'learning_rate': 0.05,
'verbose': -1  # silence LightGBM's own logging
}
model = lgb.train(
params,
train_data,
num_boost_round=1000,  # upper bound on boosting iterations
valid_sets=[valid_data]
)
训练参数详解 #
python
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[train_data, valid_data],  # evaluate on both sets each round
valid_names=['train', 'valid'],  # names used in logs and results
callbacks=[
lgb.log_evaluation(100),  # print metrics every 100 iterations
lgb.early_stopping(50)  # stop after 50 rounds without improvement
]
)
| 参数 | 说明 |
|---|---|
| params | 参数字典 |
| train_set | 训练数据集 |
| num_boost_round | 最大迭代次数 |
| valid_sets | 验证数据集列表 |
| valid_names | 验证集名称 |
| callbacks | 回调函数列表 |
早停机制 #
使用 early_stopping #
python
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[valid_data],
callbacks=[
# Stop when the validation metric fails to improve for 50 rounds.
lgb.early_stopping(stopping_rounds=50)
]
)
# best_iteration / best_score are populated when early stopping fires.
print(f"最佳迭代次数: {model.best_iteration}")
print(f"最佳分数: {model.best_score}")
早停参数 #
python
callbacks=[
lgb.early_stopping(
stopping_rounds=50,  # patience, in boosting rounds
first_metric_only=False,  # consider every tracked metric, not just the first
verbose=True  # announce when stopping triggers
)
]
| 参数 | 说明 |
|---|---|
| stopping_rounds | 早停轮数 |
| first_metric_only | 是否只使用第一个指标 |
| verbose | 是否输出信息 |
自定义早停 #
python
class CustomEarlyStopping:
    """Callback implementing simple higher-is-better early stopping.

    Watches the first metric of the first validation set and stops
    training once it has not improved for `stopping_rounds` consecutive
    rounds.
    """

    def __init__(self, stopping_rounds, metric_name='auc'):
        self.stopping_rounds = stopping_rounds
        self.metric_name = metric_name
        # float('-inf') avoids a hard numpy dependency (the original used
        # np.inf although numpy is never imported in this snippet).
        self.best_score = float('-inf')
        self.best_iteration = 0
        self.counter = 0

    def __call__(self, env):
        # env.evaluation_result_list entries are tuples of
        # (dataset_name, metric_name, value, is_higher_better).
        current_score = env.evaluation_result_list[0][2]
        if current_score > self.best_score:
            self.best_score = current_score
            self.best_iteration = env.iteration
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.stopping_rounds:
                print(f"早停触发!最佳分数: {self.best_score:.4f}")
                # LightGBM callbacks stop training by raising
                # EarlyStopException; the original's
                # `env.model.stop_training = True` has no effect.
                raise lgb.callback.EarlyStopException(
                    self.best_iteration, env.evaluation_result_list
                )
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[valid_data],
callbacks=[CustomEarlyStopping(50)]  # plain callable: any callback(env) works
)
预测方法 #
基本预测 #
python
# For a binary objective, predict() returns the positive-class
# probability for every row (a 1-D array of length n_samples).
y_pred = model.predict(X_test)
print(f"预测结果形状: {y_pred.shape}")
print(f"预测结果示例: {y_pred[:5]}")
指定迭代次数 #
python
# Predict with the early-stopping optimum rather than all trained trees.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
# Use only the first 50 boosting rounds.
y_pred_50 = model.predict(X_test, num_iteration=50)
# num_iteration <= 0 means "use every trained iteration".
y_pred_all = model.predict(X_test, num_iteration=-1)
预测叶子索引 #
python
# pred_leaf=True returns, for each sample, the index of the leaf it
# falls into in every tree (shape: n_samples x n_trees).
leaf_indices = model.predict(X_test, pred_leaf=True)
print(f"叶子索引形状: {leaf_indices.shape}")
print(f"叶子索引示例: {leaf_indices[:5]}")
预测特征贡献 #
python
# pred_contrib=True returns SHAP-style per-feature contributions; per the
# LightGBM docs the last column is the expected-value (bias) term, so the
# shape is n_samples x (n_features + 1).
contributions = model.predict(X_test, pred_contrib=True)
print(f"特征贡献形状: {contributions.shape}")
print(f"特征贡献示例: {contributions[:5]}")
不同任务类型的预测 #
二分类 #
python
# Binary objective: predict() yields probabilities; threshold at 0.5
# to obtain hard class labels.
params = {'objective': 'binary'}
model = lgb.train(params, train_data, num_boost_round=100)
y_pred_proba = model.predict(X_test)
y_pred_class = (y_pred_proba > 0.5).astype(int)
多分类 #
python
import numpy as np  # required for argmax below; missing from the original snippet
# num_class is mandatory for the multiclass objective.
params = {'objective': 'multiclass', 'num_class': 3}
model = lgb.train(params, train_data, num_boost_round=100)
# predict() returns per-class probabilities, shape (n_samples, num_class).
y_pred_proba = model.predict(X_test)
# Pick the most probable class for each row.
y_pred_class = np.argmax(y_pred_proba, axis=1)
回归 #
python
# Regression objective: predict() returns raw continuous values.
params = {'objective': 'regression'}
model = lgb.train(params, train_data, num_boost_round=100)
y_pred = model.predict(X_test)
Scikit-Learn API #
LGBMClassifier #
python
from lightgbm import LGBMClassifier
# scikit-learn style estimator; hyper-parameters mirror the native API.
clf = LGBMClassifier(
num_leaves=31,
learning_rate=0.05,
n_estimators=1000,  # max boosting rounds (early stopping may end sooner)
random_state=42,
verbose=-1
)
clf.fit(
X_train, y_train,
eval_set=[(X_test, y_test)],  # validation data for metrics / early stopping
eval_metric='auc',
callbacks=[
lgb.log_evaluation(100),  # log every 100 rounds
lgb.early_stopping(50)  # patience of 50 rounds
]
)
# predict() returns class labels; predict_proba() per-class probabilities.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)
LGBMRegressor #
python
from lightgbm import LGBMRegressor
# scikit-learn style regressor; same parameter names as the classifier.
reg = LGBMRegressor(
num_leaves=31,
learning_rate=0.05,
n_estimators=1000,
random_state=42,
verbose=-1
)
reg.fit(
X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='rmse',  # regression metric tracked on the eval set
callbacks=[
lgb.log_evaluation(100),
lgb.early_stopping(50)
]
)
y_pred = reg.predict(X_test)
LGBMRanker #
python
from lightgbm import LGBMRanker
# Learning-to-rank estimator (LambdaRank-style objectives).
ranker = LGBMRanker(
num_leaves=31,
learning_rate=0.05,
n_estimators=1000,
random_state=42,
verbose=-1
)
# Group sizes: each entry is the number of consecutive rows belonging to
# one query; the values must sum to the number of rows in X_train.
query_train = [100, 100, 100]
# The original snippet referenced an undefined `query_test`; define the
# eval groups explicitly (they must sum to the number of rows in X_test).
query_test = [100, 100, 100]
ranker.fit(
X_train, y_train,
group=query_train,
eval_set=[(X_test, y_test)],
eval_group=[query_test],  # one group list per eval_set entry
eval_metric='ndcg'
)
y_pred = ranker.predict(X_test)
回调函数 #
日志输出 #
python
callbacks=[
# Print evaluation results every `period` iterations.
lgb.log_evaluation(period=100)
]
学习率调度 #
python
def learning_rate_scheduler(current_iter):
    """Return the learning rate for `current_iter`: 0.1 decayed by 1% per round."""
    initial_lr = 0.1
    decay = 0.99
    return initial_lr * decay ** current_iter
callbacks=[
# Re-set the learning rate every iteration from the scheduler function.
lgb.reset_parameter(learning_rate=learning_rate_scheduler)
]
记录评估结果 #
python
# record_evaluation fills the dict with every iteration's metrics,
# keyed by validation-set name and then metric name.
evals_result = {}
model = lgb.train(
params,
train_data,
num_boost_round=100,
valid_sets=[valid_data],
callbacks=[
lgb.record_evaluation(evals_result)
]
)
print(evals_result)
自定义回调 #
python
def custom_callback(env):
    """Log the leading evaluation metric once every 10 iterations."""
    step = env.iteration
    metric_value = env.evaluation_result_list[0][2]
    if step % 10 == 0:
        print(f"迭代 {step}: {metric_value:.4f}")
model = lgb.train(
params,
train_data,
num_boost_round=100,
valid_sets=[valid_data],
callbacks=[custom_callback]  # any callable(env) can be used as a callback
)
模型持久化 #
保存模型 #
python
# save_model writes LightGBM's native text format.
model.save_model('model.txt')
# NOTE(review): the '.json' extension does NOT switch the output to JSON —
# save_model still writes the text format; for JSON use model.dump_model().
model.save_model('model.json')
import joblib
# joblib pickles the whole Booster object.
joblib.dump(model, 'model.pkl')
加载模型 #
python
# Booster(model_file=...) loads a model saved in the native text format.
loaded_model = lgb.Booster(model_file='model.txt')
# NOTE(review): this also expects text format — save_model never emits
# real JSON regardless of the file extension; verify against save step.
loaded_model = lgb.Booster(model_file='model.json')
loaded_model = joblib.load('model.pkl')
模型信息 #
python
# Introspection helpers on a trained Booster.
print(f"特征数量: {model.num_feature()}")
print(f"最佳迭代: {model.best_iteration}")
print(f"模型参数: {model.params}")
print(f"特征名称: {model.feature_name()}")
训练监控 #
可视化训练过程 #
python
import matplotlib.pyplot as plt
# Capture per-iteration metrics during training for plotting afterwards.
evals_result = {}
model = lgb.train(
params,
train_data,
num_boost_round=100,
valid_sets=[train_data, valid_data],
valid_names=['train', 'valid'],  # keys used in evals_result below
callbacks=[
lgb.record_evaluation(evals_result)
]
)
# Plot the recorded AUC curves for both sets.
plt.figure(figsize=(10, 6))
plt.plot(evals_result['train']['auc'], label='训练集')
plt.plot(evals_result['valid']['auc'], label='验证集')
plt.xlabel('迭代次数')
plt.ylabel('AUC')
plt.title('训练过程')
plt.legend()
plt.grid(True)
plt.show()
使用 plot_metric #
python
# NOTE(review): lgb.plot_metric expects the recorded evals_result dict or
# a scikit-learn API model; passing a raw Booster may raise — verify.
lgb.plot_metric(model, metric='auc')
plt.show()
继续训练 #
从已有模型继续训练 #
python
# First stage: train 50 rounds from scratch.
model = lgb.train(params, train_data, num_boost_round=50)
# Second stage: add 50 more rounds on top of the existing model.
model = lgb.train(
params,
train_data,
num_boost_round=50,
init_model=model  # resume boosting from this Booster
)
print(f"总迭代次数: {model.num_trees()}")
增量训练 #
python
model = lgb.train(params, train_data, num_boost_round=100)
# Fit additional trees on newly arrived data
# (X_new / y_new are assumed to be defined elsewhere).
new_data = lgb.Dataset(X_new, label=y_new)
model = lgb.train(
params,
new_data,
num_boost_round=50,
init_model=model  # continue from the previously trained model
)
完整示例 #
python
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
# ---- Data preparation: stratified 80/20 split ----
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
data.data, data.target, test_size=0.2, random_state=42, stratify=data.target
)
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
# ---- Training parameters ----
params = {
'objective': 'binary',
'metric': ['auc', 'binary_logloss'],  # track two metrics on valid_sets
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,  # column subsampling per tree
'bagging_fraction': 0.8,  # row subsampling
'bagging_freq': 5,  # re-sample rows every 5 iterations
'verbose': -1
}
# ---- Training with logging, early stopping, and metric recording ----
evals_result = {}
print("开始训练...")
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[train_data, valid_data],
valid_names=['train', 'valid'],
callbacks=[
lgb.log_evaluation(100),
lgb.early_stopping(50),
lgb.record_evaluation(evals_result)
]
)
print(f"\n最佳迭代次数: {model.best_iteration}")
print(f"最佳 AUC: {model.best_score['valid']['auc']:.4f}")
# ---- Evaluation on the held-out set at the best iteration ----
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_class = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred_class)
auc = roc_auc_score(y_test, y_pred)
print(f"\n测试集评估:")
print(f"准确率: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
# ---- Plots: learning curves (left) and feature importance (right) ----
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(evals_result['train']['auc'], label='训练集')
axes[0].plot(evals_result['valid']['auc'], label='验证集')
axes[0].set_xlabel('迭代次数')
axes[0].set_ylabel('AUC')
axes[0].set_title('AUC 变化曲线')
axes[0].legend()
axes[0].grid(True)
lgb.plot_importance(model, max_num_features=10, ax=axes[1])
axes[1].set_title('特征重要性')
plt.tight_layout()
plt.show()
# ---- Persist the trained model ----
model.save_model('breast_cancer_model.txt')
print("\n模型已保存到 breast_cancer_model.txt")
下一步 #
现在你已经掌握了 LightGBM 的训练与预测,接下来学习 模型评估,深入了解模型评估和验证方法!
最后更新:2026-04-04