GBDT 基础 #
什么是 GBDT? #
GBDT(Gradient Boosting Decision Tree,梯度提升决策树)是一种集成学习算法,通过迭代训练多个决策树,每棵树学习前面所有树的残差(负梯度),最终将所有树的预测结果相加得到最终预测。
基本思想 #
text
┌─────────────────────────────────────────────────────────────┐
│ GBDT 基本思想 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 初始预测 F₀(x) = argmin_c Σᵢ L(yᵢ, c) │
│ │
│ for m = 1 to M: │
│ 1. 计算负梯度(残差) │
│ rᵢₘ = -∂L(yᵢ, Fₘ₋₁(xᵢ)) / ∂Fₘ₋₁(xᵢ) │
│ │
│ 2. 拟合残差训练新树 │
│ hₘ(x) = DecisionTree(rᵢₘ) │
│ │
│ 3. 更新模型 │
│ Fₘ(x) = Fₘ₋₁(x) + η·hₘ(x) │
│ │
│ 最终预测: Fₘ(x) = F₀(x) + Σₖ₌₁ᴹ η·hₖ(x) │
│ │
└─────────────────────────────────────────────────────────────┘
目标函数 #
回归问题 #
对于回归问题,常用均方误差作为损失函数:
python
import numpy as np
def mse_loss(y_true, y_pred):
    """Mean squared error between targets y_true and predictions y_pred."""
    diff = y_true - y_pred
    return np.mean(diff * diff)
def mse_gradient(y_true, y_pred):
    """Negative gradient (residual) of the squared-error loss: r = y - F."""
    residual = y_true - y_pred
    return residual
# Worked example: loss and residuals for a small regression batch.
y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.5, 2.5, 2.5, 4.5])
gradient = mse_gradient(y_true, y_pred)  # residuals y - F
loss = mse_loss(y_true, y_pred)
print(f"损失: {loss:.4f}")
print(f"梯度(残差): {gradient}")
二分类问题 #
对于二分类问题,使用对数损失函数:
python
def binary_log_loss(y_true, y_pred_proba):
    """Binary cross-entropy (log loss) averaged over all samples."""
    # Clip probabilities away from 0/1 so the logarithms stay finite.
    eps = 1e-15
    p = np.clip(y_pred_proba, eps, 1 - eps)
    per_sample = y_true * np.log(p) + (1 - y_true) * np.log(1 - p)
    return -np.mean(per_sample)
def binary_log_loss_gradient(y_true, y_pred_proba):
    """Negative gradient (residual) of the log loss w.r.t. the predicted probability."""
    residual = y_true - y_pred_proba
    return residual
# Worked example: log loss and its residuals for four samples.
y_true = np.array([0, 1, 1, 0])
y_pred_proba = np.array([0.2, 0.8, 0.6, 0.3])
gradient = binary_log_loss_gradient(y_true, y_pred_proba)
loss = binary_log_loss(y_true, y_pred_proba)
print(f"损失: {loss:.4f}")
print(f"梯度: {gradient}")
多分类问题 #
对于多分类问题,使用交叉熵损失:
python
def cross_entropy_loss(y_true, y_pred_proba):
    """Multiclass cross-entropy for integer labels and an (n, k) probability matrix."""
    eps = 1e-15
    probs = np.clip(y_pred_proba, eps, 1 - eps)
    rows = np.arange(y_true.shape[0])
    # Pick each sample's predicted probability for its true class.
    true_class_probs = probs[rows, y_true]
    return -np.mean(np.log(true_class_probs))
# 3-sample, 3-class example: each label indexes a row of the probability matrix.
y_true = np.array([0, 1, 2])
y_pred_proba = np.array([
    [0.7, 0.2, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.2, 0.7],
])
loss = cross_entropy_loss(y_true, y_pred_proba)
print(f"交叉熵损失: {loss:.4f}")
梯度提升过程 #
简单实现 #
python
import numpy as np
from sklearn.tree import DecisionTreeRegressor
class SimpleGBDT:
    """Minimal gradient-boosted regression trees with squared-error loss.

    Each boosting round fits a DecisionTreeRegressor to the current
    residuals (the negative gradient of MSE) and adds its shrunken
    prediction to the running model F.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators  # number of boosting rounds / trees
        self.learning_rate = learning_rate  # shrinkage on each tree's output
        self.max_depth = max_depth
        self.trees = []
        self.initial_prediction = None

    def fit(self, X, y):
        """Train the ensemble on (X, y); prints training MSE every 10 rounds.

        Refitting starts from scratch.
        """
        # Bug fix: reset state so a second fit() does not stack new trees
        # on top of the previous ensemble.
        self.trees = []
        # F0 is the constant minimising MSE: the target mean.
        self.initial_prediction = np.mean(y)
        F = np.full(len(y), self.initial_prediction)
        for i in range(self.n_estimators):
            residuals = y - F  # negative gradient of 1/2 * squared error
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            self.trees.append(tree)
            F += self.learning_rate * tree.predict(X)
            if (i + 1) % 10 == 0:
                mse = np.mean((y - F) ** 2)
                print(f"迭代 {i+1}: MSE = {mse:.4f}")

    def predict(self, X):
        """Return F0 plus the sum of shrunken tree predictions for each row of X."""
        F = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            F += self.learning_rate * tree.predict(X)
        return F
# Demo: fit SimpleGBDT on a synthetic regression problem where y depends
# linearly on the first two of five features (plus small Gaussian noise).
np.random.seed(42)
X = np.random.randn(100, 5)
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(100) * 0.1
gbdt = SimpleGBDT(n_estimators=50, learning_rate=0.1, max_depth=3)
gbdt.fit(X, y)
y_pred = gbdt.predict(X)
# NOTE: this is training-set MSE — there is no held-out split here.
mse = np.mean((y - y_pred) ** 2)
print(f"\n最终 MSE: {mse:.4f}")
可视化提升过程 #
python
import matplotlib.pyplot as plt
def visualize_boosting(X, y, n_estimators=10):
    """Plot predicted-vs-true scatter after each early boosting round.

    Draws a fixed 2x5 grid of axes, so n_estimators should be at most 10.
    """
    running_pred = np.full(len(y), np.mean(y))  # F0: constant mean prediction
    fig, axes = plt.subplots(2, 5, figsize=(20, 8))
    axes = axes.flatten()
    for i in range(n_estimators):
        # One boosting step: fit a shallow tree to the current residuals.
        tree = DecisionTreeRegressor(max_depth=2)
        tree.fit(X, y - running_pred)
        running_pred += 0.1 * tree.predict(X)
        ax = axes[i]
        ax.scatter(y, running_pred, alpha=0.5)
        # Dashed diagonal marks the perfect-fit line y_pred == y_true.
        ax.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
        ax.set_xlabel('真实值')
        ax.set_ylabel('预测值')
        ax.set_title(f'迭代 {i+1}')
        mse = np.mean((y - running_pred) ** 2)
        ax.text(0.05, 0.95, f'MSE: {mse:.3f}',
                transform=ax.transAxes, verticalalignment='top')
    plt.tight_layout()
    plt.show()
visualize_boosting(X, y, n_estimators=10)
正则化 #
学习率 #
python
def compare_learning_rates(X, y):
    """Overlay predicted-vs-true scatters for several learning rates on one figure."""
    plt.figure(figsize=(12, 6))
    for lr in [0.01, 0.05, 0.1, 0.3]:
        model = SimpleGBDT(n_estimators=100, learning_rate=lr, max_depth=3)
        model.fit(X, y)
        y_pred = model.predict(X)
        mse = np.mean((y - y_pred) ** 2)
        plt.scatter(y, y_pred, alpha=0.5, label=f'lr={lr}, MSE={mse:.3f}')
    # Dashed diagonal marks the perfect-fit line.
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--')
    plt.xlabel('真实值')
    plt.ylabel('预测值')
    plt.title('不同学习率的效果')
    plt.legend()
    plt.grid(True)
    plt.show()
compare_learning_rates(X, y)
子采样 #
python
class GBRTWithSubsampling:
    """GBDT variant with stochastic row subsampling on each boosting round."""

    def __init__(self, n_estimators=100, learning_rate=0.1,
                 max_depth=3, subsample=0.8):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        # Fraction of rows drawn (without replacement) per round.
        self.subsample = subsample
        self.trees = []

    def fit(self, X, y):
        """Train the ensemble; each tree sees only a random subset of rows."""
        # Bug fix: reset so that refitting does not extend the old ensemble.
        self.trees = []
        # NOTE(review): unlike SimpleGBDT there is no constant F0 term —
        # the model starts from 0, so predictions are only consistent with
        # targets centred near zero. Kept as in the original design.
        F = np.zeros(len(y))
        for _ in range(self.n_estimators):
            sample_idx = np.random.choice(
                len(y),
                size=int(len(y) * self.subsample),
                replace=False
            )
            residuals = y - F
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            # The tree is fit on the sampled subset of the residuals only...
            tree.fit(X[sample_idx], residuals[sample_idx])
            self.trees.append(tree)
            # ...but its shrunken update is applied to every row.
            F += self.learning_rate * tree.predict(X)

    def predict(self, X):
        """Return the sum of shrunken tree predictions (no F0 offset)."""
        F = np.zeros(X.shape[0])
        for tree in self.trees:
            F += self.learning_rate * tree.predict(X)
        return F
分裂准则 #
均方误差减少 #
python
def calculate_mse_reduction(y, left_idx, right_idx):
    """Total squared-error reduction achieved by splitting y into two index sets."""
    def sse(values):
        # variance * count == sum of squared deviations from the mean
        return np.var(values) * len(values)
    return sse(y) - sse(y[left_idx]) - sse(y[right_idx])
# Example: split the sorted values 1..8 into lower and upper halves —
# a balanced split with a large variance reduction.
y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
left_idx = np.array([0, 1, 2, 3])
right_idx = np.array([4, 5, 6, 7])
reduction = calculate_mse_reduction(y, left_idx, right_idx)
print(f"MSE 减少: {reduction:.4f}")
寻找最佳分裂点 #
python
def find_best_split(X, y):
    """Exhaustively search every (feature, threshold) pair and return the
    split with the largest MSE reduction as (feature, threshold, gain).

    Returns (None, None, -inf) when no valid split exists.
    """
    best = (None, None, -np.inf)  # (feature, threshold, gain)
    for feature in range(X.shape[1]):
        column = X[:, feature]
        for threshold in np.unique(column):
            left_idx = np.where(column <= threshold)[0]
            right_idx = np.where(column > threshold)[0]
            # Skip degenerate splits that leave one side empty.
            if len(left_idx) == 0 or len(right_idx) == 0:
                continue
            gain = calculate_mse_reduction(y, left_idx, right_idx)
            if gain > best[2]:
                best = (feature, threshold, gain)
    return best
# Demo: only feature 0 drives the target, so the best split should pick it.
X = np.random.randn(100, 5)
y = 2 * X[:, 0] + np.random.randn(100) * 0.1
feature, threshold, gain = find_best_split(X, y)
print(f"最佳分裂特征: {feature}")
print(f"最佳分裂阈值: {threshold:.4f}")
print(f"增益: {gain:.4f}")
GBDT vs 随机森林 #
| 特性 | GBDT | 随机森林 |
|---|---|---|
| 树的关系 | 串行依赖 | 并行独立 |
| 树的数量 | 通常较少 | 通常较多 |
| 树的深度 | 较浅 | 较深 |
| 过拟合风险 | 较高 | 较低 |
| 训练速度 | 较慢 | 较快 |
| 预测速度 | 较快 | 较慢 |
| 参数敏感度 | 较高 | 较低 |
GBDT 的优缺点 #
优点 #
text
✅ 高准确率
- 强大的拟合能力
- 适合各种类型的数据
✅ 特征工程友好
- 自动特征选择
- 处理非线性关系
✅ 可解释性
- 特征重要性
- 树结构可视化
✅ 灵活性
- 可自定义损失函数
- 适合多种任务
缺点 #
text
⚠️ 训练时间较长
- 串行训练
- 难以并行化
⚠️ 容易过拟合
- 需要正则化
- 需要调参
⚠️ 参数较多
- 需要经验
- 调优耗时
⚠️ 对异常值敏感
- 需要数据清洗
- 鲁棒性有限
下一步 #
现在你已经理解了 GBDT 的基本原理,接下来学习 直方图算法,了解 LightGBM 如何加速 GBDT 训练!
最后更新:2026-04-04