GBDT 基础 #

什么是 GBDT? #

GBDT(Gradient Boosting Decision Tree,梯度提升决策树)是一种集成学习算法,通过迭代训练多个决策树,每棵树学习前面所有树的残差(负梯度),最终将所有树的预测结果相加得到最终预测。

基本思想 #

text
┌─────────────────────────────────────────────────────────────┐
│                    GBDT 基本思想                             │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  初始预测 F₀(x) = argmin_c Σᵢ L(yᵢ, c)                      │
│                                                             │
│  for m = 1 to M:                                            │
│      1. 计算负梯度(残差)                                   │
│         rᵢₘ = -∂L(yᵢ, Fₘ₋₁(xᵢ)) / ∂Fₘ₋₁(xᵢ)              │
│                                                             │
│      2. 拟合残差训练新树                                     │
│         hₘ(x) = DecisionTree(rᵢₘ)                          │
│                                                             │
│      3. 更新模型                                            │
│         Fₘ(x) = Fₘ₋₁(x) + η·hₘ(x)                          │
│                                                             │
│  最终预测: F_M(x) = F₀(x) + Σₘ η·hₘ(x)                      │
│                                                             │
└─────────────────────────────────────────────────────────────┘

目标函数 #

回归问题 #

对于回归问题,常用均方误差作为损失函数:

python
import numpy as np

def mse_loss(y_true, y_pred):
    """Mean squared error between targets and predictions."""
    diff = y_true - y_pred
    return np.mean(diff ** 2)

def mse_gradient(y_true, y_pred):
    """Residual y - y_hat: the negative gradient of the (halved) squared
    error with respect to the model prediction, i.e. the target each new
    tree is fitted against."""
    return -(y_pred - y_true)

# Worked example: four targets vs. slightly-off predictions.
y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.5, 2.5, 2.5, 4.5])

loss, gradient = mse_loss(y_true, y_pred), mse_gradient(y_true, y_pred)

print(f"损失: {loss:.4f}")
print(f"梯度(残差): {gradient}")

二分类问题 #

对于二分类问题,使用对数损失函数:

python
def binary_log_loss(y_true, y_pred_proba):
    """Binary cross-entropy (log loss), with probabilities clipped away
    from 0 and 1 to keep the logs finite."""
    epsilon = 1e-15
    p = np.clip(y_pred_proba, epsilon, 1 - epsilon)
    pos_term = y_true * np.log(p)
    neg_term = (1 - y_true) * np.log(1 - p)
    return -np.mean(pos_term + neg_term)

def binary_log_loss_gradient(y_true, y_pred_proba):
    """Residual y - p: the negative gradient of the log loss with respect
    to the model score (logit), which is what each boosting tree fits."""
    residual = y_true - y_pred_proba
    return residual

# Worked example: two negatives and two positives with varying confidence.
y_true = np.array([0, 1, 1, 0])
y_pred_proba = np.array([0.2, 0.8, 0.6, 0.3])

print(f"损失: {binary_log_loss(y_true, y_pred_proba):.4f}")
print(f"梯度: {binary_log_loss_gradient(y_true, y_pred_proba)}")

多分类问题 #

对于多分类问题,使用交叉熵损失:

python
def cross_entropy_loss(y_true, y_pred_proba):
    """Multi-class cross-entropy for integer class labels and an (n, k)
    probability matrix; probabilities are clipped to keep the logs finite."""
    epsilon = 1e-15
    probs = np.clip(y_pred_proba, epsilon, 1 - epsilon)
    # Probability each row assigned to its true class.
    picked = probs[np.arange(y_true.shape[0]), y_true]
    return -np.mean(np.log(picked))

# Worked example: three samples, each predicted correctly with ~0.7-0.8 mass.
y_true = np.array([0, 1, 2])
y_pred_proba = np.array([
    [0.7, 0.2, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.2, 0.7]
])

print(f"交叉熵损失: {cross_entropy_loss(y_true, y_pred_proba):.4f}")

梯度提升过程 #

简单实现 #

python
import numpy as np
from sklearn.tree import DecisionTreeRegressor

class SimpleGBDT:
    """Minimal gradient-boosted regression: each round fits one shallow
    tree to the current squared-error residuals and adds it, shrunken by
    the learning rate, to the running prediction."""

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators      # number of boosting rounds
        self.learning_rate = learning_rate    # shrinkage applied to every tree
        self.max_depth = max_depth            # depth of each weak learner
        self.trees = []
        self.initial_prediction = None        # F0: mean of the training targets

    def fit(self, X, y):
        """Boost n_estimators trees against the residuals of y."""
        self.initial_prediction = np.mean(y)
        current = np.full(len(y), self.initial_prediction)

        for i in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            # The residual y - F is the negative gradient of 1/2 * MSE.
            tree.fit(X, y - current)
            self.trees.append(tree)

            current = current + self.learning_rate * tree.predict(X)

            # Progress report every 10 rounds.
            if (i + 1) % 10 == 0:
                mse = np.mean((y - current) ** 2)
                print(f"迭代 {i+1}: MSE = {mse:.4f}")

    def predict(self, X):
        """Initial prediction plus the shrunken sum of all tree outputs."""
        out = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            out = out + self.learning_rate * tree.predict(X)
        return out

# Synthetic regression data: the target depends linearly on the first
# two of five features, plus small Gaussian noise.
np.random.seed(42)
X = np.random.randn(100, 5)
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(100) * 0.1

gbdt = SimpleGBDT(n_estimators=50, learning_rate=0.1, max_depth=3)
gbdt.fit(X, y)

# Training-set error only — there is no held-out split here, so this
# understates the generalization error.
y_pred = gbdt.predict(X)
mse = np.mean((y - y_pred) ** 2)
print(f"\n最终 MSE: {mse:.4f}")

可视化提升过程 #

python
import matplotlib.pyplot as plt

def visualize_boosting(X, y, n_estimators=10):
    """Plot predicted-vs-true scatter after each boosting iteration.

    Fix: the original hard-coded a 2x5 subplot grid, so any
    n_estimators > 10 raised IndexError and n_estimators < 10 left the
    extra axes blank. The grid is now sized from n_estimators.

    Args:
        X: feature matrix, shape (n_samples, n_features).
        y: regression targets, shape (n_samples,).
        n_estimators: number of boosting iterations (one subplot each).
    """
    initial_prediction = np.mean(y)
    F = np.full(len(y), initial_prediction)

    # Size the grid from n_estimators: at most 5 plots per row.
    n_cols = min(n_estimators, 5)
    n_rows = int(np.ceil(n_estimators / n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows))
    axes = np.atleast_1d(axes).flatten()

    for i in range(n_estimators):
        # One boosting step: fit a shallow tree to the residuals and add
        # it with a fixed shrinkage of 0.1.
        residuals = y - F

        tree = DecisionTreeRegressor(max_depth=2)
        tree.fit(X, residuals)
        predictions = tree.predict(X)
        F += 0.1 * predictions

        axes[i].scatter(y, F, alpha=0.5)
        axes[i].plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
        axes[i].set_xlabel('真实值')
        axes[i].set_ylabel('预测值')
        axes[i].set_title(f'迭代 {i+1}')

        mse = np.mean((y - F) ** 2)
        axes[i].text(0.05, 0.95, f'MSE: {mse:.3f}',
                    transform=axes[i].transAxes, verticalalignment='top')

    # Hide any unused cells at the end of the grid.
    for ax in axes[n_estimators:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Render the first 10 boosting steps for the synthetic data defined above.
visualize_boosting(X, y, n_estimators=10)

正则化 #

学习率 #

python
def compare_learning_rates(X, y):
    """Fit SimpleGBDT at several learning rates and overlay the
    predicted-vs-true scatter for each, labelled with its training MSE."""
    plt.figure(figsize=(12, 6))

    for lr in (0.01, 0.05, 0.1, 0.3):
        model = SimpleGBDT(n_estimators=100, learning_rate=lr, max_depth=3)
        model.fit(X, y)
        preds = model.predict(X)
        mse = np.mean((y - preds) ** 2)
        plt.scatter(y, preds, alpha=0.5, label=f'lr={lr}, MSE={mse:.3f}')

    # Diagonal = perfect prediction.
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--')
    plt.xlabel('真实值')
    plt.ylabel('预测值')
    plt.title('不同学习率的效果')
    plt.legend()
    plt.grid(True)
    plt.show()

# Compare four learning rates on the synthetic data defined earlier.
compare_learning_rates(X, y)

子采样 #

python
class GBRTWithSubsampling:
    """Gradient-boosted regression with stochastic row subsampling.

    Each boosting round fits its tree on a random row subsample (drawn
    without replacement), which regularizes the ensemble. Consistent with
    SimpleGBDT above, boosting starts from the mean target (the original
    started from zero, leaving predict() uncalibrated with few trees).
    """

    def __init__(self, n_estimators=100, learning_rate=0.1,
                 max_depth=3, subsample=0.8):
        self.n_estimators = n_estimators      # number of boosting rounds
        self.learning_rate = learning_rate    # shrinkage applied to every tree
        self.max_depth = max_depth            # depth of each weak learner
        self.subsample = subsample            # fraction of rows used per tree
        self.trees = []
        self.initial_prediction = 0.0         # F0; set to mean(y) in fit()

    def fit(self, X, y):
        """Boost trees on residuals; each tree sees only a row subsample."""
        # Start from the target mean, matching SimpleGBDT.
        self.initial_prediction = np.mean(y)
        F = np.full(len(y), self.initial_prediction)

        n_rows = int(len(y) * self.subsample)
        for _ in range(self.n_estimators):
            sample_idx = np.random.choice(len(y), size=n_rows, replace=False)

            residuals = y - F

            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X[sample_idx], residuals[sample_idx])
            self.trees.append(tree)

            # Update the running prediction on ALL rows, not just the sample.
            predictions = tree.predict(X)
            F += self.learning_rate * predictions

    def predict(self, X):
        """Initial prediction plus the shrunken sum of all tree outputs."""
        F = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            F += self.learning_rate * tree.predict(X)
        return F

分裂准则 #

均方误差减少 #

python
def calculate_mse_reduction(y, left_idx, right_idx):
    """Sum-of-squared-errors reduction achieved by splitting y into the
    two index groups (larger is a better split)."""
    def sse(values):
        # variance * count == sum of squared deviations from the group mean
        return np.var(values) * len(values)

    return sse(y) - sse(y[left_idx]) - sse(y[right_idx])

# Split eight evenly spaced values into a low half and a high half and
# score that split.
y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
left_idx = np.array([0, 1, 2, 3])
right_idx = np.array([4, 5, 6, 7])

print(f"MSE 减少: {calculate_mse_reduction(y, left_idx, right_idx):.4f}")

寻找最佳分裂点 #

python
def find_best_split(X, y):
    """Exhaustively search every (feature, threshold) pair for the split
    with the largest SSE reduction.

    Returns (best_feature, best_threshold, best_gain); the first two are
    None when no valid split exists.
    """
    best = (None, None, -np.inf)  # (feature, threshold, gain)

    for feature in range(X.shape[1]):
        # Every distinct value of the feature is a candidate threshold.
        for threshold in np.unique(X[:, feature]):
            left_idx = np.where(X[:, feature] <= threshold)[0]
            right_idx = np.where(X[:, feature] > threshold)[0]

            # A split must leave samples on both sides.
            if len(left_idx) == 0 or len(right_idx) == 0:
                continue

            gain = calculate_mse_reduction(y, left_idx, right_idx)
            if gain > best[2]:
                best = (feature, threshold, gain)

    return best

# Demo on synthetic data where feature 0 carries (almost) all the signal.
X = np.random.randn(100, 5)
y = 2 * X[:, 0] + np.random.randn(100) * 0.1

best_feature, best_threshold, best_gain = find_best_split(X, y)
print(f"最佳分裂特征: {best_feature}")
print(f"最佳分裂阈值: {best_threshold:.4f}")
print(f"增益: {best_gain:.4f}")

GBDT vs 随机森林 #

特性 GBDT 随机森林
树的关系 串行依赖 并行独立
树的数量 通常较少 通常较多
树的深度 较浅 较深
过拟合风险 较高 较低
训练速度 较慢 较快
预测速度 较快 较慢
参数敏感度 较高 较低

GBDT 的优缺点 #

优点 #

text
✅ 高准确率
   - 强大的拟合能力
   - 适合各种类型的数据

✅ 特征工程友好
   - 自动特征选择
   - 处理非线性关系

✅ 可解释性
   - 特征重要性
   - 树结构可视化

✅ 灵活性
   - 可自定义损失函数
   - 适合多种任务

缺点 #

text
⚠️ 训练时间较长
   - 串行训练
   - 难以并行化

⚠️ 容易过拟合
   - 需要正则化
   - 需要调参

⚠️ 参数较多
   - 需要经验
   - 调优耗时

⚠️ 对异常值敏感
   - 需要数据清洗
   - 鲁棒性有限

下一步 #

现在你已经理解了 GBDT 的基本原理,接下来学习 直方图算法,了解 LightGBM 如何加速 GBDT 训练!

最后更新:2026-04-04