GOSS 算法 #

什么是 GOSS? #

GOSS(Gradient-based One-Side Sampling,梯度单边采样)是 LightGBM 的核心优化算法之一。它通过保留大梯度样本(对学习重要的样本)和随机采样小梯度样本,在保持精度的同时大幅减少计算量。

核心思想 #

text
┌─────────────────────────────────────────────────────────────┐
│                    GOSS 核心思想                             │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  样本梯度大小反映了样本对模型学习的重要性:                   │
│                                                             │
│  - 大梯度样本:模型预测误差大,对学习重要                    │
│  - 小梯度样本:模型预测误差小,对学习不重要                  │
│                                                             │
│  GOSS 策略:                                                 │
│  1. 保留所有大梯度样本(top a%)                            │
│  2. 随机采样小梯度样本(b%)                                │
│  3. 对小梯度样本乘以权重 (1-a)/b                            │
│                                                             │
│  结果:                                                      │
│  - 减少样本数量,加速训练                                   │
│  - 保持数据分布,不损失精度                                 │
│                                                             │
└─────────────────────────────────────────────────────────────┘

算法原理 #

采样策略 #

python
import numpy as np

def goss_sampling(gradients, a=0.1, b=0.1):
    """GOSS (Gradient-based One-Side Sampling).

    Keeps the top ``a`` fraction of samples ranked by absolute gradient and
    uniformly samples a ``b`` fraction of the remaining (small-gradient)
    samples, compensating the sampled ones with weight ``(1 - a) / b`` so the
    weighted gradient sums stay approximately unbiased.

    Args:
        gradients: 1-D array of per-sample gradients.
        a: fraction of large-gradient samples to keep.
        b: fraction of the remaining small-gradient samples to draw.

    Returns:
        selected_indices: indices (into ``gradients``) of retained samples.
        weights: array of length ``len(gradients)``; entries for the randomly
            sampled small-gradient samples carry ``(1 - a) / b``, all others 1.
    """
    n_samples = len(gradients)
    abs_gradients = np.abs(gradients)

    n_top = int(n_samples * a)
    # BUG FIX: slicing with [-n_top:] selects ALL samples when n_top == 0
    # (e.g. small n_samples * a), so index from an explicit start instead.
    top_indices = np.argsort(abs_gradients)[n_samples - n_top:]

    remaining_indices = np.setdiff1d(np.arange(n_samples), top_indices)
    n_random = int(len(remaining_indices) * b)
    random_indices = np.random.choice(
        remaining_indices, n_random, replace=False
    )

    selected_indices = np.concatenate([top_indices, random_indices])

    weights = np.ones(n_samples)
    weights[random_indices] = (1 - a) / b

    return selected_indices, weights

# Demo: run GOSS on a toy gradient vector, keeping the top 25% by magnitude
# and randomly sampling 25% of the small-gradient remainder.
demo_grads = np.array([0.1, 0.5, 0.01, 0.8, 0.02, 0.3, 0.001, 0.6])
sel_idx, sel_weights = goss_sampling(demo_grads, a=0.25, b=0.25)

print(f"原始梯度: {demo_grads}")
print(f"选中索引: {sel_idx}")
print(f"选中样本梯度: {demo_grads[sel_idx]}")
print(f"样本权重: {sel_weights[sel_idx]}")

信息增益计算 #

python
def calculate_gain_with_goss(gradients, hessians, selected_indices, weights,
                             split_indices, lambda_l2=0.1):
    """Split gain computed over GOSS-sampled data.

    Args:
        gradients: per-sample gradient array (indexed by original sample id).
        hessians: per-sample hessian array (same indexing).
        selected_indices: original-sample indices chosen by GOSS.
        weights: per-sample weight array; compensates sampled small gradients.
        split_indices: pair ``(left_idx, right_idx)`` of positions WITHIN the
            sampled subset describing the candidate split.
        lambda_l2: L2 regularization added to each hessian denominator.

    Returns:
        Information gain of the split: left + right scores minus parent score.
    """
    left_idx, right_idx = split_indices

    def _weighted_sums(local_positions):
        # Map subset positions back to original sample ids before indexing.
        orig = selected_indices[local_positions]
        w = weights[orig]
        return np.sum(gradients[orig] * w), np.sum(hessians[orig] * w)

    def _score(grad_sum, hess_sum):
        # Standard second-order leaf score: G^2 / (H + lambda).
        return (grad_sum ** 2) / (hess_sum + lambda_l2)

    left_grad, left_hess = _weighted_sums(left_idx)
    right_grad, right_hess = _weighted_sums(right_idx)

    parent_score = _score(left_grad + right_grad, left_hess + right_hess)
    return _score(left_grad, left_hess) + _score(right_grad, right_hess) - parent_score

完整实现 #

python
import numpy as np

class GOSSTree:
    """Regression tree trained with GOSS sampling on (gradient, hessian) data.

    At each internal node the split search runs only on a GOSS-selected
    subset (all large-gradient samples plus a weighted random draw of the
    small-gradient ones); leaf values are computed on the full node data.
    """

    def __init__(self, max_depth=3, min_samples=10,
                 top_rate=0.2, other_rate=0.1, lambda_l2=0.1):
        # max_depth: maximum tree depth; min_samples: minimum samples to split.
        # top_rate / other_rate: GOSS "a" and "b" fractions.
        # lambda_l2: L2 regularization in leaf values and gain scores.
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.top_rate = top_rate
        self.other_rate = other_rate
        self.lambda_l2 = lambda_l2
        self.tree = None

    def fit(self, X, gradients, hessians):
        """Build the tree from features X and per-sample gradients/hessians."""
        self.tree = self._build_tree(X, gradients, hessians, depth=0)

    def _build_tree(self, X, gradients, hessians, depth):
        """Recursively build a node dict; leaves store the Newton-step value."""
        if depth >= self.max_depth or len(gradients) < self.min_samples:
            return {
                'leaf': True,
                # Second-order optimal leaf value: -G / (H + lambda).
                'value': -np.sum(gradients) / (np.sum(hessians) + self.lambda_l2)
            }

        # Search splits only on the GOSS-sampled subset (with weights).
        selected_idx, weights = self._goss_sample(gradients)

        best_feature, best_threshold, best_gain = self._find_best_split(
            X[selected_idx], gradients[selected_idx],
            hessians[selected_idx], weights[selected_idx]
        )

        # No useful split found (includes the "no valid split" -inf case).
        if best_gain <= 0:
            return {
                'leaf': True,
                'value': -np.sum(gradients) / (np.sum(hessians) + self.lambda_l2)
            }

        # Partition the FULL node data by the chosen split.
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask

        return {
            'leaf': False,
            'feature': best_feature,
            'threshold': best_threshold,
            'left': self._build_tree(
                X[left_mask], gradients[left_mask],
                hessians[left_mask], depth + 1
            ),
            'right': self._build_tree(
                X[right_mask], gradients[right_mask],
                hessians[right_mask], depth + 1
            )
        }

    def _goss_sample(self, gradients):
        """GOSS sampling: keep top_rate by |gradient|, sample other_rate of rest.

        Returns (selected_indices, weights) where weights has one entry per
        ORIGINAL sample and sampled small-gradient entries are up-weighted by
        (1 - top_rate) / other_rate.
        """
        n_samples = len(gradients)
        abs_gradients = np.abs(gradients)

        n_top = int(n_samples * self.top_rate)
        # BUG FIX: [-n_top:] selects ALL samples when n_top == 0 (tiny nodes);
        # slice from an explicit start position so n_top == 0 yields none.
        top_indices = np.argsort(abs_gradients)[n_samples - n_top:]

        remaining_indices = np.setdiff1d(np.arange(n_samples), top_indices)
        n_random = int(len(remaining_indices) * self.other_rate)
        random_indices = np.random.choice(
            remaining_indices, n_random, replace=False
        )

        selected_indices = np.concatenate([top_indices, random_indices])

        weights = np.ones(n_samples)
        weights[random_indices] = (1 - self.top_rate) / self.other_rate

        return selected_indices, weights

    def _find_best_split(self, X, gradients, hessians, weights):
        """Exhaustive (feature, threshold) search maximizing weighted gain.

        Returns (best_feature, best_threshold, best_gain); feature/threshold
        are None and gain is -inf when no valid split exists.
        """
        best_gain = -np.inf
        best_feature = None
        best_threshold = None

        n_features = X.shape[1]

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])

            for threshold in thresholds:
                left_mask = X[:, feature] <= threshold
                right_mask = ~left_mask

                # Skip degenerate splits that leave one side empty.
                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue

                gain = self._calculate_gain(
                    gradients, hessians, weights, left_mask, right_mask
                )

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold, best_gain

    def _calculate_gain(self, gradients, hessians, weights, left_mask, right_mask):
        """Weighted split gain: left + right leaf scores minus parent score."""
        left_grad = np.sum(gradients[left_mask] * weights[left_mask])
        left_hess = np.sum(hessians[left_mask] * weights[left_mask])

        right_grad = np.sum(gradients[right_mask] * weights[right_mask])
        right_hess = np.sum(hessians[right_mask] * weights[right_mask])

        total_grad = left_grad + right_grad
        total_hess = left_hess + right_hess

        left_score = (left_grad ** 2) / (left_hess + self.lambda_l2)
        right_score = (right_grad ** 2) / (right_hess + self.lambda_l2)
        total_score = (total_grad ** 2) / (total_hess + self.lambda_l2)

        return left_score + right_score - total_score

    def predict(self, X):
        """Return per-row leaf values for a 2-D feature matrix X."""
        predictions = np.zeros(len(X))
        for i in range(len(X)):
            predictions[i] = self._predict_single(X[i], self.tree)
        return predictions

    def _predict_single(self, x, node):
        """Route a single sample down the tree to its leaf value."""
        if node['leaf']:
            return node['value']

        if x[node['feature']] <= node['threshold']:
            return self._predict_single(x, node['left'])
        else:
            return self._predict_single(x, node['right'])

性能分析 #

采样效果 #

python
import matplotlib.pyplot as plt

def analyze_goss_sampling(n_samples=10000, top_rate=0.1, other_rate=0.1):
    """Visualize how GOSS partitions a synthetic gradient population.

    Draws three panels: the overall |gradient| histogram, the histograms of
    the kept (large-gradient) and randomly drawn (small-gradient) subsets,
    and a bar chart of sample-count reduction.
    """
    grads = np.random.randn(n_samples)
    magnitudes = np.abs(grads)

    # Large-gradient set: top `top_rate` fraction by |gradient|.
    keep_count = int(n_samples * top_rate)
    large_idx = np.argsort(magnitudes)[-keep_count:]

    # Small-gradient pool: everything else; draw `other_rate` of it uniformly.
    pool = np.setdiff1d(np.arange(n_samples), large_idx)
    draw_count = int(len(pool) * other_rate)
    drawn_idx = np.random.choice(pool, draw_count, replace=False)

    chosen = np.concatenate([large_idx, drawn_idx])

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    axes[0].hist(magnitudes, bins=50, alpha=0.7, label='所有样本')
    axes[0].set_xlabel('|梯度|')
    axes[0].set_ylabel('频数')
    axes[0].set_title('梯度分布')
    axes[0].legend()

    axes[1].hist(magnitudes[large_idx], bins=20, alpha=0.7,
                label=f'大梯度样本 (top {top_rate*100}%)')
    axes[1].hist(magnitudes[drawn_idx], bins=20, alpha=0.7,
                label=f'随机采样 ({other_rate*100}%)')
    axes[1].set_xlabel('|梯度|')
    axes[1].set_ylabel('频数')
    axes[1].set_title('采样分布')
    axes[1].legend()

    reduction = (1 - len(chosen) / n_samples) * 100

    axes[2].bar(['原始样本', '采样后'], [n_samples, len(chosen)])
    axes[2].set_ylabel('样本数量')
    axes[2].set_title(f'样本减少: {reduction:.1f}%')

    plt.tight_layout()
    plt.show()

analyze_goss_sampling()

计算复杂度 #

text
┌─────────────────────────────────────────────────────────────┐
│                    计算复杂度对比                            │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  传统方法:                                                   │
│  - 每次分裂: O(#data × #feature)                            │
│  - 总计: O(#data × #feature × #iterations)                  │
│                                                             │
│  GOSS 方法:                                                  │
│  - 采样: O(#data × log(#data))                              │
│  - 每次分裂: O((a+b) × #data × #feature)                    │
│  - 总计: O((a+b) × #data × #feature × #iterations)          │
│                                                             │
│  加速比: 1 / (a + b)                                        │
│  例如: a=0.1, b=0.1 → 加速 5 倍                             │
│                                                             │
└─────────────────────────────────────────────────────────────┘

精度影响 #

python
def compare_goss_accuracy(X, y, top_rates=[0.1, 0.2, 0.3],
                         other_rates=[0.1, 0.2, 0.3]):
    """Benchmark GOSS trees against a plain sklearn tree on held-out MSE."""
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # First-order residuals with unit hessians (squared-error loss).
    gradients = y_train - np.mean(y_train)
    hessians = np.ones(len(y_train))

    records = []

    # Reference model: an ordinary full-data decision tree.
    reference = DecisionTreeRegressor(max_depth=5)
    reference.fit(X_train, y_train)
    records.append({
        'method': 'Baseline',
        'mse': np.mean((y_test - reference.predict(X_test)) ** 2),
    })

    # Sweep the GOSS (a, b) grid and record held-out error for each setting.
    for top_rate in top_rates:
        for other_rate in other_rates:
            model = GOSSTree(
                max_depth=5,
                top_rate=top_rate,
                other_rate=other_rate
            )
            model.fit(X_train, gradients, hessians)
            records.append({
                'method': f'GOSS({top_rate},{other_rate})',
                'mse': np.mean((y_test - model.predict(X_test)) ** 2),
            })

    import pandas as pd
    results_df = pd.DataFrame(records)
    print(results_df)

X = np.random.randn(1000, 10)
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(1000) * 0.1
compare_goss_accuracy(X, y)

参数调优 #

top_rate 参数 #

python
def tune_top_rate(X, y):
    """Sweep ``top_rate`` and plot training MSE and wall-clock fit time.

    Args:
        X: feature matrix of shape (n_samples, n_features).
        y: regression target vector.
    """
    import time  # BUG FIX: `time` was used below but never imported in this file

    top_rates = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
    mses = []
    times = []

    # Squared-loss gradients/hessians for a constant baseline prediction.
    gradients = y - np.mean(y)
    hessians = np.ones(len(y))

    for top_rate in top_rates:
        start = time.time()
        tree = GOSSTree(max_depth=5, top_rate=top_rate, other_rate=0.1)
        tree.fit(X, gradients, hessians)
        elapsed = time.time() - start

        # NOTE(review): MSE is measured on the training data itself, so this
        # reflects fit quality rather than generalization.
        y_pred = tree.predict(X)
        mse = np.mean((y - y_pred) ** 2)

        mses.append(mse)
        times.append(elapsed)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    axes[0].plot(top_rates, mses, 'o-')
    axes[0].set_xlabel('top_rate')
    axes[0].set_ylabel('MSE')
    axes[0].set_title('MSE vs top_rate')

    axes[1].plot(top_rates, times, 'o-')
    axes[1].set_xlabel('top_rate')
    axes[1].set_ylabel('训练时间 (s)')
    axes[1].set_title('训练时间 vs top_rate')

    plt.tight_layout()
    plt.show()

GOSS 的优缺点 #

优点 #

text
✅ 减少计算量
   - 采样减少样本数量
   - 加速训练过程

✅ 保持精度
   - 保留重要样本
   - 权重补偿采样

✅ 自适应采样
   - 根据梯度动态调整
   - 关注难样本

✅ 理论保证
   - 有理论误差分析
   - 保证精度损失可控

缺点 #

text
⚠️ 参数调优
   - top_rate 和 other_rate 需要调优
   - 不同数据集最优参数不同

⚠️ 随机性
   - 随机采样引入随机性
   - 结果可能不稳定

⚠️ 小数据集效果有限
   - 数据量小时优势不明显
   - 可能降低精度

下一步 #

现在你已经理解了 GOSS 算法,接下来学习 EFB 算法,了解 LightGBM 如何通过特征捆绑进一步优化!

最后更新:2026-04-04