决策树基础 #
什么是决策树? #
决策树是一种基本的分类与回归方法,它通过一系列规则对数据进行划分,最终形成树状结构。
基本概念 #
text
┌─────────────────────────────────────────────────────────────┐
│ 决策树结构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 根节点 │
│ │ │
│ ┌───────────┴───────────┐ │
│ │ │ │
│ 内部节点 内部节点 │
│ │ │ │
│ ┌───────┴───────┐ ┌───────┴───────┐ │
│ │ │ │ │ │
│ 叶子节点 叶子节点 叶子节点 叶子节点 │
│ (类别A) (类别B) (类别A) (类别B) │
│ │
└─────────────────────────────────────────────────────────────┘
决策树示例 #
python
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target
# Train a depth-limited decision tree
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X, y)
# Visualize the fitted tree
plt.figure(figsize=(12, 8))
plot_tree(clf, feature_names=iris.feature_names,
          class_names=iris.target_names, filled=True)
plt.show()
CART 树 #
XGBoost 使用 CART(Classification And Regression Tree)作为基学习器。
CART 分类树 #
python
import numpy as np
class CARTClassifier:
    """CART classification tree using Gini impurity and binary splits.

    Nodes are plain dicts: leaves are {'leaf': True, 'class': c};
    internal nodes carry 'feature', 'threshold', 'left', 'right'.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        self.max_depth = max_depth                  # maximum recursion depth
        self.min_samples_split = min_samples_split  # minimum samples to attempt a split
        self.tree = None                            # root node, set by fit()

    def gini(self, y):
        """Gini impurity: 1 - sum_k p_k^2 over class probabilities."""
        _, counts = np.unique(y, return_counts=True)
        prob = counts / len(y)
        return 1 - np.sum(prob ** 2)

    def gini_gain(self, y, left_mask):
        """Impurity decrease from splitting y by the boolean left_mask."""
        n = len(y)
        n_left = np.sum(left_mask)
        n_right = n - n_left
        if n_left == 0 or n_right == 0:
            return 0  # degenerate split: no gain
        gini_parent = self.gini(y)
        gini_left = self.gini(y[left_mask])
        gini_right = self.gini(y[~left_mask])
        return gini_parent - (n_left / n * gini_left + n_right / n * gini_right)

    def find_best_split(self, X, y):
        """Exhaustively search all (feature, threshold) pairs.

        Returns (feature, threshold, gain); feature/threshold are None
        when no split yields a positive gain.
        """
        best_gain = 0
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            for threshold in np.unique(X[:, feature]):
                left_mask = X[:, feature] <= threshold
                gain = self.gini_gain(y, left_mask)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold, best_gain

    def _majority_leaf(self, y):
        """Leaf node predicting the most frequent class in y."""
        values, counts = np.unique(y, return_counts=True)
        return {'leaf': True, 'class': values[np.argmax(counts)]}

    def fit(self, X, y, depth=0):
        """Recursively build the tree and return its root node.

        Bug fix vs. the original: the finished tree is now also stored
        on self.tree (at depth 0) so that predict() works.
        """
        # Stopping conditions: depth limit or too few samples
        if depth >= self.max_depth or len(y) < self.min_samples_split:
            node = self._majority_leaf(y)
        else:
            feature, threshold, gain = self.find_best_split(X, y)
            if gain == 0:
                node = self._majority_leaf(y)  # no useful split found
            else:
                left_mask = X[:, feature] <= threshold
                node = {
                    'leaf': False,
                    'feature': feature,
                    'threshold': threshold,
                    'left': self.fit(X[left_mask], y[left_mask], depth + 1),
                    'right': self.fit(X[~left_mask], y[~left_mask], depth + 1),
                }
        if depth == 0:
            self.tree = node  # remember the root for predict()
        return node

    def predict_single(self, x, tree):
        """Route a single sample down the tree to a leaf prediction."""
        if tree['leaf']:
            return tree['class']
        if x[tree['feature']] <= tree['threshold']:
            return self.predict_single(x, tree['left'])
        return self.predict_single(x, tree['right'])

    def predict(self, X):
        """Predict a class for each row of X (requires fit() first)."""
        return np.array([self.predict_single(x, self.tree) for x in X])
CART 回归树 #
python
class CARTRegressor:
    """CART regression tree minimizing mean squared error.

    Leaves are {'leaf': True, 'value': v}; internal nodes carry
    'feature', 'threshold', 'left', 'right'.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        self.max_depth = max_depth                  # maximum recursion depth
        self.min_samples_split = min_samples_split  # minimum samples to attempt a split
        self.tree = None                            # root node, set by fit()

    def mse(self, y):
        """Mean squared deviation of y from its mean."""
        return np.mean((y - np.mean(y)) ** 2)

    def mse_reduction(self, y, left_mask):
        """MSE decrease from splitting y by the boolean left_mask."""
        n = len(y)
        n_left = np.sum(left_mask)
        n_right = n - n_left
        if n_left == 0 or n_right == 0:
            return 0  # degenerate split: no reduction
        mse_parent = self.mse(y)
        mse_left = self.mse(y[left_mask])
        mse_right = self.mse(y[~left_mask])
        return mse_parent - (n_left / n * mse_left + n_right / n * mse_right)

    def find_best_split(self, X, y):
        """Search all (feature, threshold) pairs for maximal MSE reduction.

        Returns (feature, threshold, reduction); feature/threshold are
        None when no split reduces the MSE.
        """
        best_reduction = 0
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            for threshold in np.unique(X[:, feature]):
                left_mask = X[:, feature] <= threshold
                reduction = self.mse_reduction(y, left_mask)
                if reduction > best_reduction:
                    best_reduction = reduction
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold, best_reduction

    def fit(self, X, y, depth=0):
        """Recursively build the tree and return its root node.

        Bug fix vs. the original: the finished tree is now also stored
        on self.tree (at depth 0) so that predict() works.
        """
        # Stopping conditions: depth limit or too few samples
        if depth >= self.max_depth or len(y) < self.min_samples_split:
            node = {'leaf': True, 'value': np.mean(y)}
        else:
            feature, threshold, reduction = self.find_best_split(X, y)
            if reduction == 0:
                node = {'leaf': True, 'value': np.mean(y)}  # no useful split
            else:
                left_mask = X[:, feature] <= threshold
                node = {
                    'leaf': False,
                    'feature': feature,
                    'threshold': threshold,
                    'left': self.fit(X[left_mask], y[left_mask], depth + 1),
                    'right': self.fit(X[~left_mask], y[~left_mask], depth + 1),
                }
        if depth == 0:
            self.tree = node  # remember the root for predict()
        return node

    def predict_single(self, x, tree):
        """Route one sample down the tree to its leaf value."""
        if tree['leaf']:
            return tree['value']
        if x[tree['feature']] <= tree['threshold']:
            return self.predict_single(x, tree['left'])
        return self.predict_single(x, tree['right'])

    def predict(self, X):
        """Predict a value for each row of X (requires fit() first)."""
        return np.array([self.predict_single(x, self.tree) for x in X])
分裂准则 #
分类树准则 #
1. 基尼系数(Gini Index) #
text
Gini(D) = 1 - Σ pₖ²
其中 pₖ 是第 k 类的概率
python
def gini_index(y):
    """Gini impurity: 1 minus the sum of squared class probabilities."""
    _, class_counts = np.unique(y, return_counts=True)
    p = class_counts / len(y)
    return 1 - (p ** 2).sum()
# Example: impurity of pure vs. mixed label sets
y1 = np.array([0, 0, 0, 0])  # pure (single class)
y2 = np.array([0, 0, 1, 1])  # evenly mixed (two classes)
print(f"纯净数据基尼系数: {gini_index(y1):.4f}")  # 0.0
print(f"混合数据基尼系数: {gini_index(y2):.4f}")  # 0.5
2. 信息增益(Information Gain) #
text
Entropy(D) = -Σ pₖ log₂ pₖ
信息增益 = Entropy(D) - Σ |Dᵥ|/|D| × Entropy(Dᵥ)
python
def entropy(y):
    """Shannon entropy (in bits) of the class distribution in y."""
    _, class_counts = np.unique(y, return_counts=True)
    p = class_counts / len(y)
    # small epsilon guards against log2(0)
    return -np.sum(p * np.log2(p + 1e-10))

def information_gain(y, left_mask):
    """Entropy reduction achieved by splitting y with boolean left_mask."""
    total = len(y)
    size_left = np.sum(left_mask)
    size_right = total - size_left
    if size_left == 0 or size_right == 0:
        return 0  # one side empty: the split is vacuous
    weighted_children = (size_left / total) * entropy(y[left_mask]) \
        + (size_right / total) * entropy(y[~left_mask])
    return entropy(y) - weighted_children
回归树准则 #
1. 均方误差(MSE) #
text
MSE = 1/n Σ (yᵢ - ȳ)²
python
def mse(y):
    """Mean squared deviation of y from its mean."""
    centered = y - np.mean(y)
    return np.mean(centered ** 2)
2. 平均绝对误差(MAE) #
text
MAE = 1/n Σ |yᵢ - median(y)|
python
def mae(y):
    """Mean absolute deviation of y from its median."""
    deviations = np.abs(y - np.median(y))
    return np.mean(deviations)
剪枝策略 #
预剪枝 #
在构建树的过程中限制树的生长:
python
from sklearn.tree import DecisionTreeClassifier
# Pre-pruning: constrain tree growth while the tree is being built
clf = DecisionTreeClassifier(
    max_depth=5,                # maximum tree depth
    min_samples_split=10,       # minimum samples required to split a node
    min_samples_leaf=5,         # minimum samples required at a leaf
    max_leaf_nodes=20,          # maximum number of leaf nodes
    min_impurity_decrease=0.01  # minimum impurity decrease to allow a split
)
后剪枝 #
构建完整树后进行剪枝:
python
from sklearn.tree import DecisionTreeClassifier
# Post-pruning: cost-complexity pruning via ccp_alpha.
# NOTE(review): X_train/y_train/X_test/y_test are assumed to be defined
# earlier in the surrounding context — confirm before running.
clf = DecisionTreeClassifier(ccp_alpha=0.01)
# Compute the pruning path (candidate effective alphas)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
# Fit one tree per candidate alpha
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
# Compare train/test accuracy across the different alphas
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
XGBoost 中的树 #
XGBoost 树的特点 #
text
┌─────────────────────────────────────────────────────────────┐
│ XGBoost 树的特点 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 1. CART 回归树 │
│ - 分类和回归都使用回归树 │
│ - 叶子节点输出连续值 │
│ │
│ 2. 正则化 │
│ - 叶子节点数惩罚 │
│ - 叶子权重 L2 正则化 │
│ │
│ 3. 最优权重计算 │
│ - wⱼ* = -Gⱼ/(Hⱼ + λ) │
│ │
│ 4. 稀疏感知 │
│ - 自动处理缺失值 │
│ - 学习最优分裂方向 │
│ │
└─────────────────────────────────────────────────────────────┘
XGBoost 树的构建 #
python
import xgboost as xgb
import numpy as np
# Create a random binary-classification dataset
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
dtrain = xgb.DMatrix(X, label=y)
# Training parameters
params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'eta': 0.1,
    'tree_method': 'exact'  # exact (non-approximate) split finding
}
# Train the boosted model
model = xgb.train(params, dtrain, num_boost_round=10)
# Inspect the text dump of the first two trees
trees_dump = model.get_dump()
for i, tree in enumerate(trees_dump[:2]):
    print(f"Tree {i}:")
    print(tree)
    print()
树可视化 #
python
import matplotlib.pyplot as plt
# Draw the first boosted tree (left-to-right layout)
plt.figure(figsize=(20, 10))
xgb.plot_tree(model, num_trees=0, rankdir='LR')
plt.show()
# Draw feature importance, measured by split gain
plt.figure(figsize=(10, 6))
xgb.plot_importance(model, importance_type='gain')
plt.show()
树的深度与复杂度 #
深度影响 #
python
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss  # fix: log_loss was used below but never imported
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Create a synthetic binary-classification dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Sweep over maximum tree depths, recording train/test log loss
depths = [1, 2, 3, 5, 7, 10]
train_scores = []
test_scores = []
for depth in depths:
    params = {
        'objective': 'binary:logistic',
        'max_depth': depth,
        'eta': 0.1,
        'eval_metric': 'logloss'
    }
    model = xgb.train(params, dtrain, num_boost_round=100)
    train_pred = model.predict(dtrain)
    test_pred = model.predict(dtest)
    train_scores.append(log_loss(y_train, train_pred))
    test_scores.append(log_loss(y_test, test_pred))

# Plot depth vs. log loss for the train and test sets
plt.figure(figsize=(10, 6))
plt.plot(depths, train_scores, 'o-', label='Train')
plt.plot(depths, test_scores, 'o-', label='Test')
plt.xlabel('Max Depth')
plt.ylabel('Log Loss')
plt.legend()
plt.grid(True)
plt.show()
复杂度控制 #
python
# Parameters that control tree complexity
params = {
    'max_depth': 6,             # maximum tree depth
    'min_child_weight': 1,      # minimum leaf weight
    'min_split_loss': 0,        # minimum split gain (gamma)
    'max_leaves': 0,            # maximum number of leaves (0 = unlimited)
    'max_bin': 256,             # maximum number of bins
    'grow_policy': 'depthwise'  # tree growth strategy
}
树的生长策略 #
Level-wise(层级生长) #
text
┌─────────────────────────────────────────────────────────────┐
│ Level-wise 生长 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 第1层: [根节点] │
│ / \ │
│ 第2层: [节点1] [节点2] │
│ / \ / \ │
│ 第3层: [叶1] [叶2] [叶3] [叶4] │
│ │
│ 特点: │
│ - 逐层分裂 │
│ - 每层所有节点都尝试分裂 │
│ - XGBoost 默认策略 │
│ │
└─────────────────────────────────────────────────────────────┘
Leaf-wise(叶子生长) #
text
┌─────────────────────────────────────────────────────────────┐
│ Leaf-wise 生长 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 第1步: [根节点] │
│ / \ │
│ 第2步: [节点1] [叶2] │
│ / \ │
│ 第3步: [叶1] [节点3] │
│ / \ │
│ 第4步: [叶3] [叶4] │
│ │
│ 特点: │
│ - 选择增益最大的叶子分裂 │
│ - 可能产生不对称树 │
│ - LightGBM 默认策略 │
│ │
└─────────────────────────────────────────────────────────────┘
python
# XGBoost 中设置生长策略
params = {
'grow_policy': 'depthwise', # 或 'lossguide'
'max_depth': 6, # depthwise 时使用
'max_leaves': 32 # lossguide 时使用
}
下一步 #
现在你已经理解了决策树基础,接下来学习 目标函数,了解 XGBoost 如何优化模型!
最后更新:2026-04-04