决策树基础 #
什么是决策树? #
决策树是一种基本的分类与回归方法,它通过一系列规则对数据进行划分,最终形成树状结构。
基本概念 #
text
┌─────────────────────────────────────────────────────────────┐
│ 决策树结构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 根节点 │
│ │ │
│ ┌───────────┴───────────┐ │
│ │ │ │
│ 内部节点 内部节点 │
│ │ │ │
│ ┌───────┴───────┐ ┌───────┴───────┐ │
│ │ │ │ │ │
│ 叶子节点 叶子节点 叶子节点 叶子节点 │
│ (类别A) (类别B) (类别A) (类别B) │
│ │
└─────────────────────────────────────────────────────────────┘
决策树示例 #
python
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target
# Train a depth-limited decision tree
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X, y)
# Visualize the fitted tree
plt.figure(figsize=(12, 8))
plot_tree(clf, feature_names=iris.feature_names,
          class_names=iris.target_names, filled=True)
plt.show()
CART 树 #
XGBoost 使用 CART(Classification And Regression Tree)作为基学习器。
CART 分类树 #
python
import numpy as np
class CARTClassifier:
    """CART classification tree using Gini impurity and binary splits.

    Nodes are plain dicts: leaves are {'leaf': True, 'class': c};
    internal nodes carry 'feature', 'threshold', 'left', 'right'.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        self.max_depth = max_depth                  # maximum recursion depth
        self.min_samples_split = min_samples_split  # minimum samples to attempt a split
        self.tree = None                            # root node, set by fit()

    def gini(self, y):
        """Gini impurity: 1 - sum_k p_k^2 over class probabilities."""
        _, counts = np.unique(y, return_counts=True)
        prob = counts / len(y)
        return 1 - np.sum(prob ** 2)

    def gini_gain(self, y, left_mask):
        """Impurity decrease from splitting y by the boolean left_mask."""
        n = len(y)
        n_left = np.sum(left_mask)
        n_right = n - n_left
        if n_left == 0 or n_right == 0:
            return 0  # degenerate split: no gain
        gini_parent = self.gini(y)
        gini_left = self.gini(y[left_mask])
        gini_right = self.gini(y[~left_mask])
        return gini_parent - (n_left / n * gini_left + n_right / n * gini_right)

    def find_best_split(self, X, y):
        """Exhaustively search all (feature, threshold) pairs.

        Returns (feature, threshold, gain); feature/threshold are None
        when no split yields a positive gain.
        """
        best_gain = 0
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            for threshold in np.unique(X[:, feature]):
                left_mask = X[:, feature] <= threshold
                gain = self.gini_gain(y, left_mask)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold, best_gain

    def _majority_leaf(self, y):
        """Leaf node predicting the most frequent class in y."""
        values, counts = np.unique(y, return_counts=True)
        return {'leaf': True, 'class': values[np.argmax(counts)]}

    def fit(self, X, y, depth=0):
        """Recursively build the tree and return its root node.

        Bug fix vs. the original: the finished tree is now also stored
        on self.tree (at depth 0) so that predict() works.
        """
        # Stopping conditions: depth limit or too few samples
        if depth >= self.max_depth or len(y) < self.min_samples_split:
            node = self._majority_leaf(y)
        else:
            feature, threshold, gain = self.find_best_split(X, y)
            if gain == 0:
                node = self._majority_leaf(y)  # no useful split found
            else:
                left_mask = X[:, feature] <= threshold
                node = {
                    'leaf': False,
                    'feature': feature,
                    'threshold': threshold,
                    'left': self.fit(X[left_mask], y[left_mask], depth + 1),
                    'right': self.fit(X[~left_mask], y[~left_mask], depth + 1),
                }
        if depth == 0:
            self.tree = node  # remember the root for predict()
        return node

    def predict_single(self, x, tree):
        """Route a single sample down the tree to a leaf prediction."""
        if tree['leaf']:
            return tree['class']
        if x[tree['feature']] <= tree['threshold']:
            return self.predict_single(x, tree['left'])
        return self.predict_single(x, tree['right'])

    def predict(self, X):
        """Predict a class for each row of X (requires fit() first)."""
        return np.array([self.predict_single(x, self.tree) for x in X])
CART 回归树 #
python
class CARTRegressor:
    """CART regression tree minimizing mean squared error.

    Leaves are {'leaf': True, 'value': v}; internal nodes carry
    'feature', 'threshold', 'left', 'right'.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        self.max_depth = max_depth                  # maximum recursion depth
        self.min_samples_split = min_samples_split  # minimum samples to attempt a split
        self.tree = None                            # root node, set by fit()

    def mse(self, y):
        """Mean squared deviation of y from its mean."""
        return np.mean((y - np.mean(y)) ** 2)

    def mse_reduction(self, y, left_mask):
        """MSE decrease from splitting y by the boolean left_mask."""
        n = len(y)
        n_left = np.sum(left_mask)
        n_right = n - n_left
        if n_left == 0 or n_right == 0:
            return 0  # degenerate split: no reduction
        mse_parent = self.mse(y)
        mse_left = self.mse(y[left_mask])
        mse_right = self.mse(y[~left_mask])
        return mse_parent - (n_left / n * mse_left + n_right / n * mse_right)

    def find_best_split(self, X, y):
        """Search all (feature, threshold) pairs for maximal MSE reduction.

        Returns (feature, threshold, reduction); feature/threshold are
        None when no split reduces the MSE.
        """
        best_reduction = 0
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            for threshold in np.unique(X[:, feature]):
                left_mask = X[:, feature] <= threshold
                reduction = self.mse_reduction(y, left_mask)
                if reduction > best_reduction:
                    best_reduction = reduction
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold, best_reduction

    def fit(self, X, y, depth=0):
        """Recursively build the tree and return its root node.

        Bug fix vs. the original: the finished tree is now also stored
        on self.tree (at depth 0) so that predict() works.
        """
        # Stopping conditions: depth limit or too few samples
        if depth >= self.max_depth or len(y) < self.min_samples_split:
            node = {'leaf': True, 'value': np.mean(y)}
        else:
            feature, threshold, reduction = self.find_best_split(X, y)
            if reduction == 0:
                node = {'leaf': True, 'value': np.mean(y)}  # no useful split
            else:
                left_mask = X[:, feature] <= threshold
                node = {
                    'leaf': False,
                    'feature': feature,
                    'threshold': threshold,
                    'left': self.fit(X[left_mask], y[left_mask], depth + 1),
                    'right': self.fit(X[~left_mask], y[~left_mask], depth + 1),
                }
        if depth == 0:
            self.tree = node  # remember the root for predict()
        return node

    def predict_single(self, x, tree):
        """Route one sample down the tree to its leaf value."""
        if tree['leaf']:
            return tree['value']
        if x[tree['feature']] <= tree['threshold']:
            return self.predict_single(x, tree['left'])
        return self.predict_single(x, tree['right'])

    def predict(self, X):
        """Predict a value for each row of X (requires fit() first)."""
        return np.array([self.predict_single(x, self.tree) for x in X])
分裂准则 #
分类树准则 #
1. 基尼系数(Gini Index) #
text
Gini(D) = 1 - Σ pₖ²
其中 pₖ 是第 k 类的概率
python
def gini_index(y):
    """Gini impurity: 1 minus the sum of squared class probabilities."""
    _, class_counts = np.unique(y, return_counts=True)
    p = class_counts / len(y)
    return 1 - (p ** 2).sum()
# Example: impurity of pure vs. mixed label sets
y1 = np.array([0, 0, 0, 0])  # pure (single class)
y2 = np.array([0, 0, 1, 1])  # evenly mixed (two classes)
print(f"纯净数据基尼系数: {gini_index(y1):.4f}")  # 0.0
print(f"混合数据基尼系数: {gini_index(y2):.4f}")  # 0.5
2. 信息增益(Information Gain) #
text
Entropy(D) = -Σ pₖ log₂ pₖ
信息增益 = Entropy(D) - Σ |Dᵥ|/|D| × Entropy(Dᵥ)
python
def entropy(y):
    """Shannon entropy (in bits) of the class distribution in y."""
    _, class_counts = np.unique(y, return_counts=True)
    p = class_counts / len(y)
    # small epsilon guards against log2(0)
    return -np.sum(p * np.log2(p + 1e-10))

def information_gain(y, left_mask):
    """Entropy reduction achieved by splitting y with boolean left_mask."""
    total = len(y)
    size_left = np.sum(left_mask)
    size_right = total - size_left
    if size_left == 0 or size_right == 0:
        return 0  # one side empty: the split is vacuous
    weighted_children = (size_left / total) * entropy(y[left_mask]) \
        + (size_right / total) * entropy(y[~left_mask])
    return entropy(y) - weighted_children
回归树准则 #
1. 均方误差(MSE) #
text
MSE = 1/n Σ (yᵢ - ȳ)²
python
def mse(y):
    """Mean squared deviation of y from its mean."""
    centered = y - np.mean(y)
    return np.mean(centered ** 2)
2. 平均绝对误差(MAE) #
text
MAE = 1/n Σ |yᵢ - median(y)|
python
def mae(y):
    """Mean absolute deviation of y from its median."""
    deviations = np.abs(y - np.median(y))
    return np.mean(deviations)
剪枝策略 #
预剪枝 #
在构建树的过程中限制树的生长:
python
from sklearn.tree import DecisionTreeClassifier
# Pre-pruning: constrain tree growth while the tree is being built
clf = DecisionTreeClassifier(
    max_depth=5,                # maximum tree depth
    min_samples_split=10,       # minimum samples required to split a node
    min_samples_leaf=5,         # minimum samples required at a leaf
    max_leaf_nodes=20,          # maximum number of leaf nodes
    min_impurity_decrease=0.01  # minimum impurity decrease to allow a split
)
后剪枝 #
构建完整树后进行剪枝:
python
from sklearn.tree import DecisionTreeClassifier
# Post-pruning: cost-complexity pruning via ccp_alpha.
# NOTE(review): X_train/y_train/X_test/y_test are assumed to be defined
# earlier in the surrounding context — confirm before running.
clf = DecisionTreeClassifier(ccp_alpha=0.01)
# Compute the pruning path (candidate effective alphas)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
# Fit one tree per candidate alpha
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
# Compare train/test accuracy across the different alphas
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
XGBoost 中的树 #
XGBoost 树的特点 #
text
┌─────────────────────────────────────────────────────────────┐
│ XGBoost 树的特点 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 1. CART 回归树 │
│ - 分类和回归都使用回归树 │
│ - 叶子节点输出连续值 │
│ │
│ 2. 正则化 │
│ - 叶子节点数惩罚 │
│ - 叶子权重 L2 正则化 │
│ │
│ 3. 最优权重计算 │
│ - wⱼ* = -Gⱼ/(Hⱼ + λ) │
│ │
│ 4. 稀疏感知 │
│ - 自动处理缺失值 │
│ - 学习最优分裂方向 │
│ │
└─────────────────────────────────────────────────────────────┘
XGBoost 树的构建 #
python
import xgboost as xgb
import numpy as np
# Create a random binary-classification dataset
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
dtrain = xgb.DMatrix(X, label=y)
# Training parameters
params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'eta': 0.1,
    'tree_method': 'exact'  # exact (non-approximate) split finding
}
# Train the boosted model
model = xgb.train(params, dtrain, num_boost_round=10)
# Inspect the text dump of the first two trees
trees_dump = model.get_dump()
for i, tree in enumerate(trees_dump[:2]):
    print(f"Tree {i}:")
    print(tree)
    print()
树可视化 #
python
import matplotlib.pyplot as plt
# Draw the first boosted tree (left-to-right layout)
plt.figure(figsize=(20, 10))
xgb.plot_tree(model, num_trees=0, rankdir='LR')
plt.show()
# Draw feature importance, measured by split gain
plt.figure(figsize=(10, 6))
xgb.plot_importance(model, importance_type='gain')
plt.show()
树的深度与复杂度 #
深度影响 #
python
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss  # fix: log_loss was used below but never imported
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Create a synthetic binary-classification dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Sweep over maximum tree depths, recording train/test log loss
depths = [1, 2, 3, 5, 7, 10]
train_scores = []
test_scores = []
for depth in depths:
    params = {
        'objective': 'binary:logistic',
        'max_depth': depth,
        'eta': 0.1,
        'eval_metric': 'logloss'
    }
    model = xgb.train(params, dtrain, num_boost_round=100)
    train_pred = model.predict(dtrain)
    test_pred = model.predict(dtest)
    train_scores.append(log_loss(y_train, train_pred))
    test_scores.append(log_loss(y_test, test_pred))

# Plot depth vs. log loss for the train and test sets
plt.figure(figsize=(10, 6))
plt.plot(depths, train_scores, 'o-', label='Train')
plt.plot(depths, test_scores, 'o-', label='Test')
plt.xlabel('Max Depth')
plt.ylabel('Log Loss')
plt.legend()
plt.grid(True)
plt.show()
复杂度控制 #
python
# Parameters that control tree complexity
params = {
    'max_depth': 6,             # maximum tree depth
    'min_child_weight': 1,      # minimum leaf weight
    'min_split_loss': 0,        # minimum split gain (gamma)
    'max_leaves': 0,            # maximum number of leaves (0 = unlimited)
    'max_bin': 256,             # maximum number of bins
    'grow_policy': 'depthwise'  # tree growth strategy
}
树的生长策略 #
Level-wise(层级生长) #
text
┌─────────────────────────────────────────────────────────────┐
│ Level-wise 生长 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 第1层: [根节点] │
│ / \ │
│ 第2层: [节点1] [节点2] │
│ / \ / \ │
│ 第3层: [叶1] [叶2] [叶3] [叶4] │
│ │
│ 特点: │
│ - 逐层分裂 │
│ - 每层所有节点都尝试分裂 │
│ - XGBoost 默认策略 │
│ │
└─────────────────────────────────────────────────────────────┘
Leaf-wise(叶子生长) #
text
┌─────────────────────────────────────────────────────────────┐
│ Leaf-wise 生长 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 第1步: [根节点] │
│ / \ │
│ 第2步: [节点1] [叶2] │
│ / \ │
│ 第3步: [叶1] [节点3] │
│ / \ │
│ 第4步: [叶3] [叶4] │
│ │
│ 特点: │
│ - 选择增益最大的叶子分裂 │
│ - 可能产生不对称树 │
│ - LightGBM 默认策略 │
│ │
└─────────────────────────────────────────────────────────────┘
python
# XGBoost 中设置生长策略
params = {
'grow_policy': 'depthwise', # 或 'lossguide'
'max_depth': 6, # depthwise 时使用
'max_leaves': 32 # lossguide 时使用
}
下一步 #
现在你已经理解了决策树基础,接下来学习 目标函数,了解 XGBoost 如何优化模型!
最后更新:2026-04-04