核心概念 #
决策树基础 #
什么是决策树? #
决策树是一种基本的分类和回归方法,通过一系列规则对数据进行划分:
text
┌─────────────────────────────────────────────────────────────┐
│ 决策树结构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ [年龄 <= 30?] │
│ / \ │
│ 是 否 │
│ / \ │
│ [收入 <= 50000?] [收入 <= 80000?] │
│ / \ / \ │
│ 是 否 是 否 │
│ \ / \ / │
│ [拒绝] [批准] [审核] [批准] │
│ │
└─────────────────────────────────────────────────────────────┘
决策树的关键概念 #
python
import numpy as np
class SimpleDecisionTree:
    """Minimal CART-style classifier skeleton: Gini impurity + exhaustive best-split search."""

    def __init__(self, max_depth=3):
        # max_depth is stored for a future tree-building step; only the
        # split-search machinery is implemented in this demo class.
        self.max_depth = max_depth
        self.tree = None

    def gini(self, y):
        """Return the Gini impurity of label array *y* (0 = pure, higher = more mixed)."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def information_gain(self, y, left_idx, right_idx):
        """Impurity reduction obtained by partitioning *y* into left/right index sets.

        Returns 0 for a degenerate split (one side empty).
        """
        n = len(y)
        n_left, n_right = len(left_idx), len(right_idx)
        if n_left == 0 or n_right == 0:
            return 0
        gain = (self.gini(y) -
                (n_left / n) * self.gini(y[left_idx]) -
                (n_right / n) * self.gini(y[right_idx]))
        return gain

    def find_best_split(self, X, y):
        """Search every (feature, threshold) pair for the maximum information gain.

        Returns (best_feature, best_threshold, best_gain); feature/threshold are
        None (and gain -inf) when no non-degenerate split exists.
        """
        best_gain = -np.inf
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_idx = np.where(X[:, feature] <= threshold)[0]
                right_idx = np.where(X[:, feature] > threshold)[0]
                # Skip degenerate splits (threshold == column max leaves the right
                # side empty). Without this, a degenerate split's 0 gain could be
                # reported as "best" whenever every real split also has 0 gain.
                # This also matches SimpleGBDT.find_best_split below.
                if len(left_idx) == 0 or len(right_idx) == 0:
                    continue
                gain = self.information_gain(y, left_idx, right_idx)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold, best_gain
# Demo: best first split on a tiny loan-approval-style dataset
# (columns: age, income; labels: 0 = reject, 1 = approve).
X = np.array([[25, 50000], [35, 60000], [45, 90000], [20, 30000]])
y = np.array([0, 1, 1, 0])
tree = SimpleDecisionTree()
feature, threshold, gain = tree.find_best_split(X, y)
for line in (f"最佳分裂特征: {feature}",
             f"最佳分裂阈值: {threshold}",
             f"信息增益: {gain:.4f}"):
    print(line)
梯度提升决策树(GBDT) #
GBDT 基本原理 #
GBDT 通过迭代训练多个决策树,每棵树学习前面所有树的残差:
text
┌─────────────────────────────────────────────────────────────┐
│ GBDT 迭代过程 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 初始预测 F₀(x) = mean(y) │
│ │
│ 第 1 轮: │
│ 残差 r₁ = y - F₀(x) │
│ 训练树 h₁(x) 拟合 r₁ │
│ 更新 F₁(x) = F₀(x) + η·h₁(x) │
│ │
│ 第 2 轮: │
│ 残差 r₂ = y - F₁(x) │
│ 训练树 h₂(x) 拟合 r₂ │
│ 更新 F₂(x) = F₁(x) + η·h₂(x) │
│ │
│ ... │
│ │
│ 第 n 轮: │
│ 最终预测 Fₙ(x) = F₀(x) + Σ η·hᵢ(x) │
│ │
└─────────────────────────────────────────────────────────────┘
GBDT 实现 #
python
class SimpleGBDT:
    """Minimal gradient-boosted regression-tree ensemble (squared-error loss).

    Each boosting round fits a small regression tree to the current residuals
    and adds a learning-rate-scaled copy of its predictions to the running
    estimate F, so F_k(x) = F_0(x) + sum(eta * h_i(x)).
    """

    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators    # number of boosting rounds
        self.learning_rate = learning_rate  # shrinkage (eta) applied to every tree
        self.max_depth = max_depth          # depth cap for each tree
        self.trees = []                     # fitted trees (nested dict nodes)
        self.initial_prediction = None      # F0 = mean(y); set by fit()

    def fit(self, X, y):
        """Fit the ensemble on (X, y); prints training MSE after each round."""
        self.initial_prediction = np.mean(y)
        F = np.full(len(y), self.initial_prediction)
        for i in range(self.n_estimators):
            residuals = y - F  # negative gradient of the squared-error loss
            tree = self.build_tree(X, residuals, depth=0)
            self.trees.append(tree)
            predictions = self.predict_tree(tree, X)
            F += self.learning_rate * predictions
            mse = np.mean((y - F) ** 2)
            print(f"迭代 {i+1}: MSE = {mse:.4f}")

    def build_tree(self, X, y, depth):
        """Recursively grow a regression tree on (X, y); nodes are plain dicts."""
        # Stop on the depth limit or when too few samples remain to split.
        if depth >= self.max_depth or len(y) < 2:
            return {'leaf': True, 'value': np.mean(y)}
        best_feature, best_threshold, best_gain = self.find_best_split(X, y)
        if best_gain <= 0:
            # No split reduces variance: emit a leaf predicting the mean.
            return {'leaf': True, 'value': np.mean(y)}
        left_idx = X[:, best_feature] <= best_threshold
        right_idx = X[:, best_feature] > best_threshold
        return {
            'leaf': False,
            'feature': best_feature,
            'threshold': best_threshold,
            'left': self.build_tree(X[left_idx], y[left_idx], depth + 1),
            'right': self.build_tree(X[right_idx], y[right_idx], depth + 1)
        }

    def find_best_split(self, X, y):
        """Return (feature, threshold, gain) of the variance-minimizing split,
        or (None, None, -inf) when no non-degenerate split exists."""
        best_gain = -np.inf
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_idx = X[:, feature] <= threshold
                right_idx = X[:, feature] > threshold
                if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
                    continue  # degenerate split: one side empty
                gain = self.variance_reduction(y, left_idx, right_idx)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold, best_gain

    def variance_reduction(self, y, left_idx, right_idx):
        """Total sum-of-squares reduction achieved by the boolean partition."""
        total_var = np.var(y) * len(y)
        left_var = np.var(y[left_idx]) * np.sum(left_idx)
        right_var = np.var(y[right_idx]) * np.sum(right_idx)
        return total_var - left_var - right_var

    def predict_tree(self, tree, X):
        """Evaluate a single tree on every row of X."""
        if tree['leaf']:
            return np.full(len(X), tree['value'])
        left_idx = X[:, tree['feature']] <= tree['threshold']
        right_idx = X[:, tree['feature']] > tree['threshold']
        predictions = np.zeros(len(X))
        predictions[left_idx] = self.predict_tree(tree['left'], X[left_idx])
        predictions[right_idx] = self.predict_tree(tree['right'], X[right_idx])
        return predictions

    def predict(self, X):
        """Return ensemble predictions F0 + sum(eta * h_i(X)).

        Raises:
            RuntimeError: if called before fit() — previously this silently
            built an object-dtype array from the None initial prediction and
            failed obscurely downstream.
        """
        if self.initial_prediction is None:
            raise RuntimeError("predict() called before fit()")
        F = np.full(len(X), self.initial_prediction)
        for tree in self.trees:
            F += self.learning_rate * self.predict_tree(tree, X)
        return F
# Demo: noisy linear target driven by the first two of five random features.
X = np.random.randn(100, 5)
noise = np.random.randn(100) * 0.1
y = 2 * X[:, 0] + 3 * X[:, 1] + noise
gbdt = SimpleGBDT(n_estimators=10, learning_rate=0.1, max_depth=3)
gbdt.fit(X, y)
Leaf-wise vs Level-wise #
Level-wise(按层)生长策略 #
传统 GBDT 采用 Level-wise 策略,每层所有节点都分裂:
text
┌─────────────────────────────────────────────────────────────┐
│ Level-wise 生长 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 第 1 层: [根节点] │
│ / \ │
│ 第 2 层: [节点1] [节点2] │
│ / \ / \ │
│ 第 3 层: [叶1] [叶2] [叶3] [叶4] │
│ │
│ 特点: │
│ - 每层所有节点都分裂 │
│ - 树结构平衡 │
│ - 计算量大,效率低 │
│ - 可能分裂增益小的节点 │
│ │
└─────────────────────────────────────────────────────────────┘
Leaf-wise(按叶子)生长策略 #
LightGBM 采用 Leaf-wise 策略,只分裂增益最大的叶子:
text
┌─────────────────────────────────────────────────────────────┐
│ Leaf-wise 生长 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 第 1 步: [根节点] │
│ / \ │
│ 第 2 步: [节点1] [叶1] │
│ / │
│ 第 3 步: [节点2] │
│ / \ │
│ 第 4 步: [叶2] [叶3] │
│ │
│ 特点: │
│ - 只分裂增益最大的叶子 │
│ - 树结构可能不平衡 │
│ - 计算量小,效率高 │
│ - 相同叶子数时精度更高 │
│ │
└─────────────────────────────────────────────────────────────┘
对比示例 #
python
import numpy as np
def simulate_level_wise(n_samples=1000, max_depth=3):
    """Count leaves and splits for a full level-wise tree of the given depth.

    A complete binary tree of depth d has 2**d leaves and 2**d - 1 internal
    splits; n_samples is kept for interface symmetry but unused.
    """
    leaves = 1 << max_depth
    return leaves, leaves - 1
def simulate_leaf_wise(n_samples=1000, num_leaves=8):
    """Count leaves and splits for leaf-wise growth capped at num_leaves.

    Every split adds exactly one leaf, so k leaves always cost k - 1 splits;
    n_samples is kept for interface symmetry but unused.
    """
    return num_leaves, num_leaves - 1
# Head-to-head comparison of split counts at similar model capacity
# (depth-5 level-wise tree vs 31-leaf leaf-wise tree).
level_leaves, level_splits = simulate_level_wise(max_depth=5)
leaf_leaves, leaf_splits = simulate_leaf_wise(num_leaves=31)
report = [
    "Level-wise 策略:",
    f" 叶子节点数: {level_leaves}",
    f" 分裂次数: {level_splits}",
    "\nLeaf-wise 策略:",
    f" 叶子节点数: {leaf_leaves}",
    f" 分裂次数: {leaf_splits}",
    f"\n效率提升: {level_splits / leaf_splits:.2f}x",
]
for line in report:
    print(line)
特征重要性 #
特征重要性类型 #
LightGBM 提供多种特征重要性度量:
python
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
# Train a small binary classifier and compare the two importance measures.
data = load_breast_cancer()
X, y = data.data, data.target
train_data = lgb.Dataset(X, label=y)
params = {'objective': 'binary', 'verbose': -1}
model = lgb.train(params, train_data, num_boost_round=100)
# split = how often a feature is used to split; gain = total loss reduction.
importance_split = model.feature_importance(importance_type='split')
importance_gain = model.feature_importance(importance_type='gain')
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# One loop instead of two copy-pasted plotting segments.
panels = [
    (axes[0], importance_split, 'Split Count', '特征重要性 (Split)'),
    (axes[1], importance_gain, 'Total Gain', '特征重要性 (Gain)'),
]
for ax, importance, xlabel, title in panels:
    sorted_idx = np.argsort(importance)[-10:]  # top-10, ascending for barh
    ax.barh(range(10), importance[sorted_idx])
    ax.set_yticks(range(10))
    ax.set_yticklabels([data.feature_names[i] for i in sorted_idx])
    ax.set_xlabel(xlabel)
    ax.set_title(title)
plt.tight_layout()
plt.show()
重要性类型说明 #
| 类型 | 说明 | 适用场景 |
|---|---|---|
| split | 特征被用于分裂的次数 | 了解特征使用频率 |
| gain | 特征带来的总增益 | 了解特征贡献度 |
特征选择 #
python
import pandas as pd
# Rank all features by total gain and keep the ten strongest.
feature_importance = pd.DataFrame({
    'feature': data.feature_names,
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)
print("Top 10 重要特征:")
print(feature_importance.head(10))
top_features = feature_importance.head(10)['feature'].tolist()
print(f"\n选择的特征: {top_features}")
模型复杂度控制 #
主要参数 #
python
# Core complexity-control parameters (LightGBM defaults shown).
params = {
    'num_leaves': 31,                  # main capacity knob for leaf-wise trees
    'max_depth': -1,                   # -1 = no depth limit
    'min_data_in_leaf': 20,            # minimum samples per leaf (overfitting guard)
    'min_sum_hessian_in_leaf': 1e-3,   # minimum Hessian mass per leaf
    'min_gain_to_split': 0.0,          # minimum gain required to make a split
}
参数说明 #
| 参数 | 说明 | 默认值 | 调优建议 |
|---|---|---|---|
| num_leaves | 叶子节点数 | 31 | 控制模型复杂度的主要参数 |
| max_depth | 树的最大深度 | -1 | 通常不需要设置 |
| min_data_in_leaf | 叶子最少样本数 | 20 | 防止过拟合 |
| min_sum_hessian_in_leaf | 叶子最小 Hessian 和 | 1e-3 | 防止过拟合 |
| min_gain_to_split | 最小分裂增益 | 0.0 | 防止过拟合 |
复杂度与性能权衡 #
python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
# Sweep num_leaves and contrast resubstitution accuracy with 5-fold CV accuracy.
data = load_breast_cancer()
X, y = data.data, data.target
num_leaves_range = [7, 15, 31, 63, 127, 255]
train_scores, test_scores = [], []
for num_leaves in num_leaves_range:
    clf = LGBMClassifier(num_leaves=num_leaves, n_estimators=100, verbose=-1)
    clf.fit(X, y)
    train_scores.append(clf.score(X, y))  # accuracy on the training set itself
    test_scores.append(cross_val_score(clf, X, y, cv=5).mean())
plt.figure(figsize=(10, 6))
plt.plot(num_leaves_range, train_scores, 'o-', label='训练集')
plt.plot(num_leaves_range, test_scores, 's-', label='交叉验证')
plt.xlabel('num_leaves')
plt.ylabel('准确率')
plt.title('模型复杂度与性能')
plt.legend()
plt.xscale('log')
plt.grid(True)
plt.show()
正则化 #
L1 和 L2 正则化 #
python
# Regularization parameters (all default to 0.0 = no regularization).
params = {
    'lambda_l1': 0.0,       # L1 penalty on leaf weights (drives weights to zero)
    'lambda_l2': 0.0,       # L2 penalty on leaf weights (shrinks weights)
    'min_split_gain': 0.0,  # minimum loss reduction required to make a split
}
正则化效果 #
python
from sklearn.model_selection import train_test_split
# Grid-search L1/L2 strength and report held-out accuracy for each pair.
# random_state fixes the split so the printed grid is reproducible run-to-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
lambda_values = [0, 0.01, 0.1, 1, 10]
results = []
for l1 in lambda_values:
    for l2 in lambda_values:
        params = {
            'objective': 'binary',
            'lambda_l1': l1,
            'lambda_l2': l2,
            'verbose': -1
        }
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
        model = lgb.train(params, train_data, num_boost_round=100,
                          valid_sets=[valid_data], callbacks=[lgb.log_evaluation(0)])
        # Threshold predicted probabilities at 0.5 to get hard class labels.
        train_pred = (model.predict(X_train) > 0.5).astype(int)
        test_pred = (model.predict(X_test) > 0.5).astype(int)
        results.append({
            'lambda_l1': l1,
            'lambda_l2': l2,
            'train_acc': (train_pred == y_train).mean(),
            'test_acc': (test_pred == y_test).mean()
        })
results_df = pd.DataFrame(results)
print(results_df.pivot_table(index='lambda_l1', columns='lambda_l2', values='test_acc'))
学习率与迭代次数 #
学习率的作用 #
python
# Compare validation curves across learning rates; slower rates get more rounds.
learning_rates = [0.001, 0.01, 0.05, 0.1, 0.3]
n_estimators_list = [1000, 500, 200, 100, 50]
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for lr, n_est in zip(learning_rates, n_estimators_list):
    params = {
        'objective': 'binary',
        # BUG FIX: both metrics must be requested explicitly. With only the
        # objective set, LightGBM records just its default binary_logloss and
        # evals_result['valid_0']['auc'] below raises KeyError.
        'metric': ['binary_logloss', 'auc'],
        'learning_rate': lr,
        'verbose': -1
    }
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
    evals_result = {}  # filled in-place by the record_evaluation callback
    model = lgb.train(
        params, train_data, num_boost_round=n_est,
        valid_sets=[valid_data],
        callbacks=[lgb.log_evaluation(0), lgb.record_evaluation(evals_result)]
    )
    axes[0].plot(evals_result['valid_0']['binary_logloss'], label=f'lr={lr}')
    axes[1].plot(evals_result['valid_0']['auc'], label=f'lr={lr}')
axes[0].set_xlabel('迭代次数')
axes[0].set_ylabel('Log Loss')
axes[0].set_title('不同学习率的损失曲线')
axes[0].legend()
axes[1].set_xlabel('迭代次数')
axes[1].set_ylabel('AUC')
axes[1].set_title('不同学习率的 AUC 曲线')
axes[1].legend()
plt.tight_layout()
plt.show()
学习率与迭代次数的关系 #
| 学习率 | 推荐迭代次数 | 训练时间 | 模型质量 |
|---|---|---|---|
| 0.001 | 1000+ | 长 | 高 |
| 0.01 | 500-1000 | 中 | 高 |
| 0.05 | 200-500 | 中 | 中 |
| 0.1 | 100-200 | 短 | 中 |
| 0.3 | 50-100 | 短 | 低 |
下一步 #
现在你已经理解了 LightGBM 的核心概念,接下来学习 数据接口,深入了解 LightGBM 的数据处理机制!
最后更新:2026-04-04