集成方法 #
概述 #
集成学习通过组合多个基学习器来构建更强大的模型,是机器学习中最有效的技术之一。
集成方法分类 #
| 类型 | 思想 | 代表算法 |
|---|---|---|
| Bagging | 并行训练,减少方差 | 随机森林 |
| Boosting | 串行训练,减少偏差 | AdaBoost, GBDT |
| Voting | 投票决策 | VotingClassifier |
| Stacking | 元学习 | StackingClassifier |
核心思想 #
text
单个模型:
模型 → 预测
集成模型:
模型1 ─┐
模型2 ─┼→ 组合策略 → 最终预测
模型3 ─┘
随机森林 #
基本使用 #
python
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X, y = make_classification(
n_samples=1000, n_features=20, n_informative=15,
n_redundant=5, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf = RandomForestClassifier(
n_estimators=100,
max_depth=10,
random_state=42
)
rf.fit(X_train, y_train)
print(f"训练分数: {rf.score(X_train, y_train):.4f}")
print(f"测试分数: {rf.score(X_test, y_test):.4f}")
重要参数 #
| 参数 | 描述 | 默认值 |
|---|---|---|
| n_estimators | 树的数量 | 100 |
| max_depth | 最大深度 | None |
| min_samples_split | 分裂最小样本数 | 2 |
| min_samples_leaf | 叶节点最小样本数 | 1 |
| max_features | 最大特征数 | 'sqrt' |
| bootstrap | 是否自助采样 | True |
| oob_score | 袋外评分 | False |
特征重要性 #
python
import numpy as np
import matplotlib.pyplot as plt
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), indices)
plt.xlabel('Feature Index')
plt.ylabel('Importance')
plt.title('Random Forest Feature Importance')
袋外评分 #
python
rf = RandomForestClassifier(
n_estimators=100,
oob_score=True,
random_state=42
)
rf.fit(X_train, y_train)
print(f"袋外评分: {rf.oob_score_:.4f}")
随机森林回归 #
python
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=20, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)
print(f"R²: {rf_reg.score(X_test, y_test):.4f}")
极端随机树 #
ExtraTreesClassifier #
python
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier(
n_estimators=100,
max_depth=10,
random_state=42
)
et.fit(X_train, y_train)
print(f"测试分数: {et.score(X_test, y_test):.4f}")
与随机森林对比 #
| 特性 | 随机森林 | 极端随机树 |
|---|---|---|
| 分裂方式 | 最优分裂 | 随机分裂 |
| 方差 | 较低 | 更低 |
| 偏差 | 较低 | 较高 |
| 计算速度 | 较慢 | 更快 |
AdaBoost #
基本使用 #
python
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada = AdaBoostClassifier(
estimator=DecisionTreeClassifier(max_depth=1),
n_estimators=50,
learning_rate=1.0,
random_state=42
)
ada.fit(X_train, y_train)
print(f"测试分数: {ada.score(X_test, y_test):.4f}")
参数说明 #
| 参数 | 描述 |
|---|---|
| estimator | 基学习器 |
| n_estimators | 学习器数量 |
| learning_rate | 学习率 |
| algorithm | 算法类型 |
AdaBoost 回归 #
python
from sklearn.ensemble import AdaBoostRegressor
ada_reg = AdaBoostRegressor(
n_estimators=50,
learning_rate=0.1,
random_state=42
)
ada_reg.fit(X_train, y_train)
梯度提升树 #
GradientBoostingClassifier #
python
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=3,
random_state=42
)
gb.fit(X_train, y_train)
print(f"测试分数: {gb.score(X_test, y_test):.4f}")
参数说明 #
| 参数 | 描述 | 默认值 |
|---|---|---|
| n_estimators | 树的数量 | 100 |
| learning_rate | 学习率 | 0.1 |
| max_depth | 最大深度 | 3 |
| min_samples_split | 分裂最小样本数 | 2 |
| subsample | 子采样比例 | 1.0 |
梯度提升回归 #
python
from sklearn.ensemble import GradientBoostingRegressor
gb_reg = GradientBoostingRegressor(
n_estimators=100,
learning_rate=0.1,
max_depth=3,
random_state=42
)
gb_reg.fit(X_train, y_train)
print(f"R²: {gb_reg.score(X_test, y_test):.4f}")
早停策略 #
python
gb = GradientBoostingClassifier(
n_estimators=1000,
learning_rate=0.1,
validation_fraction=0.2,
n_iter_no_change=10,
tol=1e-4,
random_state=42
)
gb.fit(X_train, y_train)
print(f"实际使用树数量: {gb.n_estimators_}")
HistGradientBoosting #
高效梯度提升 #
python
from sklearn.ensemble import HistGradientBoostingClassifier
hgb = HistGradientBoostingClassifier(
max_iter=100,
learning_rate=0.1,
max_depth=None,
random_state=42
)
hgb.fit(X_train, y_train)
print(f"测试分数: {hgb.score(X_test, y_test):.4f}")
优势 #
| 特性 | GradientBoosting | HistGradientBoosting |
|---|---|---|
| 大数据支持 | 慢 | 快 |
| 缺失值处理 | 需预处理 | 原生支持 |
| 内存使用 | 高 | 低 |
投票法 #
硬投票 #
python
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
clf1 = LogisticRegression(max_iter=1000)
clf2 = RandomForestClassifier(n_estimators=50)
clf3 = SVC(probability=True)
voting_clf = VotingClassifier(
estimators=[
('lr', clf1),
('rf', clf2),
('svm', clf3)
],
voting='hard'
)
voting_clf.fit(X_train, y_train)
print(f"测试分数: {voting_clf.score(X_test, y_test):.4f}")
软投票 #
python
voting_clf = VotingClassifier(
estimators=[
('lr', clf1),
('rf', clf2),
('svm', clf3)
],
voting='soft'
)
voting_clf.fit(X_train, y_train)
加权投票 #
python
voting_clf = VotingClassifier(
estimators=[
('lr', clf1),
('rf', clf2),
('svm', clf3)
],
voting='soft',
weights=[1, 2, 1]
)
堆叠法 #
StackingClassifier #
python
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
estimators = [
('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
('svm', SVC(probability=True, random_state=42)),
('knn', KNeighborsClassifier())
]
stacking_clf = StackingClassifier(
estimators=estimators,
final_estimator=LogisticRegression(),
cv=5
)
stacking_clf.fit(X_train, y_train)
print(f"测试分数: {stacking_clf.score(X_test, y_test):.4f}")
StackingRegressor #
python
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
estimators = [
('rf', RandomForestRegressor(n_estimators=50, random_state=42)),
('svr', SVR())
]
stacking_reg = StackingRegressor(
estimators=estimators,
final_estimator=Ridge(),
cv=5
)
stacking_reg.fit(X_train, y_train)
Bagging #
BaggingClassifier #
python
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bagging = BaggingClassifier(
estimator=DecisionTreeClassifier(),
n_estimators=100,
max_samples=0.8,
max_features=0.8,
bootstrap=True,
random_state=42
)
bagging.fit(X_train, y_train)
print(f"测试分数: {bagging.score(X_test, y_test):.4f}")
参数说明 #
| 参数 | 描述 |
|---|---|
| estimator | 基学习器 |
| n_estimators | 学习器数量 |
| max_samples | 样本采样比例 |
| max_features | 特征采样比例 |
| bootstrap | 是否有放回采样 |
模型对比 #
性能对比 #
python
from sklearn.model_selection import cross_val_score
models = {
'Random Forest': RandomForestClassifier(n_estimators=100),
'Extra Trees': ExtraTreesClassifier(n_estimators=100),
'AdaBoost': AdaBoostClassifier(n_estimators=50),
'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
'HistGradientBoosting': HistGradientBoostingClassifier(max_iter=100)
}
for name, model in models.items():
scores = cross_val_score(model, X, y, cv=5)
print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
超参数调优 #
随机森林调优 #
python
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 20, None],
'min_samples_split': [2, 5, 10],
'max_features': ['sqrt', 'log2']
}
grid_search = GridSearchCV(
RandomForestClassifier(random_state=42),
param_grid,
cv=5,
n_jobs=-1
)
grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")
梯度提升调优 #
python
param_grid = {
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
'subsample': [0.8, 1.0]
}
grid_search = GridSearchCV(
GradientBoostingClassifier(random_state=42),
param_grid,
cv=5,
n_jobs=-1
)
实战示例 #
手写数字识别 #
python
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
print(f"准确率: {rf.score(X_test, y_test):.4f}")
房价预测 #
python
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
housing = fetch_california_housing()
X, y = housing.data, housing.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, random_state=42)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
并行计算 #
使用 n_jobs #
python
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
et = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
并行交叉验证 #
python
scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)
最佳实践 #
1. 选择合适的集成方法 #
| 场景 | 推荐方法 |
|---|---|
| 快速原型 | 随机森林 |
| 高精度 | 梯度提升 |
| 大数据 | HistGradientBoosting |
| 模型融合 | 堆叠法 |
2. 控制过拟合 #
python
rf = RandomForestClassifier(
n_estimators=100,
max_depth=10,
min_samples_leaf=5
)
3. 使用早停 #
python
gb = GradientBoostingClassifier(
n_estimators=1000,
n_iter_no_change=10,
validation_fraction=0.1
)
4. 特征重要性分析 #
python
importances = rf.feature_importances_
selected_features = np.where(importances > 0.01)[0]
下一步 #
掌握集成方法后,继续学习 聚类算法 进入无监督学习领域!
最后更新:2026-04-04