决策树 #
概述 #
决策树是一种基于树形结构进行决策的算法,通过一系列规则对数据进行分类或回归。
决策树结构 #
text
根节点
│
┌─────────┴─────────┐
│ │
内部节点 内部节点
│ │
┌─────┴─────┐ ┌─────┴─────┐
│ │ │ │
叶节点 叶节点 叶节点 叶节点
(类别A) (类别B) (类别A) (类别B)
核心概念 #
| 概念 | 描述 |
|---|---|
| 根节点 | 树的起始点,包含所有样本 |
| 内部节点 | 决策节点,进行特征判断 |
| 叶节点 | 终端节点,输出预测结果 |
| 分支 | 连接节点的路径 |
分类决策树 #
基本使用 #
python
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris data and hold out a test partition.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a fully grown tree and score it on the held-out samples.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"准确率: {acc:.4f}")
重要参数 #
python
# Commonly tuned constructor parameters — all act as pre-pruning controls.
clf = DecisionTreeClassifier(
    criterion='gini',        # impurity measure used to pick splits
    max_depth=5,             # hard cap on tree depth
    min_samples_split=10,    # a node needs this many samples to be split
    min_samples_leaf=5,      # every leaf must keep at least this many samples
    max_features='sqrt',     # subset of features examined at each split
    random_state=42          # reproducible feature shuffling / tie-breaking
)
参数说明 #
| 参数 | 描述 | 默认值 |
|---|---|---|
| criterion | 分裂标准 | 'gini' |
| max_depth | 最大深度 | None |
| min_samples_split | 分裂最小样本数 | 2 |
| min_samples_leaf | 叶节点最小样本数 | 1 |
| max_features | 最大特征数 | None |
| min_impurity_decrease | 最小不纯度减少 | 0.0 |
分裂标准 #
python
# The three supported impurity criteria for classification trees.
clf_gini = DecisionTreeClassifier(criterion='gini')          # Gini impurity (default)
clf_entropy = DecisionTreeClassifier(criterion='entropy')    # information gain
clf_log_loss = DecisionTreeClassifier(criterion='log_loss')  # same split behavior as entropy in sklearn
回归决策树 #
基本使用 #
python
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score

# Synthetic regression problem: 1000 samples, 10 features, additive noise.
X, y = make_regression(n_samples=1000, n_features=10, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# A depth-limited tree keeps the piecewise-constant fit from overfitting.
reg = DecisionTreeRegressor(max_depth=5, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.4f}")
print(f"R²: {r2:.4f}")
参数说明 #
python
# Regression trees share the same pre-pruning knobs; only `criterion` differs.
reg = DecisionTreeRegressor(
    criterion='squared_error',  # MSE-based split quality (default)
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)
特征重要性 #
获取特征重要性 #
python
import numpy as np

# Rank features by the fitted tree's impurity-based importances.
clf.fit(X_train, y_train)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, most important first

print("特征排名:")
# enumerate() over the sorted indices replaces the index-based
# `for f in range(X.shape[1])` loop; printed output is identical.
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. 特征 {idx} ({importances[idx]:.4f})")
可视化特征重要性 #
python
import matplotlib.pyplot as plt

# Bar chart of importances, sorted and labelled with the iris feature names.
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), [iris.feature_names[i] for i in indices], rotation=45)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()  # was missing: without it a plain script renders nothing
决策树可视化 #
使用 plot_tree #
python
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Render the fitted tree with matplotlib; a large figure keeps nodes readable.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,     # color nodes by majority class
    rounded=True,    # rounded node boxes
    fontsize=10
)
plt.show()
使用 export_text #
python
# Plain-text dump of the decision rules — handy for logs and quick inspection.
from sklearn.tree import export_text
tree_text = export_text(clf, feature_names=iris.feature_names)
print(tree_text)
使用 graphviz #
python
# Graphviz export (requires the `graphviz` Python package and system binaries).
from sklearn.tree import export_graphviz
import graphviz

dot_data = export_graphviz(
    clf,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True
)
graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # writes the rendered tree to disk
剪枝技术 #
预剪枝 #
通过参数限制树的生长:
python
# Pre-pruning: stop the tree from growing instead of cutting it back later.
clf = DecisionTreeClassifier(
    max_depth=5,                 # depth cap
    min_samples_split=10,        # node must hold >= 10 samples to split
    min_samples_leaf=5,          # leaves keep >= 5 samples
    max_leaf_nodes=20,           # global budget of leaves
    min_impurity_decrease=0.01   # split must improve impurity by this much
)
后剪枝(代价复杂度剪枝) #
python
# Cost-complexity (post-)pruning: fit one tree per candidate alpha and
# compare train/test accuracy to pick the best trade-off.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas

clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]

plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas, train_scores, marker='o', label='Train')
plt.plot(ccp_alphas, test_scores, marker='s', label='Test')
plt.xlabel('Alpha')
plt.ylabel('Accuracy')
plt.legend()
plt.show()  # was missing: display the accuracy-vs-alpha curves
选择最佳 alpha #
python
# Refit a single tree at the alpha that maximised test accuracy above.
# NOTE(review): selecting alpha on the test set leaks it into model choice;
# prefer a separate validation split or cross-validation.
best_idx = np.argmax(test_scores)
best_alpha = ccp_alphas[best_idx]
clf_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=best_alpha)
clf_pruned.fit(X_train, y_train)
决策边界 #
可视化决策边界 #
python
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# 2-D toy problem so the decision regions can be drawn directly.
X, y = make_classification(
    n_samples=200, n_features=2, n_redundant=0, n_informative=2,
    n_clusters_per_class=1, random_state=42
)

# random_state added for a reproducible figure (consistent with other snippets).
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X, y)

# Evaluate the tree on a dense grid covering the data (with a 1-unit margin).
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                     np.arange(y_min, y_max, 0.01))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Axis-aligned rectangles are characteristic of tree decision boundaries.
plt.contourf(xx, yy, Z, alpha=0.4, cmap=ListedColormap(['red', 'blue']))
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=ListedColormap(['red', 'blue']))
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Decision Tree Decision Boundary')
plt.show()  # was missing: render the figure
处理缺失值 #
使用缺失值支持 #
python
import numpy as np

# Dense NaN entries are handled natively by DecisionTreeClassifier
# (requires scikit-learn >= 1.3 with the default 'best' splitter).
X = np.array([[1, 2], [np.nan, 3], [4, 5], [6, np.nan]])
y = np.array([0, 1, 0, 1])
clf = DecisionTreeClassifier()
clf.fit(X, y)
决策树优缺点 #
优点 #
| 优点 | 描述 |
|---|---|
| 易于理解 | 可视化后直观易懂 |
| 无需特征缩放 | 对特征尺度不敏感 |
| 处理混合类型 | 同时处理数值和类别特征 |
| 计算效率高 | 训练和预测速度快 |
| 处理缺失值 | 原生支持缺失值 |
缺点 #
| 缺点 | 描述 |
|---|---|
| 容易过拟合 | 需要剪枝或限制深度 |
| 不稳定性 | 数据小变化导致树结构大变化 |
| 偏向性 | 偏向于取值较多的特征 |
| XOR 问题 | 难以学习 XOR 关系 |
与其他模型对比 #
python
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Fit several baseline classifiers on the same split and compare accuracy.
models = {
    'Decision Tree': DecisionTreeClassifier(max_depth=5),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC()
}
for name, model in models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)  # mean accuracy on the test set
    print(f"{name}: {score:.4f}")
超参数调优 #
GridSearchCV #
python
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the pre-pruning grid (5-fold CV, accuracy scoring).
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_train, y_train)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳分数: {grid_search.best_score_:.4f}")
实战示例 #
泰坦尼克号生存预测 #
python
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Titanic survival prediction end-to-end: impute, encode, fit, inspect.
df = pd.read_csv('titanic.csv')

# Simple imputation: median age, most frequent port of embarkation.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Encode categorical columns as integers. Reusing one LabelEncoder is fine
# here because fit_transform refits it for each column.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[features]
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)
print(f"准确率: {clf.score(X_test, y_test):.4f}")

# Rank the engineered features by the tree's impurity-based importance.
importance_df = pd.DataFrame({
    'feature': features,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)
print(importance_df)
最佳实践 #
1. 控制过拟合 #
python
# Fight overfitting first with a depth cap and minimum-sample constraints.
clf = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=10,
    min_samples_split=20
)
2. 使用集成方法 #
python
# A random forest averages many decorrelated trees and is usually more stable.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
3. 特征选择 #
python
# Keep only the features whose importance exceeds a small threshold.
clf.fit(X_train, y_train)
important_features = np.where(clf.feature_importances_ > 0.01)[0]
# NOTE(review): this slicing assumes X is a NumPy array; a DataFrame would
# need X.iloc[:, important_features] instead — confirm against the caller.
X_selected = X[:, important_features]
4. 平衡类别 #
python
# Reweight classes inversely to their frequency to offset class imbalance.
clf = DecisionTreeClassifier(class_weight='balanced')
下一步 #
掌握决策树后,继续学习 支持向量机 了解另一种强大的分类算法!
最后更新:2026-04-04