特征选择 #
概述 #
特征选择是从原始特征中选择最相关特征子集的过程,可以提高模型性能并减少计算成本。
为什么需要特征选择? #
| 原因 | 描述 |
|---|---|
| 减少过拟合 | 去除噪声特征 |
| 提高精度 | 保留相关信息 |
| 加速训练 | 减少特征数量 |
| 可解释性 | 简化模型理解 |
特征选择方法 #
| 方法 | 特点 | 代表算法 |
|---|---|---|
| 过滤法 | 独立于模型 | 方差阈值、相关系数 |
| 包装法 | 基于模型性能 | RFE、前向选择 |
| 嵌入法 | 训练时选择 | Lasso、树模型 |
过滤法 #
方差阈值 #
python
# Filter method: drop features whose variance is at or below a threshold —
# near-constant columns carry little information.
from sklearn.feature_selection import VarianceThreshold
import numpy as np
# 6 samples x 3 binary features; column 0 is almost always 0 (low variance).
X = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]])
selector = VarianceThreshold(threshold=0.2)
X_selected = selector.fit_transform(X)
print(f"原始特征数: {X.shape[1]}")
print(f"选择后特征数: {X_selected.shape[1]}")
print(f"保留的特征: {selector.get_support(indices=True)}")  # indices of kept columns
单变量特征选择 #
SelectKBest #
python
# Univariate selection: keep the k features with the highest ANOVA F-score.
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
selector = SelectKBest(score_func=f_classif, k=2)  # keep the 2 best-scoring features
X_selected = selector.fit_transform(X, y)
print(f"特征分数: {selector.scores_}")  # one F-score per original feature
print(f"选择的特征: {selector.get_support(indices=True)}")
SelectPercentile #
python
# Keep a top percentage of features rather than a fixed count.
from sklearn.feature_selection import SelectPercentile
selector = SelectPercentile(score_func=f_classif, percentile=50)  # f_classif / X / y from the example above
X_selected = selector.fit_transform(X, y)
print(f"选择前 50% 的特征")
评分函数 #
| 函数 | 适用场景 |
|---|---|
| `f_classif` | 分类问题(ANOVA F值) |
| `f_regression` | 回归问题 |
| `chi2` | 非负特征分类 |
| `mutual_info_classif` | 分类问题(互信息) |
| `mutual_info_regression` | 回归问题(互信息) |
python
# Alternative scoring functions: chi2 (non-negative features only) and
# mutual information (also captures non-linear dependence).
from sklearn.feature_selection import chi2, mutual_info_classif
selector_chi2 = SelectKBest(score_func=chi2, k=2)
selector_mi = SelectKBest(score_func=mutual_info_classif, k=2)
相关性过滤 #
python
# Visualize pairwise feature/target correlations as a heatmap.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.DataFrame(X, columns=iris.feature_names)  # X / iris from the SelectKBest example
df['target'] = y
corr_matrix = df.corr()  # Pearson correlation by default
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
高相关性特征去除 #
python
def remove_highly_correlated(X, threshold=0.9):
    """Return indices of features that are highly correlated with an
    earlier feature and are therefore candidates for removal.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Numeric feature matrix.
    threshold : float, default 0.9
        Absolute Pearson correlation above which the later of a feature
        pair is flagged.

    Returns
    -------
    list[int]
        Column indices suggested for removal (the later member of each
        too-correlated pair, so one representative is always kept).
    """
    # Pairwise Pearson correlations between columns.
    corr_matrix = np.corrcoef(X.T)
    # Absolute value: a strong NEGATIVE correlation is just as redundant
    # as a positive one. k=1 keeps only the strict upper triangle, so
    # each pair is examined once and the diagonal (corr = 1) is ignored.
    upper = np.triu(np.abs(corr_matrix), k=1)
    to_drop = [i for i in range(X.shape[1]) if any(upper[:, i] > threshold)]
    return to_drop
# Apply to the iris features with a stricter 0.8 cutoff.
to_drop = remove_highly_correlated(X, threshold=0.8)
print(f"建议删除的特征索引: {to_drop}")
包装法 #
递归特征消除(RFE) #
python
# Wrapper method: Recursive Feature Elimination — repeatedly fit the
# estimator and drop the weakest feature until the target count remains.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression(max_iter=1000)
rfe = RFE(estimator, n_features_to_select=2)
X_selected = rfe.fit_transform(X, y)
print(f"特征排名: {rfe.ranking_}")  # 1 = selected; larger rank = eliminated earlier
print(f"选择的特征: {rfe.support_}")  # boolean mask over the original features
RFECV(带交叉验证的 RFE) #
python
# RFECV: RFE with cross-validation to choose the feature count automatically.
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
estimator = LogisticRegression(max_iter=1000)
rfecv = RFECV(
estimator,
step=1,
cv=StratifiedKFold(5),
scoring='accuracy'
)
rfecv.fit(X, y)
print(f"最优特征数: {rfecv.n_features_}")
print(f"选择的特征: {rfecv.support_}")
import matplotlib.pyplot as plt
# Plot CV accuracy as a function of the number of features kept.
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1),
rfecv.cv_results_['mean_test_score'])
plt.xlabel('Number of Features')
plt.ylabel('CV Score')
plt.title('RFECV Results')
SelectFromModel #
python
# SelectFromModel: threshold on the fitted model's feature importances
# ('median' keeps features above the median importance).
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
sfm = SelectFromModel(rf, threshold='median')
X_selected = sfm.fit_transform(X, y)
print(f"选择的特征: {sfm.get_support(indices=True)}")
print(f"特征重要性: {sfm.estimator_.feature_importances_}")  # estimator_ is the fitted copy
序列特征选择 #
python
# Sequential Feature Selection: greedily add (forward) or remove
# (backward) one feature at a time, scoring each candidate set by CV.
from sklearn.feature_selection import SequentialFeatureSelector
sfs = SequentialFeatureSelector(
LogisticRegression(max_iter=1000),
n_features_to_select=2,
direction='forward',
cv=5
)
X_selected = sfs.fit_transform(X, y)
print(f"前向选择结果: {sfs.support_}")
# Backward variant: start from all features and eliminate greedily.
sfs_backward = SequentialFeatureSelector(
LogisticRegression(max_iter=1000),
n_features_to_select=2,
direction='backward',
cv=5
)
嵌入法 #
L1 正则化(Lasso) #
python
# Embedded method: L1 regularization drives uninformative coefficients
# to exactly zero, so selection happens during training.
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # L1 penalties are scale-sensitive; standardize first
lasso = Lasso(alpha=0.1)  # larger alpha -> sparser coefficient vector
# NOTE(review): Lasso is a regression estimator; here it treats the integer
# class labels as continuous targets. The classification analogue would be
# LogisticRegression with penalty='l1' — confirm this is intentional.
lasso.fit(X_scaled, y)
selected = np.where(lasso.coef_ != 0)[0]  # indices of surviving features
print(f"Lasso 选择的特征: {selected}")
print(f"系数: {lasso.coef_}")
树模型特征重要性 #
python
# Tree-based importances: impurity-decrease scores from a random forest,
# plotted in descending order.
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, most important first
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), [iris.feature_names[i] for i in indices], rotation=45)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Random Forest Feature Importance')
排列重要性 #
python
# Permutation importance: measure the drop in held-out score when a
# feature's values are shuffled — a model-agnostic importance estimate.
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf.fit(X_train, y_train)
# n_repeats=10 shuffles each feature ten times for a mean +/- std estimate.
result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)
# Report features from most to least important.
for i in result.importances_mean.argsort()[::-1]:
    print(f"{iris.feature_names[i]:20s}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")
特征选择 Pipeline #
组合使用 #
python
# Wrap scaling, selection and classification in one Pipeline so the
# selector is only ever fitted on the data passed to fit (no leakage in CV).
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
('scaler', StandardScaler()),
('selector', SelectKBest(f_classif, k=2)),
('classifier', LogisticRegression(max_iter=1000))
])
pipe.fit(X, y)
print(f"准确率: {pipe.score(X, y):.4f}")  # score on the training data (optimistic)
与 GridSearch 结合 #
python
# Tune the number of selected features jointly with the classifier's C.
# Parameter keys use the pipeline's '<step>__<param>' naming convention.
from sklearn.model_selection import GridSearchCV
param_grid = {
'selector__k': [1, 2, 3, 4],
'classifier__C': [0.1, 1, 10]
}
grid_search = GridSearchCV(pipe, param_grid, cv=5)
grid_search.fit(X, y)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳分数: {grid_search.best_score_:.4f}")
方法对比 #
性能对比 #
python
# Compare selection strategies by the downstream cross-validated accuracy
# of a classifier trained on each method's selected features.
from sklearn.feature_selection import (
    SelectKBest, f_classif, RFE,
    SelectFromModel, SequentialFeatureSelector
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Each method reduces iris to (roughly) 2 features.
methods = {
    'SelectKBest': SelectKBest(f_classif, k=2),
    'RFE': RFE(LogisticRegression(max_iter=1000), n_features_to_select=2),
    'SelectFromModel': SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                                       threshold='median'),
    'SFS': SequentialFeatureSelector(LogisticRegression(max_iter=1000),
                                     n_features_to_select=2, cv=5),
}

for name, selector in methods.items():
    X_selected = selector.fit_transform(X, y)
    # Score a fresh classifier on the reduced feature set.
    scores = cross_val_score(LogisticRegression(max_iter=1000), X_selected, y, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
特征选择评估 #
学习曲线 #
python
# Learning curve on the selected features: train vs validation score
# as the training-set size grows.
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(
LogisticRegression(max_iter=1000),
X_selected, y,
cv=5
)
plt.plot(train_sizes, train_scores.mean(axis=1), label='Train')
plt.plot(train_sizes, test_scores.mean(axis=1), label='Test')
plt.xlabel('Training Size')
plt.ylabel('Score')
plt.legend()
特征数量 vs 性能 #
python
# Sweep k from 1 to the full feature count and plot CV accuracy vs k,
# to locate the point of diminishing returns.
scores = []
n_features_range = range(1, X.shape[1] + 1)
for n in n_features_range:
    selector = SelectKBest(f_classif, k=n)
    X_selected = selector.fit_transform(X, y)
    score = cross_val_score(LogisticRegression(max_iter=1000), X_selected, y, cv=5).mean()
    scores.append(score)
plt.plot(n_features_range, scores, 'o-')
plt.xlabel('Number of Features')
plt.ylabel('CV Score')
plt.title('Feature Selection Performance')
最佳实践 #
1. 选择合适的方法 #
| 场景 | 推荐方法 |
|---|---|
| 初步筛选 | 方差阈值 |
| 小数据集 | RFE |
| 大数据集 | SelectFromModel |
| 高维数据 | L1 正则化 |
2. 避免数据泄露 #
python
# Fit the selector on the training split only, then transform both splits —
# fitting on the full dataset leaks test-set statistics into the selection.
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)
3. 结合业务知识 #
python
# Intersect statistically selected features with domain-knowledge candidates.
# Assumes feature_names is an array-like indexable by the selector's mask — TODO confirm.
domain_features = ['age', 'income', 'education']
selected_features = [f for f in domain_features if f in feature_names[selector.get_support()]]
4. 迭代优化 #
python
# Tune the importance cutoff: refit SelectFromModel at several thresholds
# and compare the downstream cross-validated accuracy.
for threshold in [0.01, 0.05, 0.1]:
    sfm = SelectFromModel(rf, threshold=threshold)
    X_selected = sfm.fit_transform(X, y)
    score = cross_val_score(LogisticRegression(), X_selected, y, cv=5).mean()
    print(f"Threshold {threshold}: {score:.4f}")
下一步 #
掌握特征选择后,继续学习 特征提取 了解如何从原始数据提取特征!
最后更新:2026-04-04