特征选择 #

概述 #

特征选择是从原始特征中选择最相关特征子集的过程,可以提高模型性能并减少计算成本。

为什么需要特征选择? #

原因 描述
减少过拟合 去除噪声特征
提高精度 保留相关信息
加速训练 减少特征数量
可解释性 简化模型理解

特征选择方法 #

方法 特点 代表算法
过滤法 独立于模型 方差阈值、相关系数
包装法 基于模型性能 RFE、前向选择
嵌入法 训练时选择 Lasso、树模型

过滤法 #

方差阈值 #

python
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# Toy binary feature matrix: 6 samples x 3 features; column 0 is almost
# constant (a single 1), so it carries little information.
X = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]])

# Unsupervised filter: drop every feature whose variance falls below the
# threshold (no target y is needed).
selector = VarianceThreshold(threshold=0.2)
X_selected = selector.fit_transform(X)

print(f"原始特征数: {X.shape[1]}")
print(f"选择后特征数: {X_selected.shape[1]}")
# get_support(indices=True) returns the column indices that were kept.
print(f"保留的特征: {selector.get_support(indices=True)}")

单变量特征选择 #

SelectKBest #

python
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import load_iris

# Iris dataset: 150 samples, 4 numeric features, 3 classes.
# X and y defined here are reused by the later snippets in this page.
iris = load_iris()
X, y = iris.data, iris.target

# Score each feature against y with the ANOVA F-test (f_classif) and
# keep the k=2 highest-scoring features.
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X, y)

print(f"特征分数: {selector.scores_}")
print(f"选择的特征: {selector.get_support(indices=True)}")

SelectPercentile #

python
from sklearn.feature_selection import SelectPercentile

# Same idea as SelectKBest, but keeps a percentage of the features
# instead of a fixed count (here the top 50% by F-score).
# Uses f_classif and X, y from the SelectKBest snippet above.
selector = SelectPercentile(score_func=f_classif, percentile=50)
X_selected = selector.fit_transform(X, y)

print(f"选择前 50% 的特征")

评分函数 #

函数 适用场景
f_classif 分类问题(ANOVA F值)
f_regression 回归问题
chi2 非负特征分类
mutual_info_classif 分类问题(互信息)
mutual_info_regression 回归问题(互信息)
python
from sklearn.feature_selection import chi2, mutual_info_classif

# chi2 requires non-negative feature values; mutual information also
# captures non-linear feature/target dependencies.
selector_chi2 = SelectKBest(score_func=chi2, k=2)
selector_mi = SelectKBest(score_func=mutual_info_classif, k=2)

相关性过滤 #

python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Assumes X, y and iris come from the SelectKBest snippet above.
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y

# Pairwise Pearson correlations across all columns (features + target).
corr_matrix = df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')

高相关性特征去除 #

python
def remove_highly_correlated(X, threshold=0.9):
    """Return indices of features to drop because they are highly
    correlated (|r| > threshold) with an earlier feature.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; every column must have non-zero variance
        (otherwise np.corrcoef yields NaN for that column).
    threshold : float, default=0.9
        Absolute Pearson correlation above which the later of a pair of
        features is flagged for removal.

    Returns
    -------
    list[int]
        Column indices suggested for removal.
    """
    # Pairwise Pearson correlations between columns (features).
    corr_matrix = np.corrcoef(X.T)
    # Bug fix: take the absolute value BEFORE thresholding — a strong
    # negative correlation is just as redundant as a positive one, and
    # the original code silently ignored it.
    # Keep only the strictly upper triangle (k=1) so each pair is
    # considered once and no feature is compared with itself.
    upper = np.triu(np.abs(corr_matrix), k=1)
    to_drop = [i for i in range(X.shape[1]) if any(upper[:, i] > threshold)]
    return to_drop

to_drop = remove_highly_correlated(X, threshold=0.8)
print(f"建议删除的特征索引: {to_drop}")

包装法 #

递归特征消除(RFE) #

python
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# RFE repeatedly fits the estimator and removes the weakest feature
# (smallest coefficient magnitude) until n_features_to_select remain.
estimator = LogisticRegression(max_iter=1000)
rfe = RFE(estimator, n_features_to_select=2)
X_selected = rfe.fit_transform(X, y)

# ranking_ is 1 for selected features; larger means eliminated earlier.
print(f"特征排名: {rfe.ranking_}")
print(f"选择的特征: {rfe.support_}")

RFECV(带交叉验证的 RFE) #

python
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# RFECV = RFE + cross-validation: the number of features to keep is
# chosen automatically by maximizing the CV score.
estimator = LogisticRegression(max_iter=1000)
rfecv = RFECV(
    estimator,
    step=1,  # remove one feature per elimination round
    cv=StratifiedKFold(5),
    scoring='accuracy'
)
rfecv.fit(X, y)

print(f"最优特征数: {rfecv.n_features_}")
print(f"选择的特征: {rfecv.support_}")

import matplotlib.pyplot as plt
# Plot mean CV accuracy as a function of the number of selected features.
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), 
         rfecv.cv_results_['mean_test_score'])
plt.xlabel('Number of Features')
plt.ylabel('CV Score')
plt.title('RFECV Results')

SelectFromModel(基于模型重要性,亦常归入嵌入法) #

python
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Keep the features whose random-forest importance is at least the
# median importance ('median' keeps roughly half of the features).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
sfm = SelectFromModel(rf, threshold='median')
X_selected = sfm.fit_transform(X, y)

print(f"选择的特征: {sfm.get_support(indices=True)}")
# estimator_ is the forest fitted internally by SelectFromModel.
print(f"特征重要性: {sfm.estimator_.feature_importances_}")

序列特征选择 #

python
from sklearn.feature_selection import SequentialFeatureSelector

# Greedy forward selection: start with no features and repeatedly add
# the one that improves the CV score the most.
sfs = SequentialFeatureSelector(
    LogisticRegression(max_iter=1000),
    n_features_to_select=2,
    direction='forward',
    cv=5
)
X_selected = sfs.fit_transform(X, y)

print(f"前向选择结果: {sfs.support_}")

# Backward variant: start with all features and greedily remove them.
sfs_backward = SequentialFeatureSelector(
    LogisticRegression(max_iter=1000),
    n_features_to_select=2,
    direction='backward',
    cv=5
)

嵌入法 #

L1 正则化(Lasso) #

python
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler

# L1 penalties are scale-sensitive, so standardize the features first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# NOTE(review): Lasso is a regressor; fitting it on the integer class
# labels here only illustrates L1-induced sparsity — it is not a proper
# classification model for iris.
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# L1 drives some coefficients exactly to zero; the features with
# non-zero coefficients are the ones "selected" by the penalty.
selected = np.where(lasso.coef_ != 0)[0]
print(f"Lasso 选择的特征: {selected}")
print(f"系数: {lasso.coef_}")

树模型特征重要性 #

python
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Impurity-based feature importances, ordered from most to least important.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices])
# Label the bars with the feature names in the same sorted order.
plt.xticks(range(X.shape[1]), [iris.feature_names[i] for i in indices], rotation=45)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Random Forest Feature Importance')

排列重要性 #

python
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Refit the forest on the training split only so the importances below
# are measured on genuinely held-out data.
rf.fit(X_train, y_train)

# Permutation importance: shuffle one feature at a time on the test set
# and measure how much the score drops (10 shuffles per feature).
result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)

# Report features from most to least important, with std over repeats.
for i in result.importances_mean.argsort()[::-1]:
    print(f"{iris.feature_names[i]:20s}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")

特征选择 Pipeline #

组合使用 #

python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression

# Scaling, selection and classification in one estimator: when this pipe
# is cross-validated, the selector is re-fit inside each fold, which
# prevents feature-selection leakage.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_classif, k=2)),
    ('classifier', LogisticRegression(max_iter=1000))
])

pipe.fit(X, y)
# NOTE(review): this scores on the training data and thus overestimates
# generalization — use cross_val_score for an honest estimate.
print(f"准确率: {pipe.score(X, y):.4f}")

与 GridSearch 结合 #

python
from sklearn.model_selection import GridSearchCV

# Tune the number of selected features and the classifier's C jointly;
# the 'step__param' syntax addresses parameters of steps inside `pipe`.
param_grid = {
    'selector__k': [1, 2, 3, 4],
    'classifier__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(pipe, param_grid, cv=5)
grid_search.fit(X, y)

print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳分数: {grid_search.best_score_:.4f}")

方法对比 #

性能对比 #

python
from sklearn.feature_selection import (
    SelectKBest, f_classif, RFE, 
    SelectFromModel, SequentialFeatureSelector
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Each method is configured to keep (roughly) 2 of the 4 iris features.
methods = {
    'SelectKBest': SelectKBest(f_classif, k=2),
    'RFE': RFE(LogisticRegression(max_iter=1000), n_features_to_select=2),
    'SelectFromModel': SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), 
                                        threshold='median'),
    'SFS': SequentialFeatureSelector(LogisticRegression(max_iter=1000), 
                                      n_features_to_select=2, cv=5)
}

# Compare methods by the 5-fold CV accuracy of a logistic regression
# trained on each method's selected features.
for name, selector in methods.items():
    X_selected = selector.fit_transform(X, y)
    scores = cross_val_score(LogisticRegression(max_iter=1000), X_selected, y, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")

特征选择评估 #

学习曲线 #

python
from sklearn.model_selection import learning_curve

# Train/validation score as a function of training-set size, computed on
# the reduced matrix (X_selected comes from the previous snippet).
train_sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(max_iter=1000),
    X_selected, y,
    cv=5
)

# Average the per-fold scores (axis=1) before plotting.
plt.plot(train_sizes, train_scores.mean(axis=1), label='Train')
plt.plot(train_sizes, test_scores.mean(axis=1), label='Test')
plt.xlabel('Training Size')
plt.ylabel('Score')
plt.legend()

特征数量 vs 性能 #

python
# Sweep k from 1 up to all features and record the CV accuracy for each
# k, to see where adding more features stops helping.
scores = []
n_features_range = range(1, X.shape[1] + 1)

for n in n_features_range:
    selector = SelectKBest(f_classif, k=n)
    X_selected = selector.fit_transform(X, y)
    score = cross_val_score(LogisticRegression(max_iter=1000), X_selected, y, cv=5).mean()
    scores.append(score)

plt.plot(n_features_range, scores, 'o-')
plt.xlabel('Number of Features')
plt.ylabel('CV Score')
plt.title('Feature Selection Performance')

最佳实践 #

1. 选择合适的方法 #

场景 推荐方法
初步筛选 方差阈值
小数据集 RFE
大数据集 SelectFromModel
高维数据 L1 正则化

2. 避免数据泄露 #

python
# Fit the selector on the training split ONLY, then apply the same
# fitted transform to both splits — fitting on the full dataset would
# leak test-set information into the feature selection.
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

3. 结合业务知识 #

python
# Intersect the model-selected features with a list of features known to
# matter from domain expertise.
domain_features = ['age', 'income', 'education']
# NOTE(review): `feature_names[selector.get_support()]` boolean-indexes
# feature_names, so it must be a numpy array — a plain Python list would
# raise a TypeError here. Confirm at the call site.
selected_features = [f for f in domain_features if f in feature_names[selector.get_support()]]

4. 迭代优化 #

python
# Sweep the importance threshold and compare the resulting CV scores to
# pick the best cut-off. Uses `rf`, `X`, `y` from the snippets above.
for threshold in [0.01, 0.05, 0.1]:
    sfm = SelectFromModel(rf, threshold=threshold)
    X_selected = sfm.fit_transform(X, y)
    # Consistency fix: every other example in this page uses
    # LogisticRegression(max_iter=1000); the bare default (max_iter=100)
    # can hit the iteration limit and emit convergence warnings.
    score = cross_val_score(LogisticRegression(max_iter=1000), X_selected, y, cv=5).mean()
    print(f"Threshold {threshold}: {score:.4f}")

下一步 #

掌握特征选择后,继续学习 特征提取 了解如何从原始数据提取特征!

最后更新:2026-04-04