特征提取 #

概述 #

特征提取是将原始数据转换为适合机器学习算法的特征向量的过程。

特征提取类型 #

类型 输入 输出
文本特征 文本字符串 数值向量
图像特征 图像像素 特征向量
字典特征 字典对象 稀疏矩阵

文本特征提取 #

词袋模型(CountVectorizer) #

python
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print(f"词汇表: {vectorizer.get_feature_names_out()}")
print(f"特征矩阵形状: {X.shape}")
print(f"特征矩阵:\n{X.toarray()}")

参数说明 #

参数 描述 默认值
max_features 最大特征数 None
min_df 最小文档频率 1
max_df 最大文档频率 1.0
ngram_range n-gram 范围 (1, 1)
stop_words 停用词 None
python
vectorizer = CountVectorizer(
    max_features=1000,
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 2),
    stop_words='english'
)

TF-IDF #

python
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)

print(f"TF-IDF 矩阵:\n{X.toarray()}")
print(f"IDF 值: {tfidf.idf_}")

TfidfTransformer #

python
from sklearn.feature_extraction.text import TfidfTransformer

count_vec = CountVectorizer()
X_counts = count_vec.fit_transform(corpus)

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

中文文本处理 #

python
import jieba

def chinese_tokenizer(text):
    """Segment a Chinese string into a list of tokens using jieba."""
    return [token for token in jieba.cut(text)]

corpus_cn = [
    '我喜欢学习机器学习',
    '机器学习很有趣',
    '深度学习是机器学习的分支'
]

vectorizer = CountVectorizer(tokenizer=chinese_tokenizer)
X = vectorizer.fit_transform(corpus_cn)

HashingVectorizer #

python
from sklearn.feature_extraction.text import HashingVectorizer

hash_vec = HashingVectorizer(
    n_features=2**10,
    alternate_sign=False
)
X = hash_vec.transform(corpus)

print(f"特征矩阵形状: {X.shape}")

优点与缺点 #

特点 CountVectorizer TfidfVectorizer HashingVectorizer
内存使用 高(需存词汇表) 高(需存词汇表) 低(无词汇表)
可解释性 好 好 差(哈希不可逆)
适合大数据 否 否 是(支持流式/增量处理)
逆变换 支持 支持 不支持

字典特征提取 #

DictVectorizer #

python
from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.}
]

vec = DictVectorizer(sparse=False)
X = vec.fit_transform(measurements)

print(f"特征名: {vec.get_feature_names_out()}")
print(f"特征矩阵:\n{X}")

类别特征编码 #

python
data = [
    {'gender': 'male', 'age': 25},
    {'gender': 'female', 'age': 30},
    {'gender': 'male', 'age': 35}
]

vec = DictVectorizer(sparse=False)
X = vec.fit_transform(data)

print(f"特征名: {vec.get_feature_names_out()}")

图像特征提取 #

像素特征 #

python
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

digits = load_digits()
X, y = digits.data, digits.target

print(f"图像形状: {digits.images[0].shape}")
print(f"特征向量长度: {X.shape[1]}")

plt.imshow(digits.images[0], cmap='gray')
plt.title(f'Label: {y[0]}')

PatchExtractor #

python
from sklearn.feature_extraction.image import PatchExtractor
import numpy as np

image = np.random.rand(100, 100)

extractor = PatchExtractor(patch_size=(10, 10), max_patches=10)
patches = extractor.transform([image])

print(f"提取的 patches 形状: {patches.shape}")

图像直方图特征 #

python
from skimage.feature import hog
from skimage import exposure

def extract_hog_features(image):
    """Return (HOG feature vector, HOG visualization image) for a 2-D image."""
    hog_params = {
        'orientations': 8,
        'pixels_per_cell': (4, 4),
        'cells_per_block': (1, 1),
        'visualize': True,
    }
    features, hog_image = hog(image, **hog_params)
    return features, hog_image

features, hog_image = extract_hog_features(digits.images[0])
print(f"HOG 特征长度: {len(features)}")

特征哈希 #

FeatureHasher #

python
from sklearn.feature_extraction import FeatureHasher

data = [
    {'dog': 1, 'cat': 2, 'elephant': 4},
    {'dog': 2, 'run': 5}
]

hasher = FeatureHasher(n_features=10)
X = hasher.transform(data)

print(f"哈希特征矩阵:\n{X.toarray()}")

文本哈希 #

python
hasher = FeatureHasher(n_features=2**10, input_type='string')
X = hasher.transform([['hello', 'world'], ['hello', 'sklearn']])

特征提取 Pipeline #

文本分类 Pipeline #

python
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

texts = ['positive text', 'negative text', 'great news', 'bad news']
labels = [1, 0, 1, 0]

X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=42)

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

pipe.fit(X_train, y_train)
print(f"准确率: {pipe.score(X_test, y_test):.4f}")

结合特征选择 #

python
from sklearn.feature_selection import SelectKBest, chi2

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('select', SelectKBest(chi2, k=1000)),
    ('clf', LogisticRegression())
])

高级文本特征 #

N-gram 特征 #

python
vectorizer = CountVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

print(f"1-3 gram 特征数: {X.shape[1]}")

字符级特征 #

python
char_vec = CountVectorizer(analyzer='char', ngram_range=(2, 4))
X = char_vec.fit_transform(corpus)

print(f"字符级特征数: {X.shape[1]}")

词干提取 #

python
from nltk.stem import PorterStemmer
import nltk

stemmer = PorterStemmer()

def stem_tokenizer(text):
    """Split on whitespace and reduce every token to its Porter stem."""
    return list(map(stemmer.stem, text.split()))

vectorizer = CountVectorizer(tokenizer=stem_tokenizer)

词形还原 #

python
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemma_tokenizer(text):
    """Split on whitespace and lemmatize every token with WordNet."""
    return list(map(lemmatizer.lemmatize, text.split()))

实战示例 #

新闻分类 #

python
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB())
])

pipe.fit(newsgroups.data, newsgroups.target)

newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
y_pred = pipe.predict(newsgroups_test.data)

print(classification_report(newsgroups_test.target, y_pred, 
                            target_names=newsgroups_test.target_names))

情感分析 #

python
from sklearn.linear_model import LogisticRegression

reviews = [
    'This movie is great!',
    'Terrible experience',
    'I love this product',
    'Waste of money'
]
sentiments = [1, 0, 1, 0]

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', LogisticRegression())
])

pipe.fit(reviews, sentiments)

最佳实践 #

1. 选择合适的向量化方法 #

场景 推荐方法
小数据集 TfidfVectorizer
大数据集 HashingVectorizer
需要解释性 CountVectorizer

2. 参数调优 #

python
from sklearn.model_selection import GridSearchCV

param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [1, 2, 5]
}

grid_search = GridSearchCV(pipe, param_grid, cv=5)

3. 内存优化 #

python
import numpy as np

# Use a 32-bit float dtype to halve memory versus the default float64.
vectorizer = TfidfVectorizer(
    max_features=5000,
    dtype=np.float32
)

4. 并行处理 #

python
# NOTE: CountVectorizer does NOT accept an n_jobs parameter -- scikit-learn
# text vectorizers run single-threaded. To parallelize, pass n_jobs=-1 to a
# component that supports it, e.g. GridSearchCV or the downstream estimator:
#   GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
vectorizer = CountVectorizer(max_features=10000)

下一步 #

掌握特征提取后,继续学习 特征转换 了解如何进一步处理特征!

最后更新:2026-04-04