特征提取 #
概述 #
特征提取是将原始数据转换为适合机器学习算法的特征向量的过程。
特征提取类型 #
| 类型 | 输入 | 输出 |
|---|---|---|
| 文本特征 | 文本字符串 | 数值向量 |
| 图像特征 | 图像像素 | 特征向量 |
| 字典特征 | 字典对象 | 稀疏矩阵 |
文本特征提取 #
词袋模型(CountVectorizer) #
python
# Bag-of-words demo: each document becomes a vector of raw token counts
# over the vocabulary learned from the corpus.
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?'
]
vectorizer = CountVectorizer()
# fit_transform learns the vocabulary and returns a sparse document-term matrix.
X = vectorizer.fit_transform(corpus)
print(f"词汇表: {vectorizer.get_feature_names_out()}")
print(f"特征矩阵形状: {X.shape}")
print(f"特征矩阵:\n{X.toarray()}")
参数说明 #
| 参数 | 描述 | 默认值 |
|---|---|---|
| `max_features` | 最大特征数 | None |
| `min_df` | 最小文档频率 | 1 |
| `max_df` | 最大文档频率 | 1.0 |
| `ngram_range` | n-gram 范围 | (1, 1) |
| `stop_words` | 停用词 | None |
python
vectorizer = CountVectorizer(
max_features=1000,
min_df=2,
max_df=0.95,
ngram_range=(1, 2),
stop_words='english'
)
TF-IDF #
python
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)
print(f"TF-IDF 矩阵:\n{X.toarray()}")
print(f"IDF 值: {tfidf.idf_}")
TfidfTransformer #
python
from sklearn.feature_extraction.text import TfidfTransformer
count_vec = CountVectorizer()
X_counts = count_vec.fit_transform(corpus)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)
中文文本处理 #
python
import jieba
def chinese_tokenizer(text):
    """Segment Chinese text into a list of tokens via jieba word segmentation."""
    return [token for token in jieba.cut(text)]
corpus_cn = [
'我喜欢学习机器学习',
'机器学习很有趣',
'深度学习是机器学习的分支'
]
# token_pattern=None silences the sklearn warning that the default regex
# token_pattern is ignored whenever a custom tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=chinese_tokenizer, token_pattern=None)
X = vectorizer.fit_transform(corpus_cn)
HashingVectorizer #
python
from sklearn.feature_extraction.text import HashingVectorizer
hash_vec = HashingVectorizer(
n_features=2**10,
alternate_sign=False
)
X = hash_vec.transform(corpus)
print(f"特征矩阵形状: {X.shape}")
优点与缺点 #
| 特点 | CountVectorizer | TfidfVectorizer | HashingVectorizer |
|---|---|---|---|
| 内存使用 | 高 | 高 | 低 |
| 可解释性 | 高 | 高 | 低 |
| 适合大数据 | 否 | 否 | 是 |
| 逆变换 | 支持 | 支持 | 不支持 |
字典特征提取 #
DictVectorizer #
python
from sklearn.feature_extraction import DictVectorizer
measurements = [
{'city': 'Dubai', 'temperature': 33.},
{'city': 'London', 'temperature': 12.},
{'city': 'San Francisco', 'temperature': 18.}
]
vec = DictVectorizer(sparse=False)
X = vec.fit_transform(measurements)
print(f"特征名: {vec.get_feature_names_out()}")
print(f"特征矩阵:\n{X}")
类别特征编码 #
python
data = [
{'gender': 'male', 'age': 25},
{'gender': 'female', 'age': 30},
{'gender': 'male', 'age': 35}
]
vec = DictVectorizer(sparse=False)
X = vec.fit_transform(data)
print(f"特征名: {vec.get_feature_names_out()}")
图像特征提取 #
像素特征 #
python
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
digits = load_digits()
X, y = digits.data, digits.target
print(f"图像形状: {digits.images[0].shape}")
print(f"特征向量长度: {X.shape[1]}")
plt.imshow(digits.images[0], cmap='gray')
plt.title(f'Label: {y[0]}')
PatchExtractor #
python
from sklearn.feature_extraction.image import PatchExtractor
import numpy as np
image = np.random.rand(100, 100)
extractor = PatchExtractor(patch_size=(10, 10), max_patches=10)
patches = extractor.transform([image])
print(f"提取的 patches 形状: {patches.shape}")
图像直方图特征 #
python
from skimage.feature import hog
from skimage import exposure
def extract_hog_features(image):
    """Compute HOG descriptors for a grayscale image.

    Returns a tuple ``(features, hog_image)``: the flattened descriptor
    vector and a rendered visualization of the gradient histograms.
    """
    features, hog_image = hog(
        image,
        orientations=8,           # gradient-orientation bins per cell
        pixels_per_cell=(4, 4),   # cell size in pixels
        cells_per_block=(1, 1),   # cells per normalization block
        visualize=True,           # also return the HOG visualization image
    )
    return features, hog_image
features, hog_image = extract_hog_features(digits.images[0])
print(f"HOG 特征长度: {len(features)}")
特征哈希 #
FeatureHasher #
python
from sklearn.feature_extraction import FeatureHasher
data = [
{'dog': 1, 'cat': 2, 'elephant': 4},
{'dog': 2, 'run': 5}
]
hasher = FeatureHasher(n_features=10)
X = hasher.transform(data)
print(f"哈希特征矩阵:\n{X.toarray()}")
文本哈希 #
python
hasher = FeatureHasher(n_features=2**10, input_type='string')
X = hasher.transform([['hello', 'world'], ['hello', 'sklearn']])
特征提取 Pipeline #
文本分类 Pipeline #
python
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
texts = ['positive text', 'negative text', 'great news', 'bad news']
labels = [1, 0, 1, 0]
X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=42)
pipe = Pipeline([
('tfidf', TfidfVectorizer()),
('clf', LogisticRegression())
])
pipe.fit(X_train, y_train)
print(f"准确率: {pipe.score(X_test, y_test):.4f}")
结合特征选择 #
python
from sklearn.feature_selection import SelectKBest, chi2
pipe = Pipeline([
('tfidf', TfidfVectorizer(max_features=10000)),
('select', SelectKBest(chi2, k=1000)),
('clf', LogisticRegression())
])
高级文本特征 #
N-gram 特征 #
python
vectorizer = CountVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)
print(f"1-3 gram 特征数: {X.shape[1]}")
字符级特征 #
python
char_vec = CountVectorizer(analyzer='char', ngram_range=(2, 4))
X = char_vec.fit_transform(corpus)
print(f"字符级特征数: {X.shape[1]}")
词干提取 #
python
from nltk.stem import PorterStemmer
import nltk
stemmer = PorterStemmer()
def stem_tokenizer(text):
    """Whitespace-split the text and reduce every word to its Porter stem."""
    return list(map(stemmer.stem, text.split()))
# token_pattern=None avoids the warning about the unused default token_pattern
# when a custom tokenizer is provided.
vectorizer = CountVectorizer(tokenizer=stem_tokenizer, token_pattern=None)
词形还原 #
python
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemma_tokenizer(text):
    """Whitespace-split the text and lemmatize every word with WordNet."""
    return list(map(lemmatizer.lemmatize, text.split()))
实战示例 #
新闻分类 #
python
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
pipe = Pipeline([
('tfidf', TfidfVectorizer(stop_words='english')),
('clf', MultinomialNB())
])
pipe.fit(newsgroups.data, newsgroups.target)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
y_pred = pipe.predict(newsgroups_test.data)
print(classification_report(newsgroups_test.target, y_pred,
target_names=newsgroups_test.target_names))
情感分析 #
python
from sklearn.linear_model import LogisticRegression
reviews = [
'This movie is great!',
'Terrible experience',
'I love this product',
'Waste of money'
]
sentiments = [1, 0, 1, 0]
pipe = Pipeline([
('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
('clf', LogisticRegression())
])
pipe.fit(reviews, sentiments)
最佳实践 #
1. 选择合适的向量化方法 #
| 场景 | 推荐方法 |
|---|---|
| 小数据集 | TfidfVectorizer |
| 大数据集 | HashingVectorizer |
| 需要解释性 | CountVectorizer |
2. 参数调优 #
python
from sklearn.model_selection import GridSearchCV
param_grid = {
'tfidf__max_features': [1000, 5000, 10000],
'tfidf__ngram_range': [(1, 1), (1, 2)],
'tfidf__min_df': [1, 2, 5]
}
grid_search = GridSearchCV(pipe, param_grid, cv=5)
3. 内存优化 #
python
import numpy as np  # needed for the float32 dtype below

# float32 halves the memory footprint of the default float64 matrix entries.
vectorizer = TfidfVectorizer(
    max_features=5000,
    dtype=np.float32
)
4. 并行处理 #
# NOTE: CountVectorizer does NOT accept an n_jobs parameter -- passing one
# raises a TypeError, because scikit-learn text vectorizers run
# single-threaded.  To parallelize vectorization, split the corpus into
# chunks and process them with joblib.Parallel, or use HashingVectorizer,
# which is stateless and therefore safe to apply to chunks in parallel.
vectorizer = CountVectorizer(
    max_features=10000
)
下一步 #
掌握特征提取后,继续学习 特征转换 了解如何进一步处理特征!
最后更新:2026-04-04