文本数据处理 #
文本处理流程 #
text
┌─────────────────────────────────────────────────────────────┐
│ 文本处理流程 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 原始文本 │
│ │ │
│ ▼ │
│ 文本清洗 (去除标点、特殊字符) │
│ │ │
│ ▼ │
│ 分词 (Tokenization) │
│ │ │
│ ▼ │
│ 构建词汇表 (Vocabulary) │
│ │ │
│ ▼ │
│ 序列转换 (Text to Sequence) │
│ │ │
│ ▼ │
│ 序列填充 (Padding) │
│ │ │
│ ▼ │
│ 词嵌入 (Embedding) │
│ │
└─────────────────────────────────────────────────────────────┘
TextVectorization 层 #
基本用法 #
python
import keras

# Toy corpus: one raw string per example.
corpus = [
    'Hello world',
    'Keras is great for deep learning',
    'Text processing is important'
]

# Vectorization layer: cap the vocabulary at 100 tokens, emit integer
# token ids, and pad/truncate every sample to exactly 10 positions.
text_to_ints = keras.layers.TextVectorization(
    max_tokens=100,
    output_mode='int',
    output_sequence_length=10,
)

# Learn the vocabulary from the corpus, then transform the same texts.
text_to_ints.adapt(corpus)
print(text_to_ints(corpus))
参数详解 #
python
# TextVectorization constructor signature with its default values.
keras.layers.TextVectorization(
max_tokens=None,  # maximum vocabulary size (None = unbounded)
standardize='lower_and_strip_punctuation',  # text normalization applied before splitting
split='whitespace',  # how the standardized text is split into tokens
ngrams=None,  # optionally emit n-grams (e.g. 2 -> bigrams) instead of single tokens
output_mode='int',  # 'int', 'multi_hot' (legacy name 'binary'), 'count', or 'tf_idf'
output_sequence_length=None,  # pad/truncate 'int' output to this fixed length
vocabulary=None,  # optionally supply a precomputed vocabulary instead of calling adapt()
idf_weights=None,  # precomputed IDF weights; only meaningful with output_mode='tf_idf'
sparse=False,  # return sparse instead of dense tensors (backend-dependent)
ragged=False  # return ragged (variable-length) tensors instead of padded ones
)
text
┌─────────────────────────────────────────────────────────────┐
│ TextVectorization 参数 │
├─────────────────────────────────────────────────────────────┤
│ │
│ max_tokens: 词汇表最大大小 │
│ │
│ standardize: 标准化方式 │
│ ├── 'lower_and_strip_punctuation': 小写+去标点 │
│ ├── 'lower': 仅小写 │
│ ├── 'strip_punctuation': 仅去标点 │
│ └── None: 不处理 │
│ │
│ split: 分词方式 │
│ ├── 'whitespace': 按空格分词 │
│ ├── 'character': 按字符分词 │
│ └── 自定义函数 │
│ │
│ ngrams: N-gram 设置 │
│ 例: ngrams=2 表示生成 bigram │
│ │
│ output_mode: 输出模式 │
│ ├── 'int': 整数序列 │
│ ├── 'multi_hot' (旧名 'binary'): 多热编码 │
│ ├── 'count': 词频统计 │
│ └── 'tf_idf': TF-IDF │
│ │
│ output_sequence_length: 输出序列长度 │
│ │
└─────────────────────────────────────────────────────────────┘
不同输出模式 #
python
import keras

# Two tiny example documents.
texts = ['hello world', 'hello keras']

# 'int': each text becomes a sequence of integer token ids,
# padded/truncated to output_sequence_length.
vectorizer_int = keras.layers.TextVectorization(output_mode='int', output_sequence_length=5)
vectorizer_int.adapt(texts)
print("int:", vectorizer_int(texts))

# 'multi_hot': one 0/1 indicator per vocabulary token.
# FIX: Keras 3 renamed the old tf.keras mode 'binary' to 'multi_hot';
# passing 'binary' is no longer accepted by keras.layers.TextVectorization.
vectorizer_multi_hot = keras.layers.TextVectorization(output_mode='multi_hot')
vectorizer_multi_hot.adapt(texts)
print("multi_hot:", vectorizer_multi_hot(texts))

# 'count': per-token occurrence counts instead of 0/1 indicators.
vectorizer_count = keras.layers.TextVectorization(output_mode='count')
vectorizer_count.adapt(texts)
print("count:", vectorizer_count(texts))

# 'tf_idf': counts weighted by inverse document frequency.
vectorizer_tfidf = keras.layers.TextVectorization(output_mode='tf_idf')
vectorizer_tfidf.adapt(texts)
print("tf_idf:", vectorizer_tfidf(texts))
Tokenizer #
基本用法 #
python
# Legacy tokenizer workflow: fit a vocabulary, then map texts to index
# sequences and to a one-hot document matrix.
# NOTE(review): `keras.preprocessing.text.Tokenizer` is the tf.keras 2.x
# API and was removed in standalone Keras 3 — confirm which Keras version
# this tutorial targets.
import keras
texts = ['Hello world', 'Keras is great', 'Deep learning is fun']  # toy corpus
tokenizer = keras.preprocessing.text.Tokenizer(num_words=100)  # keep the 100 most frequent words
tokenizer.fit_on_texts(texts)  # build the word -> integer-index mapping
print("词汇表:", tokenizer.word_index)
sequences = tokenizer.texts_to_sequences(texts)  # each text -> list of word indices
print("序列:", sequences)
one_hot = tokenizer.texts_to_matrix(texts, mode='binary')  # (num_texts, num_words) 0/1 matrix
print("One-hot:", one_hot.shape)
参数详解 #
python
# Tokenizer constructor signature (legacy tf.keras preprocessing API).
keras.preprocessing.text.Tokenizer(
num_words=None,  # keep only the most frequent words (None = keep all)
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',  # characters stripped from the texts
lower=True,  # lowercase the texts before tokenizing
split=' ',  # token separator string
char_level=False,  # if True, every character becomes a token
oov_token=None  # placeholder token used for out-of-vocabulary words
)
序列填充 #
pad_sequences #
python
import keras

# Ragged integer sequences of differing lengths.
raw_seqs = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]

# Pad/truncate at the FRONT of each sequence (the default behaviour),
# filling with 0 up to a fixed length of 5.
front_padded = keras.preprocessing.sequence.pad_sequences(
    raw_seqs,
    maxlen=5,
    padding='pre',
    truncating='pre',
    value=0,
)
print(front_padded)

# Same fixed length, but pad/truncate at the BACK instead.
back_padded = keras.preprocessing.sequence.pad_sequences(
    raw_seqs,
    maxlen=5,
    padding='post',
    truncating='post',
)
print(back_padded)
Embedding 层 #
基本用法 #
python
import keras
model = keras.Sequential([
keras.layers.Embedding(
input_dim=10000,
output_dim=128,
input_length=100
),
keras.layers.LSTM(64),
keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
参数详解 #
python
# Embedding constructor signature (core arguments).
keras.layers.Embedding(
input_dim,  # vocabulary size (largest integer index + 1)
output_dim,  # dimensionality of the dense embedding vectors
embeddings_initializer='uniform',  # how the embedding matrix is initialized
embeddings_regularizer=None,  # optional regularizer applied to the embedding matrix
mask_zero=False,  # if True, index 0 is treated as padding and masked downstream
input_length=None  # NOTE(review): removed in Keras 3 — only valid in tf.keras 2.x; confirm target version
)
使用预训练词向量 #
python
import keras
import numpy as np

# Vocabulary and vector-size configuration.
embedding_dim = 100
vocab_size = 10000

# Placeholder weight matrix of shape (vocab_size, embedding_dim); in a
# real pipeline each row would hold the pretrained vector for that token id.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Initialize the layer from the pretrained matrix and freeze it so
# training does not overwrite the pretrained vectors.
embedding_layer = keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

# Classifier head on top of the frozen embeddings.
model = keras.Sequential([
    embedding_layer,
    keras.layers.LSTM(64),
    keras.layers.Dense(1, activation='sigmoid'),
])
完整文本分类示例 #
python
import keras

# --- Hyperparameters -------------------------------------------------
max_features = 10000  # vocabulary size kept from the IMDB dataset
maxlen = 200          # every review padded/truncated to this many tokens

# --- Data: IMDB reviews pre-encoded as integer word indices ----------
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=max_features)
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

# --- Model: stacked bidirectional LSTMs over learned embeddings ------
# FIX: Embedding's legacy `input_length` argument was removed in Keras 3;
# the sequence length is inferred from the padded inputs.
model = keras.Sequential([
    keras.layers.Embedding(max_features, 128),
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# --- Train, stopping early once validation loss stops improving ------
history = model.fit(
    x_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    ],
)

# --- Evaluate on the held-out test split -----------------------------
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'测试准确率: {test_acc:.4f}')
Transformer 文本处理 #
python
import keras

# Vocabulary cap and fixed input sequence length.
max_features = 10000
maxlen = 200

# Functional API: a single self-attention block over token embeddings.
token_ids = keras.Input(shape=(maxlen,), dtype='int32')
embedded = keras.layers.Embedding(max_features, 128)(token_ids)

# Self-attention: query, key and value all come from the same sequence.
attended = keras.layers.MultiHeadAttention(num_heads=8, key_dim=32)(embedded, embedded)

# Collapse the time dimension, then classify with a small dense head.
pooled = keras.layers.GlobalAveragePooling1D()(attended)
hidden = keras.layers.Dense(64, activation='relu')(pooled)
hidden = keras.layers.Dropout(0.5)(hidden)
probability = keras.layers.Dense(1, activation='sigmoid')(hidden)

model = keras.Model(token_ids, probability)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
下一步 #
现在你已经掌握了文本数据处理,接下来学习 迁移学习,利用预训练模型!
最后更新:2026-04-04