文本分类实战 #
项目概述 #
text
┌─────────────────────────────────────────────────────────────┐
│ 文本分类流程 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 1. 数据准备 │
│ ├── 文本加载 │
│ ├── 文本清洗 │
│ └── 标签处理 │
│ │
│ 2. 文本预处理 │
│ ├── 分词 │
│ ├── 序列转换 │
│ └── 序列填充 │
│ │
│ 3. 模型构建 │
│ ├── Embedding 层 │
│ ├── LSTM/Transformer │
│ └── 分类层 │
│ │
│ 4. 模型训练与评估 │
│ │
└─────────────────────────────────────────────────────────────┘
IMDB 情感分析 #
数据准备 #
python
import keras
import numpy as np

# Vocabulary size: keep only the top-N most frequent words in the corpus.
max_features = 10000
# Every review is truncated/padded to exactly this many tokens.
maxlen = 200

# IMDB reviews arrive pre-tokenized as variable-length lists of word indices.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=max_features)

# NOTE: keras.utils.pad_sequences replaces keras.preprocessing.sequence.pad_sequences,
# which was removed in Keras 3 (the utils path also works on Keras 2.9+).
x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen)
x_test = keras.utils.pad_sequences(x_test, maxlen=maxlen)

print(f'训练集: {x_train.shape}')
print(f'测试集: {x_test.shape}')
LSTM 模型 #
python
import keras
# Stacked bidirectional-LSTM sentiment classifier.
# NOTE: the `input_length` argument of Embedding was removed in Keras 3 and
# now raises an error; the sequence length is inferred from the input data,
# so it is simply omitted here.
model = keras.Sequential([
    keras.layers.Embedding(max_features, 128),
    # return_sequences=True keeps per-timestep outputs for the next LSTM.
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.5),  # regularize before the output layer
    keras.layers.Dense(1, activation='sigmoid')  # binary sentiment probability
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model.summary()
模型训练 #
python
# Stop early when validation loss stalls, and keep a checkpoint of the
# best model (by validation accuracy) on disk.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
checkpoint = keras.callbacks.ModelCheckpoint(
    'best_imdb.keras',
    monitor='val_accuracy',
    save_best_only=True,
)
callbacks = [early_stopping, checkpoint]

# Hold out 20% of the training data for validation during training.
history = model.fit(
    x_train, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    callbacks=callbacks,
)
模型评估 #
python
from sklearn.metrics import classification_report, confusion_matrix

# Overall accuracy on the held-out test set.
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print(f'测试准确率: {test_acc:.4f}')

# Threshold the sigmoid outputs at 0.5 to obtain hard class labels.
probs = model.predict(x_test)
pred_labels = (probs > 0.5).astype(int).flatten()
print(classification_report(y_test, pred_labels, target_names=['Negative', 'Positive']))
Transformer 模型 #
python
import keras

class TransformerBlock(keras.layers.Layer):
    """A single Transformer encoder block.

    Multi-head self-attention followed by a position-wise feed-forward
    network; each sub-layer is wrapped with dropout, a residual connection,
    and layer normalization (post-norm arrangement).
    """

    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, rate: float = 0.1):
        """
        Args:
            embed_dim: Dimensionality of the token embeddings (also used as
                the per-head key dimension of the attention layer).
            num_heads: Number of attention heads.
            ff_dim: Hidden width of the feed-forward sub-layer.
            rate: Dropout rate applied after each sub-layer.
        """
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Two-layer position-wise MLP: transform at ff_dim width, then
        # project back to embed_dim so the residual addition lines up.
        self.ffn = keras.Sequential([
            keras.layers.Dense(ff_dim, activation='relu'),
            keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Apply self-attention then the feed-forward network, with residuals."""
        # Self-attention: query and key/value are both `inputs`.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm
# Transformer hyperparameters: embedding width, attention heads, FFN width.
embed_dim, num_heads, ff_dim = 128, 4, 64

# Functional-API model: embed -> transformer encoder -> pooled classifier.
inputs = keras.Input(shape=(maxlen,))
h = keras.layers.Embedding(max_features, embed_dim)(inputs)
h = TransformerBlock(embed_dim, num_heads, ff_dim)(h)
h = keras.layers.GlobalAveragePooling1D()(h)  # collapse the time axis
h = keras.layers.Dropout(0.1)(h)
h = keras.layers.Dense(64, activation='relu')(h)
h = keras.layers.Dropout(0.1)(h)
outputs = keras.layers.Dense(1, activation='sigmoid')(h)
model = keras.Model(inputs, outputs)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=32)
自定义文本分类 #
python
import keras
import numpy as np

# Tiny labeled corpus: 1 = positive sentiment, 0 = negative.
texts = [
    'This movie is great!',
    'Terrible film, waste of time',
    'Amazing acting and story',
    'Boring and predictable',
    'Highly recommended',
    'Not worth watching',
]
labels = [1, 0, 1, 0, 1, 0]

# Map raw strings to fixed-length integer token sequences.
vectorizer = keras.layers.TextVectorization(
    max_tokens=1000,
    output_mode='int',
    output_sequence_length=20,
)
vectorizer.adapt(texts)  # build the vocabulary from the corpus

# End-to-end model: raw text in, sentiment probability out — the
# vectorization step lives inside the model itself.
model = keras.Sequential([
    vectorizer,
    keras.layers.Embedding(1000, 64),
    keras.layers.LSTM(32),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(np.array(texts), np.array(labels), epochs=10)

# Inference takes raw strings directly, since vectorization is in-model.
new_texts = ['This is excellent!', 'Very bad movie']
predictions = model.predict(np.array(new_texts))
print(predictions)
下一步 #
现在你已经完成了文本分类实战,接下来学习 回归预测,处理回归问题!
最后更新:2026-04-04