Hands-On NLP Text Processing #
NLP Overview #
Natural language processing (NLP) is a major branch of artificial intelligence, covering tasks such as text classification, sentiment analysis, machine translation, and question answering.
NLP Task Categories #
text
┌─────────────────────────────────────────────────────────────┐
│                    NLP Task Categories                      │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Text Classification                                        │
│  ├── Sentiment analysis                                     │
│  ├── Topic classification                                   │
│  └── Spam detection                                         │
│                                                             │
│  Sequence Labeling                                          │
│  ├── Named entity recognition                               │
│  ├── Part-of-speech tagging                                 │
│  └── Word segmentation                                      │
│                                                             │
│  Sequence-to-Sequence                                       │
│  ├── Machine translation                                    │
│  ├── Text summarization                                     │
│  └── Question answering                                     │
│                                                             │
│  Text Generation                                            │
│  ├── Dialogue systems                                       │
│  └── Creative writing                                       │
│                                                             │
└─────────────────────────────────────────────────────────────┘
Text Preprocessing #
Tokenization and Encoding #
python
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = [
    "I love machine learning",
    "Deep learning is amazing",
    "Natural language processing is fun"
]

# Build the vocabulary; words beyond the top num_words map to the OOV token
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)

# Convert each text into a list of integer indices
sequences = tokenizer.texts_to_sequences(texts)

# Pad/truncate every sequence to the same length ('post' = at the end)
padded = pad_sequences(sequences, maxlen=10, padding='post', truncating='post')

print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Sequences: {sequences}")
print(f"Padded:\n{padded}")
The TextVectorization Layer #
python
import tensorflow as tf

# TextVectorization is the modern, in-graph replacement for the legacy Tokenizer
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=200
)

texts = tf.constant([
    "I love machine learning",
    "Deep learning is amazing",
    "Natural language processing is fun"
])

# adapt() builds the vocabulary from the data
vectorize_layer.adapt(texts)

print(f"Vocabulary: {vectorize_layer.get_vocabulary()[:10]}")
print(f"Vectorized:\n{vectorize_layer(texts)}")
Data Pipeline #
python
import tensorflow as tf
import numpy as np

# Toy corpus: 300 texts across 3 classes
texts = np.array(["positive text", "negative text", "neutral text"] * 100)
labels = np.array([1, 0, 2] * 100)

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=100
)
vectorize_layer.adapt(texts)

dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
dataset = dataset.shuffle(300)                               # shuffle before batching
dataset = dataset.batch(32)
dataset = dataset.map(lambda x, y: (vectorize_layer(x), y))  # vectorize one batch at a time
dataset = dataset.prefetch(tf.data.AUTOTUNE)                 # overlap preprocessing with training
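A quick way to confirm the pipeline produces what the model will expect is to pull one batch:
python
# Sanity check: one batch of (token ids, labels)
for batch_x, batch_y in dataset.take(1):
    print(batch_x.shape, batch_y.shape)  # (32, 100) (32,)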
Word Embeddings #
The Embedding Layer #
python
import tensorflow as tf

model = tf.keras.Sequential([
    # Declare the input shape up front; the input_length argument of
    # Embedding is deprecated (and removed in Keras 3)
    tf.keras.Input(shape=(100,)),
    tf.keras.layers.Embedding(
        input_dim=10000,   # vocabulary size
        output_dim=128     # embedding dimension
    ),
    tf.keras.layers.GlobalAveragePooling1D(),  # average over the sequence axis
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()
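This model's shapes line up with the pipeline from the previous section (sequence length 100, 3 classes), so, assuming that dataset object is still in scope, it can be trained directly:
python
# Assumes the tf.data pipeline built in the previous section is in scope
history = model.fit(dataset, epochs=3)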
Pretrained Word Vectors #
python
import tensorflow as tf
import numpy as np

embedding_dim = 100
vocab_size = 10000

# Placeholder matrix; in practice, fill it with pretrained vectors (e.g. GloVe)
embedding_matrix = np.random.random((vocab_size, embedding_dim))

embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    # Constant initializer replaces the legacy weights=[...] argument
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False  # freeze the pretrained vectors
)

model = tf.keras.Sequential([
    embedding_layer,
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(3, activation='softmax')
])
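To fill embedding_matrix with real vectors, the usual pattern is to parse a GloVe text file and copy each known word's vector into the row given by the word's index. A sketch, assuming a locally downloaded glove.6B.100d.txt and a word-to-id mapping such as the Tokenizer's word_index:
python
# Sketch: populate embedding_matrix from GloVe (assumes glove.6B.100d.txt
# has been downloaded locally and word_index maps words to integer ids)
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

word_index = tokenizer.word_index  # e.g. from the tokenization section above
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector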
The Transformer Architecture #
Multi-Head Attention #
python
import tensorflow as tf

class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Validate before deriving the per-head dimension
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embed_dim)
        self.key_dense = tf.keras.layers.Dense(embed_dim)
        self.value_dense = tf.keras.layers.Dense(embed_dim)
        self.combine_heads = tf.keras.layers.Dense(embed_dim)

    def attention(self, query, key, value):
        # Scaled dot-product attention: softmax(QK^T / sqrt(d_k)) V
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        return tf.matmul(weights, value)

    def separate_heads(self, x, batch_size):
        # (batch, seq, embed) -> (batch, heads, seq, head_dim)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        # Project inputs to queries, keys, and values, then split into heads
        query = self.separate_heads(self.query_dense(inputs), batch_size)
        key = self.separate_heads(self.key_dense(inputs), batch_size)
        value = self.separate_heads(self.value_dense(inputs), batch_size)
        attention = self.attention(query, key, value)
        # (batch, heads, seq, head_dim) -> (batch, seq, embed)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        return self.combine_heads(concat_attention)
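Keras also ships a built-in tf.keras.layers.MultiHeadAttention layer covering the same computation (plus masking support); the hand-rolled layer above is roughly equivalent to:
python
import tensorflow as tf

# Built-in equivalent; for self-attention, query and value are the same tensor
x = tf.random.normal((2, 10, 128))  # (batch, seq_len, embed_dim)
mha = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=16)  # 16 = 128 / 8
print(mha(x, x).shape)  # (2, 10, 128)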
Transformer Block #
python
import tensorflow as tf

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        # Position-wise feed-forward network
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dense(embed_dim)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        # Sub-layer 1: self-attention with residual connection + LayerNorm
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Sub-layer 2: feed-forward with residual connection + LayerNorm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
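A quick shape check confirms the block preserves its input shape:
python
# Output shape matches input shape: (batch, seq_len, embed_dim)
block = TransformerBlock(embed_dim=128, num_heads=8, ff_dim=128)
x = tf.random.normal((2, 10, 128))
print(block(x).shape)  # (2, 10, 128)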
Transformer Classifier #
python
import tensorflow as tf

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(vocab_size, embed_dim)
        # Learned positional embeddings, one per position up to maxlen
        self.pos_emb = tf.keras.layers.Embedding(maxlen, embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions  # positions broadcast across the batch

vocab_size = 10000
maxlen = 200
embed_dim = 128
num_heads = 8
ff_dim = 128

inputs = tf.keras.Input(shape=(maxlen,))
x = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)
x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)  # pool over the sequence axis
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dropout(0.1)(x)
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()
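Before wiring in real data, a smoke test on random token ids verifies that all the shapes fit together:
python
import numpy as np

# Smoke test with random token ids and labels (not real training data)
x_dummy = np.random.randint(0, vocab_size, size=(64, maxlen))
y_dummy = np.random.randint(0, 3, size=(64,))
model.fit(x_dummy, y_dummy, epochs=1, batch_size=16)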
Sentiment Analysis #
Complete Example #
python
import tensorflow as tf
import numpy as np

# IMDB movie reviews, pre-encoded as integer sequences (top 10,000 words)
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=10000)

maxlen = 200
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

model = tf.keras.Sequential([
    # Declare the input shape; Embedding's input_length argument is deprecated
    tf.keras.Input(shape=(maxlen,)),
    tf.keras.layers.Embedding(10000, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')  # binary: positive/negative
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )
]

history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=20,
    validation_split=0.2,
    callbacks=callbacks
)

test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")
Next Steps #
Now that you have finished this hands-on NLP text-processing chapter, continue with Model Deployment to learn how to ship your models to production!
Last updated: 2026-04-04