GPU 加速 #
GPU 概述 #
TensorFlow 支持 NVIDIA GPU 加速,可以显著提升模型训练和推理速度。
GPU 加速原理 #
text
┌─────────────────────────────────────────────────────────────┐
│ GPU 加速原理 │
├─────────────────────────────────────────────────────────────┤
│ │
│ CPU: 通用计算,串行处理 │
│ ├── 适合复杂逻辑 │
│ └── 核心数较少 │
│ │
│ GPU: 并行计算,大规模并行 │
│ ├── 适合矩阵运算 │
│ ├── 数千个核心 │
│ └── 承担深度学习的主要计算负载 │
│ │
│ 加速流程: │
│ 数据 ──► CPU 预处理 ──► GPU 计算 ──► CPU 后处理 │
│ │
└─────────────────────────────────────────────────────────────┘
GPU 检测与配置 #
检测 GPU #
python
import tensorflow as tf

# Enumerate the physical GPU devices visible to this TensorFlow build.
gpus = tf.config.list_physical_devices('GPU')
print(f"发现 {len(gpus)} 个 GPU")
for gpu in gpus:
    print(f" {gpu}")

# Dump detailed local device info (internal API, still useful for debugging).
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
内存管理 #
python
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Allocate GPU memory on demand instead of reserving it all up front.
        # NOTE: memory growth and a fixed memory limit are mutually exclusive
        # on the same device — configure one OR the other. The original code
        # applied both, which raises a RuntimeError at configuration time.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Alternative: cap GPU memory usage (disable the growth loop above first):
        # tf.config.set_logical_device_configuration(
        #     gpus[0],
        #     [tf.config.LogicalDeviceConfiguration(memory_limit=4096)]
        # )
    except RuntimeError as e:
        # Device configuration must happen before the GPUs are initialized.
        print(e)
指定设备 #
python
import tensorflow as tf

# Pin the computation to the CPU explicitly.
with tf.device('/CPU:0'):
    a = tf.constant([[1, 2], [3, 4]])
    b = tf.constant([[5, 6], [7, 8]])
    c = tf.matmul(a, b)
    print(f"设备: {c.device}")

# Pin the computation to the first GPU, if one is available.
if tf.config.list_physical_devices('GPU'):
    with tf.device('/GPU:0'):
        a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
        b = tf.constant([[5.0, 6.0], [7.0, 8.0]])
        c = tf.matmul(a, b)
        print(f"设备: {c.device}")
多 GPU 训练 #
MirroredStrategy #
python
import tensorflow as tf
import numpy as np

# Replicate the model across all local GPUs; gradients are all-reduced.
strategy = tf.distribute.MirroredStrategy()
print(f"设备数量: {strategy.num_replicas_in_sync}")

# Variables must be created (and the model compiled) inside the strategy scope.
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

# Synthetic training data.
x_train = np.random.random((1000, 784)).astype(np.float32)
y_train = np.random.randint(10, size=(1000,))

# Scale the global batch size with the number of replicas.
batch_size = 64 * strategy.num_replicas_in_sync
model.fit(x_train, y_train, epochs=10, batch_size=batch_size)
MultiWorkerMirroredStrategy #
python
import json
import os  # was missing in the original snippet — os.environ raised NameError

import tensorflow as tf

# Describe the cluster for this process. Every worker must set TF_CONFIG
# before creating the strategy; 'index' selects which cluster entry this
# process is.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ["localhost:12345", "localhost:12346"]
    },
    'task': {'type': 'worker', 'index': 0}
})

strategy = tf.distribute.MultiWorkerMirroredStrategy()

# Model creation and compilation happen inside the strategy scope.
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
        tf.keras.layers.Dense(10)
    ])
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    )
混合精度训练 #
启用混合精度 #
python
import tensorflow as tf
import numpy as np

# Enable mixed precision: compute in float16, keep variables in float32.
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)
print(f"计算策略: {policy.name}")
print(f"变量数据类型: {policy.variable_dtype}")

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
    # Keep the final softmax in float32 for numerical stability.
    tf.keras.layers.Dense(10, activation='softmax', dtype='float32')
])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Synthetic training data — the original snippet used x_train/y_train
# without defining them, which raised NameError when run standalone.
x_train = np.random.random((1000, 784)).astype(np.float32)
y_train = np.random.randint(10, size=(1000,))
model.fit(x_train, y_train, epochs=10)
Loss Scaling #
python
import tensorflow as tf

policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

# Wrap the optimizer in loss scaling so small float16 gradients do not
# underflow. `dynamic=True` enables automatic growth/shrink of the scale;
# the original `dynamic_growth=True` is not a valid LossScaleOptimizer
# argument and raised TypeError.
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(),
    dynamic=True,
    initial_scale=2**15
)

# The original snippet compiled an undefined `model`; define one here.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(10, activation='softmax', dtype='float32')
])
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
XLA 编译 #
启用 XLA #
python
import tensorflow as tf

# Turn on XLA JIT compilation globally.
tf.config.optimizer.set_jit(True)

# Or compile a single function with XLA.
@tf.function(jit_compile=True)
def train_step(x, y):
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = loss_fn(y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss
XLA 编译模型 #
python
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(10)
])

# jit_compile=True asks Keras to XLA-compile the train/eval/predict functions.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    jit_compile=True
)
性能优化 #
数据管道优化 #
python
import tensorflow as tf

# Build the input pipeline in the recommended order:
#   cache -> shuffle -> batch -> prefetch
# The original chained .cache() AFTER .prefetch(), which caches already
# shuffled/batched data (freezing the shuffle order after epoch 1) and
# stops prefetch from being the final, overlap-with-training stage.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(10000)
train_dataset = train_dataset.batch(64)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

model.fit(train_dataset, epochs=10)
性能分析 #
python
import tensorflow as tf

# Profile batches 10-20 with the TensorBoard callback.
log_dir = "./logs"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    profile_batch='10,20'
)

model.fit(
    train_dataset,
    epochs=10,
    callbacks=[tensorboard_callback]
)

# Launch TensorBoard afterwards:
# tensorboard --logdir=./logs
GPU 利用率监控 #
python
import subprocess


def check_gpu_utilization():
    """Print current GPU utilization and memory usage via nvidia-smi.

    Prints a short notice instead of crashing when nvidia-smi is not
    installed (e.g. on CPU-only machines) — the original snippet raised
    FileNotFoundError there. The unused tensorflow import was removed.
    """
    try:
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=utilization.gpu,memory.used,memory.total',
             '--format=csv'],
            capture_output=True,
            text=True
        )
    except FileNotFoundError:
        # No NVIDIA driver / nvidia-smi not on PATH.
        print("nvidia-smi not found")
        return
    print(result.stdout)


check_gpu_utilization()
常见问题 #
GPU 内存不足 #
python
import tensorflow as tf

# OOM mitigation 1: shrink the batch size.
batch_size = 16

# OOM mitigation 2: allocate GPU memory on demand.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# OOM mitigation 3: hard-cap GPU memory. NOTE: this is mutually exclusive
# with memory growth on the same device — pick one in real code. Guard the
# indexing: the original `gpus[0]` raised IndexError on CPU-only machines.
if gpus:
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=2048)]
    )
GPU 未被使用 #
python
import tensorflow as tf
import os

# Confirm this TensorFlow build was compiled with GPU (CUDA) support.
print(tf.sysconfig.get_build_info())

# Expose the intended device to the CUDA runtime.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Sanity-check that at least one GPU is visible.
print("GPU 可用:", len(tf.config.list_physical_devices('GPU')) > 0)
下一步 #
现在你已经掌握了 GPU 加速,接下来学习 分布式训练,了解如何进行大规模分布式训练!
最后更新:2026-04-04