Optimizers #
Optimizer Overview #
The optimizer determines how a model updates its parameters from the gradients of the loss function. Choosing a suitable optimizer is critical to successful training.
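As a minimal sketch of what this means in practice, the example below (not part of the original text, using a hypothetical single variable w) shows the core loop: tf.GradientTape computes the gradient of the loss, and the optimizer's apply_gradients performs the parameter update.
python
import tensorflow as tf

# One trainable parameter and a toy loss: (w - 3)^2
w = tf.Variable(0.0)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)

for step in range(5):
    with tf.GradientTape() as tape:
        loss = (w - 3.0) ** 2
    grads = tape.gradient(loss, [w])             # dL/dw
    optimizer.apply_gradients(zip(grads, [w]))   # w <- w - lr * dL/dw
    print(step, float(w), float(loss))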
Optimizer Categories #
text
Optimizer Categories

First-order optimizers
├── SGD (stochastic gradient descent)
├── Momentum
└── Nesterov

Adaptive learning rate optimizers
├── AdaGrad
├── RMSprop
├── Adam
├── AdamW
└── Nadam

Second-order optimizers
└── (rarely used; computationally expensive)
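To illustrate how these families differ, here is a rough sketch of the textbook update rules for plain SGD, SGD with momentum, and Adam on a toy one-parameter problem. These are the standard formulas, not the exact Keras implementation, and the hyperparameter values are chosen only for illustration.
python
import numpy as np

# Toy problem: minimize f(w) = w^2, so the gradient is 2w.
def grad(w):
    return 2.0 * w

lr = 0.1

# Plain SGD: w <- w - lr * g
w = 1.0
for _ in range(3):
    w -= lr * grad(w)

# SGD with momentum: v <- m*v - lr*g ; w <- w + v
w, v, m = 1.0, 0.0, 0.9
for _ in range(3):
    v = m * v - lr * grad(w)
    w += v

# Adam (simplified): step scaled by running moment estimates
w, m1, m2 = 1.0, 0.0, 0.0
beta1, beta2, eps = 0.9, 0.999, 1e-7
for t in range(1, 4):
    g = grad(w)
    m1 = beta1 * m1 + (1 - beta1) * g        # first moment (mean of gradients)
    m2 = beta2 * m2 + (1 - beta2) * g * g    # second moment (mean of squared gradients)
    m1_hat = m1 / (1 - beta1 ** t)           # bias correction
    m2_hat = m2 / (1 - beta2 ** t)
    w -= lr * m1_hat / (np.sqrt(m2_hat) + eps)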
SGD Optimizer #
Basic SGD #
python
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(10)
])

optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='mse')

# Or use the shorthand string
model.compile(optimizer='sgd', loss='mse')
SGD with Momentum #
python
import tensorflow as tf
# SGD with momentum
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.01,
    momentum=0.9
)

# Nesterov accelerated gradient
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.01,
    momentum=0.9,
    nesterov=True
)

model.compile(optimizer=optimizer, loss='mse')
Adam Optimizer #
Basic Adam #
python
import tensorflow as tf
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,      # decay rate for the first-moment (mean) estimate
    beta_2=0.999,    # decay rate for the second-moment (variance) estimate
    epsilon=1e-07    # small constant for numerical stability
)
model.compile(optimizer=optimizer, loss='mse')

# Or use the shorthand string
model.compile(optimizer='adam', loss='mse')
AdamW (with Weight Decay) #
python
import tensorflow as tf
optimizer = tf.keras.optimizers.AdamW(
    learning_rate=0.001,
    weight_decay=0.01,   # decoupled weight decay applied directly to the weights
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07
)
model.compile(optimizer=optimizer, loss='mse')
Other Optimizers #
RMSprop #
python
import tensorflow as tf
optimizer = tf.keras.optimizers.RMSprop(
    learning_rate=0.001,
    rho=0.9,         # discounting factor for the moving average of squared gradients
    momentum=0.0,
    epsilon=1e-07
)
model.compile(optimizer=optimizer, loss='mse')
AdaGrad #
python
import tensorflow as tf
optimizer = tf.keras.optimizers.Adagrad(
    learning_rate=0.01,
    initial_accumulator_value=0.1,   # starting value of the per-parameter accumulators
    epsilon=1e-07
)
model.compile(optimizer=optimizer, loss='mse')
Nadam #
python
import tensorflow as tf
optimizer = tf.keras.optimizers.Nadam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07
)
model.compile(optimizer=optimizer, loss='mse')
FTRL #
python
import tensorflow as tf
optimizer = tf.keras.optimizers.Ftrl(
    learning_rate=0.01,
    learning_rate_power=-0.5,        # controls how the learning rate decreases during training
    initial_accumulator_value=0.1,
    l1_regularization_strength=0.0,
    l2_regularization_strength=0.0
)
model.compile(optimizer=optimizer, loss='mse')
Learning Rate Scheduling #
LearningRateScheduler #
python
import tensorflow as tf
initial_lr = 0.01  # base rate; set this to the optimizer's starting learning rate

def lr_schedule(epoch, lr):
    # Piecewise schedule relative to the initial rate. The `lr` argument is the
    # rate carried over from the previous epoch; returning lr * 0.1 here would
    # compound the decay every epoch, so the value is computed from `initial_lr`.
    if epoch < 10:
        return initial_lr
    elif epoch < 20:
        return initial_lr * 0.1
    else:
        return initial_lr * 0.01

callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
model.fit(x_train, y_train, epochs=30, callbacks=[callback])
ExponentialDecay #
python
import tensorflow as tf
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.1,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(optimizer=optimizer, loss='mse')
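Keras learning rate schedules are callables that map a training step to a rate; ExponentialDecay follows initial_learning_rate * decay_rate ** (step / decay_steps), with the exponent floored to an integer when staircase=True. A small sketch (not from the original text) for sanity-checking the decay curve before training:
python
import tensorflow as tf

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.1,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True
)

# Schedules can be called directly with a step index,
# which is handy for plotting or inspecting the decay curve.
for step in [0, 500, 1000, 2000, 5000]:
    print(step, float(lr_schedule(step)))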
PiecewiseConstantDecay #
python
import tensorflow as tf
# `values` has one more entry than `boundaries`: one rate per interval
boundaries = [1000, 2000, 3000]
values = [0.01, 0.005, 0.001, 0.0005]

lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=boundaries,
    values=values
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)
PolynomialDecay #
python
import tensorflow as tf
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=0.1,
    decay_steps=10000,
    end_learning_rate=0.0001,
    power=1.0,    # power=1.0 gives a linear decay
    cycle=False
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
CosineDecay #
python
import tensorflow as tf
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.1,
    decay_steps=10000,
    alpha=0.0
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)

# Cosine annealing with warm restarts
lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
    initial_learning_rate=0.1,
    first_decay_steps=1000,
    t_mul=2.0,    # each restart period is 2x longer than the previous one
    m_mul=0.9,    # each restart begins at 0.9x the previous peak rate
    alpha=0.0
)
InverseTimeDecay #
python
import tensorflow as tf
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.1,
    decay_steps=1000,
    decay_rate=0.5,
    staircase=False
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
ReduceLROnPlateau #
python
import tensorflow as tf
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=1
)

model.fit(
    x_train, y_train,
    epochs=100,
    validation_data=(x_val, y_val),
    callbacks=[reduce_lr]
)
Gradient Clipping #
Clipping by Value #
python
import tensorflow as tf
# Clip every gradient element to the range [-0.5, 0.5]
optimizer = tf.keras.optimizers.Adam(clipvalue=0.5)
model.compile(optimizer=optimizer, loss='mse')
Clipping by Norm #
python
import tensorflow as tf
# Clip each gradient tensor so its L2 norm is at most 1.0
optimizer = tf.keras.optimizers.Adam(clipnorm=1.0)
model.compile(optimizer=optimizer, loss='mse')

# Clip by the global norm computed across all gradients together
optimizer = tf.keras.optimizers.Adam(global_clipnorm=1.0)
Gradient Clipping in a Custom Training Loop #
python
import tensorflow as tf
optimizer = tf.keras.optimizers.Adam()

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(10)
])

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = tf.reduce_mean(tf.keras.losses.mse(y, predictions))
    gradients = tape.gradient(loss, model.trainable_variables)
    # Clip each gradient tensor to a maximum L2 norm of 1.0
    gradients = [tf.clip_by_norm(g, 1.0) for g in gradients]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss
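If you would rather clip by the global norm across all gradients (the counterpart of the global_clipnorm option shown earlier), tf.clip_by_global_norm can replace the per-tensor clipping. A sketch of that variant, reusing the same model and optimizer as above:
python
@tf.function
def train_step_global_clip(x, y):
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = tf.reduce_mean(tf.keras.losses.mse(y, predictions))
    gradients = tape.gradient(loss, model.trainable_variables)
    # Rescale all gradients together so their combined L2 norm is at most 1.0;
    # global_norm is the norm before clipping, useful for logging.
    gradients, global_norm = tf.clip_by_global_norm(gradients, 1.0)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss, global_norm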
Comparing Optimizers #
python
import tensorflow as tf
import numpy as np
def compare_optimizers():
    # Random data, used only to compare convergence behaviour
    x_train = np.random.random((1000, 784)).astype(np.float32)
    y_train = np.random.random((1000, 10)).astype(np.float32)

    optimizers = {
        'SGD': tf.keras.optimizers.SGD(learning_rate=0.01),
        'Momentum': tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
        'Adam': tf.keras.optimizers.Adam(learning_rate=0.001),
        'RMSprop': tf.keras.optimizers.RMSprop(learning_rate=0.001)
    }

    results = {}
    for name, optimizer in optimizers.items():
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
            tf.keras.layers.Dense(10)
        ])
        model.compile(optimizer=optimizer, loss='mse')
        history = model.fit(x_train, y_train, epochs=5, verbose=0)
        results[name] = history.history['loss']
        print(f"{name}: Final loss = {history.history['loss'][-1]:.4f}")
    return results

compare_optimizers()
Optimizer Selection Guidelines #
text
Optimizer Selection Guidelines

Adam / AdamW
├── General-purpose choice, works well in most scenarios
├── Adaptive learning rates, little tuning required
└── Recommended as the default choice

SGD + Momentum
├── Commonly used for computer vision tasks
├── Works best combined with a learning rate schedule
└── Can achieve better generalization

RMSprop
├── Commonly used for RNNs/LSTMs
└── Suited to non-stationary objectives

AdaGrad
├── Good for sparse data
└── Learning rate decays quickly
Next Steps #
Now that you have a handle on optimizers, continue to Evaluation Metrics to learn how to assess model performance!
Last updated: 2026-04-04