PyTorch Optimizers #
What Is an Optimizer? #
An optimizer is a core component of deep-learning training: it uses the gradients to update the model parameters so that the loss function is minimized.
text
┌─────────────────────────────────────────────────────────────┐
│                    What an Optimizer Does                    │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  Training loop:                                              │
│                                                              │
│    1. Forward pass    ──►  compute the loss                  │
│    2. Backward pass   ──►  compute the gradients             │
│    3. Optimizer step  ──►  update the parameters             │
│                                                              │
│  Parameter update rule:                                      │
│                                                              │
│    θ_new = θ_old - lr × ∇L(θ)                                │
│                                                              │
│  where:                                                      │
│    - θ: model parameters                                     │
│    - lr: learning rate                                       │
│    - ∇L(θ): gradient of the loss w.r.t. the parameters       │
│                                                              │
└─────────────────────────────────────────────────────────────┘
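Before looking at the API, the following minimal sketch (arbitrary model, data, and learning rate) checks that a plain `optim.SGD` step really applies θ_new = θ_old - lr × ∇L(θ):
python
import torch
import torch.nn as nn
import torch.optim as optim

# Minimal sketch: verify that a plain SGD step matches w_new = w_old - lr * grad.
# The model, data, and learning rate here are arbitrary illustrations.
torch.manual_seed(0)
model = nn.Linear(4, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)

x = torch.randn(8, 4)
loss = model(x).pow(2).mean()
loss.backward()

w_before = model.weight.detach().clone()
grad = model.weight.grad.detach().clone()
optimizer.step()

# Should print a value close to zero: optimizer.step() applied w -= lr * grad
print((model.weight.detach() - (w_before - 0.1 * grad)).abs().max())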
Basic Usage #
Creating an Optimizer #
python
import torch
import torch.nn as nn
import torch.optim as optim
model = nn.Linear(10, 1)

optimizer = optim.SGD(model.parameters(), lr=0.01)                        # plain SGD
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)          # SGD with momentum
optimizer = optim.Adam(model.parameters(), lr=0.001)                      # Adam, default betas
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))  # Adam, betas spelled out
Training Loop #
python
import torch
import torch.nn as nn
import torch.optim as optim
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
data = torch.randn(32, 10)
target = torch.randn(32, 1)
for epoch in range(100):
    optimizer.zero_grad()              # clear gradients from the previous iteration
    output = model(data)               # forward pass
    loss = criterion(output, target)   # compute the loss
    loss.backward()                    # backward pass: compute gradients
    optimizer.step()                   # update the parameters

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
Common Optimizers #
SGD (Stochastic Gradient Descent) #
text
┌─────────────────────────────────────────────────────────────┐
│                         SGD Algorithm                        │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  Basic update:                                               │
│    θ_t = θ_{t-1} - lr × g_t                                  │
│                                                              │
│  With momentum:                                              │
│    v_t = β × v_{t-1} + g_t                                   │
│    θ_t = θ_{t-1} - lr × v_t                                  │
│                                                              │
│  Characteristics:                                            │
│    ✅ Simple and efficient                                   │
│    ✅ Often generalizes well                                 │
│    ⚠️ Learning rate must be tuned by hand                    │
│    ⚠️ Can get stuck in poor local minima                     │
│                                                              │
└─────────────────────────────────────────────────────────────┘
python
import torch.optim as optim
# Plain SGD
optimizer = optim.SGD(model.parameters(), lr=0.01)

# SGD with momentum
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Nesterov momentum
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    nesterov=True
)

# SGD with momentum and L2 weight decay
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=1e-4
)
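To connect the configuration above with the momentum formulas in the box (which match PyTorch's defaults with dampening 0), here is a small sketch that replays two momentum-SGD steps by hand and checks them against `optim.SGD`; shapes and hyperparameters are arbitrary:
python
import torch
import torch.optim as optim

# Sketch: reproduce two SGD-with-momentum steps by hand.
#   v_t = momentum * v_{t-1} + g_t   (v_0 = g_0)
#   w_t = w_{t-1} - lr * v_t
torch.manual_seed(0)
w = torch.randn(5, requires_grad=True)
opt = optim.SGD([w], lr=0.1, momentum=0.9)

w_manual = w.detach().clone()
buf = None
for _ in range(2):
    loss = (w ** 2).sum()
    opt.zero_grad()
    loss.backward()
    g = w.grad.detach().clone()
    opt.step()

    buf = g if buf is None else 0.9 * buf + g   # momentum buffer
    w_manual = w_manual - 0.1 * buf

print(torch.allclose(w.detach(), w_manual, atol=1e-6))  # expected: True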
Adam #
text
┌─────────────────────────────────────────────────────────────┐
│                        Adam Algorithm                        │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  Adaptive learning-rate optimizer:                           │
│                                                              │
│    m_t = β₁ × m_{t-1} + (1-β₁) × g_t    first-moment est.    │
│    v_t = β₂ × v_{t-1} + (1-β₂) × g_t²   second-moment est.   │
│                                                              │
│  Bias correction:                                            │
│    m̂_t = m_t / (1 - β₁^t)                                    │
│    v̂_t = v_t / (1 - β₂^t)                                    │
│                                                              │
│  Parameter update:                                           │
│    θ_t = θ_{t-1} - lr × m̂_t / (√v̂_t + ε)                     │
│                                                              │
│  Default hyperparameters:                                    │
│    - β₁ = 0.9    (first-moment decay rate)                   │
│    - β₂ = 0.999  (second-moment decay rate)                  │
│    - ε  = 1e-8   (numerical stability)                       │
│                                                              │
│  Characteristics:                                            │
│    ✅ Adaptive per-parameter learning rates                  │
│    ✅ Fast convergence                                       │
│    ✅ A solid default for most tasks                         │
│                                                              │
└─────────────────────────────────────────────────────────────┘
python
import torch.optim as optim
# Adam with default hyperparameters (except lr)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Adam with betas and eps spelled out
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    eps=1e-8
)

# Adam with L2 regularization via weight_decay
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-4
)

# AdamW: decoupled weight decay (usually preferred for Transformers)
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.001,
    weight_decay=0.01
)
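To tie the update rule and bias correction back to the code, the sketch below performs a single Adam step by hand and compares it with `optim.Adam` (arbitrary shapes; weight decay left at its default of 0):
python
import torch
import torch.optim as optim

# Sketch: one Adam step computed by hand, with bias correction.
torch.manual_seed(0)
w = torch.randn(5, requires_grad=True)
lr, beta1, beta2, eps = 1e-3, 0.9, 0.999, 1e-8
opt = optim.Adam([w], lr=lr, betas=(beta1, beta2), eps=eps)

w_before = w.detach().clone()
loss = (w ** 2).sum()
loss.backward()
g = w.grad.detach().clone()
opt.step()

# Manual update for step t = 1
m = (1 - beta1) * g
v = (1 - beta2) * g ** 2
m_hat = m / (1 - beta1 ** 1)
v_hat = v / (1 - beta2 ** 1)
w_manual = w_before - lr * m_hat / (v_hat.sqrt() + eps)

print(torch.allclose(w.detach(), w_manual))  # expected: True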
Other Optimizers #
python
import torch.optim as optim
optimizer = optim.RMSprop(model.parameters(), lr=0.01)    # RMSprop
optimizer = optim.Adagrad(model.parameters(), lr=0.01)    # Adagrad
optimizer = optim.Adadelta(model.parameters(), lr=1.0)    # Adadelta
optimizer = optim.Adamax(model.parameters(), lr=0.002)    # Adamax (Adam with the infinity norm)
optimizer = optim.NAdam(model.parameters(), lr=0.002)     # NAdam (Adam + Nesterov momentum)
Optimizer Comparison #
text
┌─────────────────────────────────────────────────────────────┐
│                    Choosing an Optimizer                     │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  ┌──────────────┬───────────────┬────────────────────────┐  │
│  │ Optimizer    │ Typical use   │ Characteristics        │  │
│  ├──────────────┼───────────────┼────────────────────────┤  │
│  │ SGD          │ CV tasks      │ Good generalization    │  │
│  │ SGD+Momentum │ General       │ Stable                 │  │
│  │ Adam         │ NLP, general  │ Fast convergence       │  │
│  │ AdamW        │ Transformers  │ Correct weight decay   │  │
│  │ RMSprop      │ RNNs          │ Suits non-stationary   │  │
│  └──────────────┴───────────────┴────────────────────────┘  │
│                                                              │
│  Recommendations:                                            │
│    - Unsure: Adam or AdamW                                   │
│    - CV tasks: SGD + Momentum                                │
│    - NLP tasks: AdamW                                        │
│    - Research: compare several optimizers                    │
│                                                              │
└─────────────────────────────────────────────────────────────┘
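The table is only a rough guide; when in doubt it is usually quickest to measure. The sketch below (synthetic data, untuned hyperparameters, and a hypothetical `run` helper) trains the same toy regression with two optimizers and prints the final losses:
python
import torch
import torch.nn as nn
import torch.optim as optim

# Sketch: a quick, non-rigorous comparison of two optimizers on a toy problem.
# Real conclusions require proper tuning of each optimizer.
def run(optimizer_cls, **kwargs):
    torch.manual_seed(0)
    model = nn.Linear(10, 1)
    opt = optimizer_cls(model.parameters(), **kwargs)
    criterion = nn.MSELoss()
    x, y = torch.randn(256, 10), torch.randn(256, 1)
    for _ in range(200):
        opt.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        opt.step()
    return loss.item()

print("SGD :", run(optim.SGD, lr=0.01, momentum=0.9))
print("Adam:", run(optim.Adam, lr=0.001))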
Learning Rate Schedulers #
Why Schedule the Learning Rate? #
text
┌─────────────────────────────────────────────────────────────┐
│                Why the Learning Rate Matters                 │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  Learning rate too large:                                    │
│    - Unstable training                                       │
│    - May diverge                                             │
│    - Hard to converge                                        │
│                                                              │
│  Learning rate too small:                                    │
│    - Very slow convergence                                   │
│    - May get stuck in poor local minima                      │
│                                                              │
│  Scheduling strategy:                                        │
│    - Early training: larger learning rate, fast progress     │
│    - Late training: smaller learning rate, fine adjustment   │
│                                                              │
└─────────────────────────────────────────────────────────────┘
StepLR #
python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
for epoch in range(100):
    train(...)          # placeholder: train for one epoch
    optimizer.step()
    scheduler.step()    # advance the schedule once per epoch, after the optimizer updates

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, LR: {scheduler.get_last_lr()[0]:.6f}")
MultiStepLR #
python
from torch.optim.lr_scheduler import MultiStepLR
scheduler = MultiStepLR(
    optimizer,
    milestones=[30, 60, 90],   # epochs at which the LR is multiplied by gamma
    gamma=0.1
)

for epoch in range(100):
    train(...)
    optimizer.step()
    scheduler.step()
ExponentialLR #
python
from torch.optim.lr_scheduler import ExponentialLR
scheduler = ExponentialLR(optimizer, gamma=0.9)
for epoch in range(100):
    train(...)
    optimizer.step()
    scheduler.step()
CosineAnnealingLR #
python
from torch.optim.lr_scheduler import CosineAnnealingLR
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=100,
    eta_min=0.0001
)

for epoch in range(100):
    train(...)
    optimizer.step()
    scheduler.step()
ReduceLROnPlateau #
python
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',      # the monitored metric should decrease
    factor=0.1,      # multiply the LR by this factor on a plateau
    patience=10,     # epochs to wait without improvement
    verbose=True     # note: deprecated in newer PyTorch releases
)

for epoch in range(100):
    train_loss = train(...)    # placeholder; in practice the validation loss is usually monitored
    optimizer.step()
    scheduler.step(train_loss) # this scheduler needs the monitored metric passed in
OneCycleLR #
python
from torch.optim.lr_scheduler import OneCycleLR
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.1,
    total_steps=1000
)

# OneCycleLR is stepped once per batch (per optimizer step), not once per epoch
for step in range(1000):
    train(...)
    optimizer.step()
    scheduler.step()
CosineAnnealingWarmRestarts #
python
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,        # epochs until the first restart
    T_mult=2,      # each cycle is twice as long as the previous one
    eta_min=0.0001
)

for epoch in range(100):
    train(...)
    optimizer.step()
    scheduler.step()
Scheduler Comparison #
text
┌─────────────────────────────────────────────────────────────┐
│                     Choosing a Scheduler                     │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  StepLR / MultiStepLR                                        │
│    - Simple and intuitive                                    │
│    - Good when the training length is known in advance       │
│                                                              │
│  CosineAnnealingLR                                           │
│    - Smooth decay                                            │
│    - Common choice for CV tasks                              │
│                                                              │
│  ReduceLROnPlateau                                           │
│    - Adaptive adjustment                                     │
│    - Driven by the validation loss                           │
│                                                              │
│  OneCycleLR                                                  │
│    - Fast training                                           │
│    - Few hyperparameters                                     │
│                                                              │
│  CosineAnnealingWarmRestarts                                 │
│    - Periodic restarts                                       │
│    - Can help escape poor local minima                       │
│                                                              │
└─────────────────────────────────────────────────────────────┘
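Before committing to a scheduler, it can help to look at the learning-rate curve it actually produces. The sketch below (a hypothetical `lr_curve` helper with arbitrary settings) prints the curves of StepLR and CosineAnnealingLR for comparison:
python
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR

# Sketch: record the per-epoch learning rate produced by a scheduler.
def lr_curve(scheduler_cls, epochs=100, **kwargs):
    model = nn.Linear(1, 1)
    opt = optim.SGD(model.parameters(), lr=0.1)
    sched = scheduler_cls(opt, **kwargs)
    lrs = []
    for _ in range(epochs):
        opt.step()                       # placeholder update (no gradients)
        lrs.append(sched.get_last_lr()[0])
        sched.step()
    return lrs

step_lrs = lr_curve(StepLR, step_size=30, gamma=0.1)
cos_lrs = lr_curve(CosineAnnealingLR, T_max=100, eta_min=1e-4)
print(step_lrs[::10])
print([round(lr, 4) for lr in cos_lrs[::10]])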
Gradient Clipping #
python
import torch
import torch.nn as nn
import torch.optim as optim
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

for data, target in dataloader:  # `dataloader` is assumed to be defined
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()

    # Clip after backward() and before step(); normally you pick one of the two
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    # torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)

    optimizer.step()
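`clip_grad_norm_` returns the total gradient norm measured before clipping, which is convenient for logging how often clipping kicks in. A small sketch with arbitrary shapes and threshold:
python
import torch
import torch.nn as nn

# Sketch: inspect the gradient norm before and after norm-based clipping.
model = nn.Linear(10, 1)
loss = model(torch.randn(4, 10)).pow(2).sum() * 100  # deliberately large loss
loss.backward()

total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
clipped_norm = torch.sqrt(sum(p.grad.norm() ** 2 for p in model.parameters()))
print(f"before clipping: {total_norm.item():.2f}, after clipping: {clipped_norm.item():.2f}")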
Parameter Groups #
python
import torch
import torch.nn as nn
import torch.optim as optim
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10)
)

# Different learning rates per layer; the outer lr=0.01 is the default for
# any group that does not set its own
optimizer = optim.SGD([
    {'params': model[0].parameters(), 'lr': 0.01},
    {'params': model[2].parameters(), 'lr': 0.001}
], lr=0.01)

# Per-group learning rate and weight decay
optimizer = optim.SGD([
    {'params': model[0].parameters(), 'lr': 0.01, 'weight_decay': 1e-4},
    {'params': model[2].parameters(), 'lr': 0.001, 'weight_decay': 1e-5}
])
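A common real-world grouping pattern is to apply weight decay to weight matrices but not to biases or normalization parameters. The sketch below uses a simple 1-D-tensor heuristic for that split (an assumption of this example, not a PyTorch rule):
python
import torch.nn as nn
import torch.optim as optim

# Sketch: decay weight matrices, skip decay for biases and norm parameters.
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.BatchNorm1d(256),
    nn.ReLU(),
    nn.Linear(256, 10),
)

decay, no_decay = [], []
for name, param in model.named_parameters():
    # biases and norm scales/shifts are 1-D tensors
    (no_decay if param.ndim == 1 else decay).append(param)

optimizer = optim.AdamW([
    {'params': decay, 'weight_decay': 0.01},
    {'params': no_decay, 'weight_decay': 0.0},
], lr=1e-3)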
Optimizer State #
python
import torch
import torch.nn as nn
import torch.optim as optim
model = nn.Linear(10, 1)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run a few steps so that Adam accumulates per-parameter state (m and v)
for epoch in range(10):
    optimizer.zero_grad()
    output = model(torch.randn(32, 10))
    loss = output.sum()
    loss.backward()
    optimizer.step()

# Export and restore the optimizer state
state = optimizer.state_dict()
print("Optimizer state keys:", state.keys())

optimizer2 = optim.Adam(model.parameters(), lr=0.001)
optimizer2.load_state_dict(state)
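In practice the optimizer state is usually saved together with the model and scheduler in a single checkpoint so that training can resume where it stopped. A minimal sketch (file name and epoch bookkeeping are illustrative):
python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# Sketch: save and restore a full training checkpoint.
model = nn.Linear(10, 1)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

checkpoint = {
    'epoch': 42,
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'scheduler': scheduler.state_dict(),
}
torch.save(checkpoint, 'checkpoint.pt')

# ...later, to resume:
ckpt = torch.load('checkpoint.pt')
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optimizer'])
scheduler.load_state_dict(ckpt['scheduler'])
start_epoch = ckpt['epoch'] + 1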
Practical Tips #
Learning Rate Warmup #
python
import torch
import torch.nn as nn
import torch.optim as optim
import math
def warmup_lr(optimizer, warmup_epochs, base_lr, current_epoch):
    """Linearly ramp the learning rate up to base_lr over the first warmup_epochs."""
    if current_epoch < warmup_epochs:
        lr = base_lr * (current_epoch + 1) / warmup_epochs
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)
warmup_epochs = 5

for epoch in range(100):
    warmup_lr(optimizer, warmup_epochs, 0.1, epoch)
    train(...)  # placeholder: train for one epoch
    optimizer.step()
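The same warmup idea can also be expressed with the built-in `LambdaLR` scheduler, which makes it easy to chain warmup into a cosine decay. A sketch with arbitrary warmup length and training horizon:
python
import math
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

# Sketch: linear warmup followed by cosine decay, as a LambdaLR schedule.
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)
warmup_epochs, total_epochs = 5, 100

def lr_lambda(epoch):
    if epoch < warmup_epochs:
        return (epoch + 1) / warmup_epochs               # linear warmup
    progress = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
    return 0.5 * (1.0 + math.cos(math.pi * progress))    # cosine decay

scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)
for epoch in range(total_epochs):
    # train(...)  # placeholder: one epoch of training
    optimizer.step()
    scheduler.step()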
Learning Rate Finder #
python
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
def find_lr(model, dataloader, criterion, init_lr=1e-7, final_lr=10, num_iter=100):
    """Exponentially sweep the learning rate and record the loss at each step."""
    optimizer = optim.SGD(model.parameters(), lr=init_lr)
    lr_mult = (final_lr / init_lr) ** (1 / num_iter)   # multiplicative step per iteration
    lrs = []
    losses = []
    best_loss = float('inf')

    for i, (data, target) in enumerate(dataloader):
        if i >= num_iter:
            break

        # Set the learning rate for this iteration
        lr = init_lr * (lr_mult ** i)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        lrs.append(lr)
        losses.append(loss.item())
        if loss.item() < best_loss:
            best_loss = loss.item()

    # Plot loss vs. learning rate on a log scale
    plt.plot(lrs, losses)
    plt.xscale('log')
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.show()
    return lrs, losses
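A hypothetical usage example with synthetic data, just to show the call pattern (in practice you would pass your own DataLoader):
python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Sketch: call the find_lr helper defined above on a synthetic dataset.
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
dataset = TensorDataset(torch.randn(1024, 10), torch.randn(1024, 1))
loader = DataLoader(dataset, batch_size=32, shuffle=True)
lrs, losses = find_lr(model, loader, criterion)
# A common heuristic: pick a learning rate roughly an order of magnitude below
# the point where the loss curve starts to rise sharply.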
Gradient Accumulation #
python
import torch
import torch.nn as nn
import torch.optim as optim
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.MSELoss()
accumulation_steps = 4   # effective batch size = batch size × accumulation_steps

for i, (data, target) in enumerate(dataloader):  # `dataloader` is assumed to be defined
    output = model(data)
    loss = criterion(output, target)
    loss = loss / accumulation_steps   # scale so the accumulated gradient matches a large batch
    loss.backward()                    # gradients accumulate in .grad

    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
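If the number of batches is not divisible by `accumulation_steps`, the last few gradients would never trigger an update. One way to handle that tail (shown here with synthetic data) is to also step on the final batch:
python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Sketch: gradient accumulation with an explicit step on the last (partial) window.
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.MSELoss()
loader = DataLoader(TensorDataset(torch.randn(100, 10), torch.randn(100, 1)), batch_size=8)
accumulation_steps = 4

optimizer.zero_grad()
for i, (data, target) in enumerate(loader):
    loss = criterion(model(data), target) / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0 or (i + 1) == len(loader):
        optimizer.step()
        optimizer.zero_grad()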
Complete Training Example #
python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=100)
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for data, target in dataloader:
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

    return total_loss / len(dataloader), 100. * correct / total

# `train_loader` is assumed to be a DataLoader over the training set
for epoch in range(100):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    scheduler.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%, LR: {scheduler.get_last_lr()[0]:.6f}")
Next Steps #
Now that you have the core ideas behind PyTorch optimizers, continue with Data Loading to learn how to feed training data to the model efficiently!
Last updated: 2026-03-29