PyTorch Optimizers #

What Is an Optimizer? #

The optimizer is a core component of deep-learning training: it updates the model's parameters from their gradients so as to minimize the loss function.

text
┌─────────────────────────────────────────────────────────────┐
│                  The Role of the Optimizer                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Training loop:                                             │
│                                                             │
│  1. Forward pass   ──► compute the loss                     │
│  2. Backward pass  ──► compute the gradients                │
│  3. Optimizer step ──► update the parameters                │
│                                                             │
│  Parameter update rule:                                     │
│                                                             │
│  θ_new = θ_old - lr × ∇L(θ)                                 │
│                                                             │
│  where:                                                     │
│  - θ: the model parameters                                  │
│  - lr: the learning rate                                    │
│  - ∇L(θ): gradient of the loss w.r.t. the parameters        │
│                                                             │
└─────────────────────────────────────────────────────────────┘
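
To make the update rule concrete, here is a minimal hand-rolled gradient-descent step on a toy quadratic loss. This is a sketch for illustration only; in real training the optimizer applies this update for you.

python
import torch

# One gradient-descent step by hand on L(θ) = θ₁² + θ₂²
theta = torch.tensor([2.0, -3.0], requires_grad=True)
lr = 0.1

loss = (theta ** 2).sum()
loss.backward()                  # fills theta.grad with ∇L(θ) = 2θ

with torch.no_grad():
    theta -= lr * theta.grad     # θ_new = θ_old - lr × ∇L(θ)
theta.grad.zero_()               # clear the gradient for the next step

print(theta)                     # tensor([ 1.6000, -2.4000], requires_grad=True)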

Basic Usage #

Creating an Optimizer #

python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 1)

# Plain SGD
optimizer = optim.SGD(model.parameters(), lr=0.01)

# SGD with momentum
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Adam with the default hyperparameters
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Adam with the betas spelled out explicitly
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))

The Training Loop #

python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Dummy data: a batch of 32 samples with 10 features each
data = torch.randn(32, 10)
target = torch.randn(32, 1)

for epoch in range(100):
    optimizer.zero_grad()             # 1. clear gradients from the last step

    output = model(data)              # 2. forward pass
    loss = criterion(output, target)

    loss.backward()                   # 3. backward pass: compute gradients

    optimizer.step()                  # 4. update the parameters

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Common Optimizers #

SGD (Stochastic Gradient Descent) #

text
┌─────────────────────────────────────────────────────────────┐
│                      The SGD Algorithm                      │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Basic update:                                              │
│  θ_t = θ_{t-1} - lr × g_t                                   │
│                                                             │
│  With momentum:                                             │
│  v_t = β × v_{t-1} + g_t                                    │
│  θ_t = θ_{t-1} - lr × v_t                                   │
│                                                             │
│  Characteristics:                                           │
│  ✅ Simple and efficient                                    │
│  ✅ Generalizes well                                        │
│  ⚠️ Learning rate must be tuned by hand                     │
│  ⚠️ Can get stuck in local minima                           │
│                                                             │
└─────────────────────────────────────────────────────────────┘
python
import torch.optim as optim

# Plain SGD
optimizer = optim.SGD(model.parameters(), lr=0.01)

# SGD with momentum (almost always worth enabling)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Nesterov momentum
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    nesterov=True
)

# With L2 weight decay for regularization
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=1e-4
)
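
To connect the constructor arguments to the formulas in the box above, here is a hand-rolled sketch of the momentum update (PyTorch's convention: v = β·v + g). Illustrative only; optim.SGD maintains the velocity buffer internally.

python
import torch

theta = torch.tensor([1.0], requires_grad=True)
v = torch.zeros_like(theta)
lr, beta = 0.01, 0.9

for _ in range(3):
    loss = (theta ** 2).sum()
    loss.backward()
    with torch.no_grad():
        v = beta * v + theta.grad    # v_t = β × v_{t-1} + g_t
        theta -= lr * v              # θ_t = θ_{t-1} - lr × v_t
    theta.grad.zero_()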

Adam #

text
┌─────────────────────────────────────────────────────────────┐
│                     The Adam Algorithm                      │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  An adaptive-learning-rate optimizer:                       │
│                                                             │
│  m_t = β₁ × m_{t-1} + (1-β₁) × g_t     first moment         │
│  v_t = β₂ × v_{t-1} + (1-β₂) × g_t²    second moment        │
│                                                             │
│  Bias correction:                                           │
│  m̂_t = m_t / (1 - β₁^t)                                     │
│  v̂_t = v_t / (1 - β₂^t)                                     │
│                                                             │
│  Parameter update:                                          │
│  θ_t = θ_{t-1} - lr × m̂_t / (√v̂_t + ε)                      │
│                                                             │
│  Default hyperparameters:                                   │
│  - β₁ = 0.9   (first-moment decay rate)                     │
│  - β₂ = 0.999 (second-moment decay rate)                    │
│  - ε = 1e-8   (numerical stability term)                    │
│                                                             │
│  Characteristics:                                           │
│  ✅ Adaptive per-parameter learning rates                   │
│  ✅ Fast convergence                                        │
│  ✅ A solid default for most tasks                          │
│                                                             │
└─────────────────────────────────────────────────────────────┘
python
import torch.optim as optim

# Adam with the default hyperparameters
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Spelling the defaults out explicitly
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    eps=1e-8
)

# Adam with (coupled) L2 weight decay
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-4
)

# AdamW: decoupled weight decay, preferred for Transformers
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.001,
    weight_decay=0.01
)
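
The bias correction is the easiest part of Adam to get wrong, so here is a hand-rolled sketch of a few Adam steps following the formulas above. Illustrative only; optim.Adam tracks m, v, and the step count t per parameter internally.

python
import torch

theta = torch.tensor([1.0], requires_grad=True)
m = torch.zeros_like(theta)
v = torch.zeros_like(theta)
lr, beta1, beta2, eps = 0.001, 0.9, 0.999, 1e-8

for t in range(1, 4):                            # the step count starts at 1
    loss = (theta ** 2).sum()
    loss.backward()
    with torch.no_grad():
        g = theta.grad
        m = beta1 * m + (1 - beta1) * g          # first-moment estimate
        v = beta2 * v + (1 - beta2) * g ** 2     # second-moment estimate
        m_hat = m / (1 - beta1 ** t)             # bias correction
        v_hat = v / (1 - beta2 ** t)
        theta -= lr * m_hat / (v_hat.sqrt() + eps)
    theta.grad.zero_()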

Other Optimizers #

python
import torch.optim as optim

# RMSprop: moving average of squared gradients; works well for RNNs
optimizer = optim.RMSprop(model.parameters(), lr=0.01)

# Adagrad: accumulates squared gradients; the LR decays monotonically
optimizer = optim.Adagrad(model.parameters(), lr=0.01)

# Adadelta: Adagrad variant that limits the accumulation window
optimizer = optim.Adadelta(model.parameters(), lr=1.0)

# Adamax: Adam variant based on the infinity norm
optimizer = optim.Adamax(model.parameters(), lr=0.002)

# NAdam: Adam with Nesterov momentum
optimizer = optim.NAdam(model.parameters(), lr=0.002)

Optimizer Comparison #

text
┌─────────────────────────────────────────────────────────────┐
│                  Optimizer Selection Guide                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌──────────────┬──────────────┬──────────────────────┐     │
│  │ Optimizer    │ Typical use  │ Characteristics      │     │
│  ├──────────────┼──────────────┼──────────────────────┤     │
│  │ SGD          │ CV tasks     │ Generalizes well     │     │
│  │ SGD+Momentum │ General      │ Stable               │     │
│  │ Adam         │ NLP, general │ Fast convergence     │     │
│  │ AdamW        │ Transformers │ Correct weight decay │     │
│  │ RMSprop      │ RNNs         │ Non-stationary data  │     │
│  └──────────────┴──────────────┴──────────────────────┘     │
│                                                             │
│  Recommendations:                                           │
│  - When in doubt: Adam or AdamW                             │
│  - CV tasks: SGD + Momentum                                 │
│  - NLP tasks: AdamW                                         │
│  - Research: try and compare several optimizers             │
│                                                             │
└─────────────────────────────────────────────────────────────┘

Learning Rate Schedulers #

Why Schedule the Learning Rate? #

text
┌─────────────────────────────────────────────────────────────┐
│                The Role of the Learning Rate                │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Learning rate too large:                                   │
│  - Unstable training                                        │
│  - May diverge                                              │
│  - Hard to converge                                         │
│                                                             │
│  Learning rate too small:                                   │
│  - Very slow convergence                                    │
│  - May get stuck in local minima                            │
│                                                             │
│  Scheduling strategy:                                       │
│  - Early: larger learning rate for fast progress            │
│  - Late: smaller learning rate for fine-tuning              │
│                                                             │
└─────────────────────────────────────────────────────────────┘
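
A quick way to build intuition for a schedule is to run the scheduler on a dummy model and record the learning rate it produces, with no actual training involved. A minimal sketch (the helper name lr_curve is our own):

python
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR

def lr_curve(scheduler_cls, **kwargs):
    opt = optim.SGD(nn.Linear(1, 1).parameters(), lr=0.1)
    sched = scheduler_cls(opt, **kwargs)
    curve = []
    for _ in range(100):
        curve.append(sched.get_last_lr()[0])
        opt.step()       # keep the optimizer-before-scheduler call order
        sched.step()
    return curve

print(lr_curve(StepLR, step_size=30, gamma=0.1)[::30])   # staircase decay
print(lr_curve(CosineAnnealingLR, T_max=100)[::30])      # smooth decay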

StepLR #

python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Multiply the LR by gamma every step_size epochs: 0.1 → 0.01 → 0.001 ...
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

for epoch in range(100):
    train(...)        # placeholder: forward/backward passes for one epoch
    optimizer.step()
    scheduler.step()  # advance the schedule once per epoch

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, LR: {scheduler.get_last_lr()[0]:.6f}")

MultiStepLR #

python
from torch.optim.lr_scheduler import MultiStepLR

# Multiply the LR by gamma at each milestone epoch (30, 60, 90)
scheduler = MultiStepLR(
    optimizer,
    milestones=[30, 60, 90],
    gamma=0.1
)

for epoch in range(100):
    train(...)        # placeholder: forward/backward passes for one epoch
    optimizer.step()
    scheduler.step()

ExponentialLR #

python
from torch.optim.lr_scheduler import ExponentialLR

# Multiply the LR by gamma every epoch: lr_t = lr_0 × gamma^t
scheduler = ExponentialLR(optimizer, gamma=0.9)

for epoch in range(100):
    train(...)
    optimizer.step()
    scheduler.step()

CosineAnnealingLR #

python
from torch.optim.lr_scheduler import CosineAnnealingLR

# Anneal the LR along a half cosine from the initial value to eta_min
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=100,        # epochs per half cosine (here: the whole run)
    eta_min=0.0001    # lower bound for the learning rate
)

for epoch in range(100):
    train(...)
    optimizer.step()
    scheduler.step()

ReduceLROnPlateau #

python
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Shrink the LR by `factor` once the monitored metric has stopped
# improving for `patience` epochs (mode='min': lower is better)
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=10
)

for epoch in range(100):
    train(...)                 # placeholder: forward/backward passes
    optimizer.step()
    val_loss = validate(...)   # placeholder: evaluate on validation data
    scheduler.step(val_loss)   # this scheduler is stepped with the metric

OneCycleLR #

python
from torch.optim.lr_scheduler import OneCycleLR

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)

# The LR ramps up to max_lr, then anneals back down over training
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.1,
    total_steps=1000   # total number of optimizer steps (batches)
)

for step in range(1000):
    train(...)         # placeholder: forward/backward for one batch
    optimizer.step()
    scheduler.step()   # note: stepped once per batch, not per epoch
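
If you know the number of epochs and batches per epoch rather than the total step count, OneCycleLR can be configured that way instead; the values below are illustrative:

python
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.1,
    epochs=10,           # illustrative assumption
    steps_per_epoch=100  # illustrative assumption, usually len(dataloader)
)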

CosineAnnealingWarmRestarts #

python
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

# Cosine annealing with periodic restarts: the first cycle lasts T_0
# epochs, and each later cycle is T_mult times longer (10, 20, 40, ...)
scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,
    T_mult=2,
    eta_min=0.0001
)

for epoch in range(100):
    train(...)
    optimizer.step()
    scheduler.step()

Scheduler Comparison #

text
┌─────────────────────────────────────────────────────────────┐
│                  Scheduler Selection Guide                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  StepLR / MultiStepLR                                       │
│  - Simple and intuitive                                     │
│  - Good when the training schedule is known in advance      │
│                                                             │
│  CosineAnnealingLR                                          │
│  - Smooth decay                                             │
│  - Common for CV tasks                                      │
│                                                             │
│  ReduceLROnPlateau                                          │
│  - Adapts automatically                                     │
│  - Driven by the validation loss                            │
│                                                             │
│  OneCycleLR                                                 │
│  - Fast training                                            │
│  - Few hyperparameters                                      │
│                                                             │
│  CosineAnnealingWarmRestarts                                │
│  - Periodic restarts                                        │
│  - Helps escape local minima                                │
│                                                             │
└─────────────────────────────────────────────────────────────┘

Gradient Clipping #

python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Assumes `dataloader` and `criterion` are defined elsewhere
for data, target in dataloader:
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()

    # Clip after backward() and before step(). Pick ONE strategy:

    # (a) rescale gradients so their global L2 norm is at most max_norm
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    # (b) clamp each gradient element into [-clip_value, clip_value]
    torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)

    optimizer.step()

Parameter Groups #

python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10)
)

# One learning rate per group; the trailing lr=0.01 is the default for
# any group that does not set its own
optimizer = optim.SGD([
    {'params': model[0].parameters(), 'lr': 0.01},
    {'params': model[2].parameters(), 'lr': 0.001}
], lr=0.01)

# Any optimizer option can be set per group, e.g. weight decay
optimizer = optim.SGD([
    {'params': model[0].parameters(), 'lr': 0.01, 'weight_decay': 1e-4},
    {'params': model[2].parameters(), 'lr': 0.001, 'weight_decay': 1e-5}
])
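
A common real-world use of parameter groups is to exempt biases (and often normalization parameters) from weight decay. A minimal sketch, assuming parameters whose names end in "bias" should skip decay:

python
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10)
)

# Split parameters into "decay" and "no decay" groups by name
decay, no_decay = [], []
for name, param in model.named_parameters():
    (no_decay if name.endswith("bias") else decay).append(param)

optimizer = optim.SGD([
    {'params': decay, 'weight_decay': 1e-4},
    {'params': no_decay, 'weight_decay': 0.0},
], lr=0.01)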

Optimizer State #

python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 1)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Run a few steps so Adam accumulates per-parameter state (m, v, step)
for epoch in range(10):
    optimizer.zero_grad()
    output = model(torch.randn(32, 10))
    loss = output.sum()
    loss.backward()
    optimizer.step()

# state_dict() captures hyperparameters and per-parameter buffers
state = optimizer.state_dict()
print("Optimizer state keys:", state.keys())

# Loading the state lets training resume exactly where it left off
optimizer2 = optim.Adam(model.parameters(), lr=0.001)
optimizer2.load_state_dict(state)
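
In practice the optimizer state is saved together with the model weights as a checkpoint. A minimal sketch, where the file name and the epoch bookkeeping are illustrative:

python
checkpoint = {
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'epoch': 10,
}
torch.save(checkpoint, 'checkpoint.pt')

# Later, to resume training:
checkpoint = torch.load('checkpoint.pt')
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
start_epoch = checkpoint['epoch'] + 1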

Practical Tips #

Learning Rate Warmup #

python
import torch
import torch.nn as nn
import torch.optim as optim

def warmup_lr(optimizer, warmup_epochs, base_lr, current_epoch):
    """Linearly ramp the LR up to base_lr over the first warmup_epochs."""
    if current_epoch < warmup_epochs:
        lr = base_lr * (current_epoch + 1) / warmup_epochs
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)
warmup_epochs = 5

for epoch in range(100):
    warmup_lr(optimizer, warmup_epochs, 0.1, epoch)  # no-op after warmup
    train(...)        # placeholder: forward/backward passes for one epoch
    optimizer.step()
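
The same effect is available from PyTorch's built-in schedulers by chaining a linear warmup into a cosine decay with SequentialLR; the factor and milestone values below are illustrative:

python
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)

warmup = LinearLR(optimizer, start_factor=0.2, total_iters=5)  # 5-epoch ramp
cosine = CosineAnnealingLR(optimizer, T_max=95)                # decay afterwards
scheduler = SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[5])

for epoch in range(100):
    # train(...)  # placeholder, as in the examples above
    optimizer.step()
    scheduler.step()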

Learning Rate Finder #

python
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

def find_lr(model, dataloader, criterion, init_lr=1e-7, final_lr=10, num_iter=100):
    """Exponentially sweep the LR from init_lr to final_lr, recording the loss."""
    optimizer = optim.SGD(model.parameters(), lr=init_lr)

    # Per-iteration multiplier so the LR reaches final_lr after num_iter steps
    lr_mult = (final_lr / init_lr) ** (1 / num_iter)
    lrs = []
    losses = []
    best_loss = float('inf')

    for i, (data, target) in enumerate(dataloader):
        if i >= num_iter:
            break

        lr = init_lr * (lr_mult ** i)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        lrs.append(lr)
        losses.append(loss.item())

        if loss.item() < best_loss:
            best_loss = loss.item()
        elif loss.item() > 4 * best_loss:
            break  # the loss is diverging; stop the sweep early

    # Loss vs. LR on a log axis: a good LR lies on the steep descending slope
    plt.plot(lrs, losses)
    plt.xscale('log')
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.show()

    return lrs, losses
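
A hypothetical usage sketch, assuming model, dataloader, and criterion are defined as in your training script. Run the sweep on a throwaway copy of the model, since the large final learning rates will wreck its weights:

python
import copy

lrs, losses = find_lr(copy.deepcopy(model), dataloader, criterion)
# Rule of thumb: pick an LR roughly 10× below where the loss was lowest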

Gradient Accumulation #

python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

# Assumes `dataloader` is defined; effective batch size =
# dataloader batch size × accumulation_steps
accumulation_steps = 4

optimizer.zero_grad()
for i, (data, target) in enumerate(dataloader):
    output = model(data)
    loss = criterion(output, target)
    loss = loss / accumulation_steps   # average over the virtual batch
    loss.backward()                    # gradients accumulate across batches

    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

Complete Training Example #

python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        x = x.view(-1, 784)          # flatten 28×28 images to vectors
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net().to(device)

criterion = nn.CrossEntropyLoss()
# SGD with momentum and weight decay, plus cosine decay over 100 epochs
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=100)

def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    for data, target in dataloader:
        data, target = data.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        
        # Clip gradients before the update for extra stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()
    
    return total_loss / len(dataloader), 100. * correct / total

# Assumes `train_loader` yields (images, labels) batches
for epoch in range(100):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    scheduler.step()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%, LR: {scheduler.get_last_lr()[0]:.6f}")

Next Steps #

Now that you have the core concepts of PyTorch optimizers under your belt, continue with data loading to learn how to handle training data efficiently!

Last updated: 2026-03-29