PyTorch 自动求导 #
什么是自动求导? #
自动求导(Automatic Differentiation)是深度学习框架的核心功能之一。它能够自动计算函数的导数,无需手动编写求导代码。
text
┌─────────────────────────────────────────────────────────────┐
│ 求导方式对比 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 1. 手动求导 │
│ - 需要数学推导 │
│ - 手动实现代码 │
│ - 容易出错 │
│ │
│ 2. 数值求导 │
│ - 使用差分近似 │
│ - 精度有限 │
│ - 计算量大 │
│ │
│ 3. 符号求导 │
│ - 表达式展开 │
│ - 可能产生冗余 │
│ - 不适合复杂函数 │
│ │
│ 4. 自动求导 ✅ │
│ - 自动计算精确导数 │
│ - 高效且准确 │
│ - 支持复杂计算图 │
│ │
└─────────────────────────────────────────────────────────────┘
计算图基础 #
什么是计算图? #
计算图是一种有向无环图(DAG),用于表示数学运算的依赖关系。
text
┌─────────────────────────────────────────────────────────────┐
│ 计算图示例 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 表达式: z = (x + y) * (y - 2) │
│ │
│ 计算图: │
│ │
│ x y │
│ \ /|\ │
│ \ / | \ │
│ \ / | \ │
│ \/ | \ │
│ add | sub │
│ | | | │
│ | | | │
│ +----+-----+ │
│ | │
│ mul │
│ | │
│ z │
│ │
│ 前向传播:从输入到输出 │
│ 反向传播:从输出到输入(计算梯度) │
│ │
└─────────────────────────────────────────────────────────────┘
动态计算图 #
PyTorch 使用动态计算图,每次前向传播都会构建新的计算图。
python
import torch

# Dynamic graph: z = x*y + x^2, so dz/dx = y + 2x and dz/dy = x.
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
z = x * y + x ** 2
print(z)

# Backward walks the freshly-built graph and fills the leaves' .grad.
z.backward()
print(f"dz/dx = {x.grad}")  # 3 + 2*2 = 7
print(f"dz/dy = {y.grad}")  # 2
requires_grad 属性 #
基本用法 #
python
import torch

# Plain tensors do not track gradients by default.
x = torch.tensor([1.0, 2.0, 3.0])
print(x.requires_grad)  # False

# Enable tracking at construction time...
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
print(x.requires_grad)  # True

# ...or flip it in place afterwards.
x = torch.tensor([1.0, 2.0, 3.0])
x.requires_grad_(True)
print(x.requires_grad)  # True

# requires_grad propagates through every derived tensor.
x = torch.randn(3, requires_grad=True)
y = x * 2
z = y.mean()
print(f"x.requires_grad: {x.requires_grad}")
print(f"y.requires_grad: {y.requires_grad}")
print(f"z.requires_grad: {z.requires_grad}")
梯度追踪控制 #
python
import torch

# detach() yields a tensor sharing the same data but cut out of the graph.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
print(y.requires_grad)  # True
y = x.detach()
print(y.requires_grad)  # False

# no_grad() suppresses graph construction for an entire region.
x = torch.tensor([2.0], requires_grad=True)
with torch.no_grad():
    y = x ** 2
print(y.requires_grad)  # False

# Anything computed from a detached tensor stays untracked.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
y = y.detach()
z = y + 1
print(z.requires_grad)  # False
反向传播 #
backward() 方法 #
python
import torch

# d(x^3)/dx = 3x^2 -> 12 at x = 2.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 3
y.backward()
print(x.grad)

# Chain rule through a composition: z = (x^2)^2 = x^4, dz/dx = 4x^3 -> 32.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
z = y ** 2
z.backward()
print(x.grad)
标量输出 #
python
import torch

# backward() without arguments only works on scalar outputs.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x ** 2
try:
    y.backward()
except RuntimeError as e:
    print(f"错误: {e}")

# Option 1: reduce the vector to a scalar first.
y.sum().backward()
print(x.grad)

# Option 2: pass the upstream gradient explicitly (here all ones).
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x ** 2
gradient = torch.tensor([1.0, 1.0, 1.0])
y.backward(gradient)
print(x.grad)
雅可比矩阵 #
text
┌─────────────────────────────────────────────────────────────┐
│ 雅可比矩阵 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 对于向量函数 y = f(x),其中 x ∈ R^n, y ∈ R^m │
│ │
│ 雅可比矩阵 J: │
│ │
│ │ ∂y₁/∂x₁ ∂y₁/∂x₂ ... ∂y₁/∂xₙ │ │
│ J = │ ∂y₂/∂x₁ ∂y₂/∂x₂ ... ∂y₂/∂xₙ │ │
│ │ ... ... ... ... │ │
│ │ ∂yₘ/∂x₁ ∂yₘ/∂x₂ ... ∂yₘ/∂xₙ │ │
│ │
│ 反向传播计算: │
│ v = ∂L/∂y (上游梯度) │
│ ∂L/∂x = J^T · v │
│ │
└─────────────────────────────────────────────────────────────┘
python
import torch

# backward(v) computes the vector-Jacobian product v^T · J; here J = diag(2).
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = 2 * x + 1
v = torch.tensor([1.0, 1.0, 1.0])
y.backward(v)
print(x.grad)  # [2, 2, 2]

# A non-uniform v weights each output component differently.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = 2 * x + 1
v = torch.tensor([1.0, 2.0, 3.0])
y.backward(v)
print(x.grad)  # [2, 4, 6]
梯度累积 #
python
import torch

# Gradients ACCUMULATE into .grad across backward() calls.
x = torch.tensor([2.0], requires_grad=True)
for i in range(3):
    y = x ** 2
    y.backward()
    print(f"第 {i+1} 次: x.grad = {x.grad}")  # 4, 8, 12 — accumulated

# Zero the gradient before each pass to get the fresh derivative.
x = torch.tensor([2.0], requires_grad=True)
for i in range(3):
    if x.grad is not None:
        x.grad.zero_()
    y = x ** 2
    y.backward()
    print(f"第 {i+1} 次: x.grad = {x.grad}")  # always 4

# retain_graph=True keeps y1's graph alive for a possible re-use.
# NOTE: the second backward() ADDS dy2/dx onto the existing gradient,
# so the final value is dy1/dx + dy2/dx = 4 + 12 = 16, not dy2/dx alone.
# (The original print label claimed "dy2/dx", which was misleading.)
x = torch.tensor([2.0], requires_grad=True)
y1 = x ** 2
y2 = x ** 3
y1.backward(retain_graph=True)
print(f"dy1/dx = {x.grad}")
y2.backward()
print(f"dy1/dx + dy2/dx = {x.grad}")
计算图可视化 #
python
import torch
from torchviz import make_dot  # third-party: pip install torchviz

# Build a small graph: z = (x + y) * (y - 2).
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
z = (x + y) * (y - 2)

# Render the autograd graph to computational_graph.png.
dot = make_dot(z, params={'x': x, 'y': y})
dot.render('computational_graph', format='png')
手动追踪计算图 #
python
import torch

# Every non-leaf tensor records the function that produced it in .grad_fn.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
print(f"y.grad_fn: {y.grad_fn}")
# next_functions are the edges pointing back toward the inputs.
print(f"y.grad_fn.next_functions: {y.grad_fn.next_functions}")

z = y + 1
print(f"z.grad_fn: {z.grad_fn}")

# Walking the graph backwards: dz/dx = 2x = 4.
z.backward()
print(f"x.grad: {x.grad}")
高级自动求导 #
自定义梯度函数 #
python
import torch
from torch.autograd import Function


class SquareFunction(Function):
    """Custom autograd op computing y = x^2 with a hand-written backward."""

    @staticmethod
    def forward(ctx, x):
        # Stash the input; backward needs it to form 2x.
        ctx.save_for_backward(x)
        return x ** 2

    @staticmethod
    def backward(ctx, grad_output):
        # dy/dx = 2x, scaled by the upstream gradient (chain rule).
        (x,) = ctx.saved_tensors
        return 2 * x * grad_output


# Functions are invoked via .apply, never called directly.
square = SquareFunction.apply
x = torch.tensor([3.0], requires_grad=True)
y = square(x)
y.backward()
print(x.grad)  # 2*3 = 6
自定义 Autograd 函数 #
python
import torch


class MyReLU(torch.autograd.Function):
    """Hand-rolled ReLU: forward clamps negatives, backward zeroes their grads."""

    @staticmethod
    def forward(ctx, x):
        # Save the input so backward knows where x was negative.
        ctx.save_for_backward(x)
        return x.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        # Pass the gradient through where x >= 0, block it where x < 0.
        grad_input = grad_output.clone()
        grad_input[x < 0] = 0
        return grad_input


my_relu = MyReLU.apply
x = torch.tensor([-1.0, 0.0, 1.0], requires_grad=True)
y = my_relu(x)
y.sum().backward()
print(x.grad)  # [0, 1, 1]
梯度检查 #
python
import torch
def f(x):
return x ** 3 + 2 * x ** 2 + 3 * x + 4
x = torch.tensor([2.0], requires_grad=True)
y = f(x)
y.backward()
analytical_grad = x.grad.item()
h = 1e-5
numerical_grad = (f(x + h) - f(x - h)) / (2 * h)
numerical_grad = numerical_grad.item()
print(f"解析梯度: {analytical_grad}")
print(f"数值梯度: {numerical_grad}")
print(f"差异: {abs(analytical_grad - numerical_grad)}")
from torch.autograd import gradcheck
def f(x):
return x ** 2
x = torch.tensor([2.0], requires_grad=True, dtype=torch.double)
print(gradcheck(f, x))
梯度裁剪 #
为什么需要梯度裁剪? #
text
┌─────────────────────────────────────────────────────────────┐
│ 梯度爆炸问题 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 深度网络中,梯度可能在反向传播中指数级增长: │
│ │
│ Layer 1 ──► Layer 2 ──► Layer 3 ──► ... │
│ │ │ │ │
│ grad_1 grad_2 grad_3 │
│ │
│ 如果每层对梯度的放大系数 > 1,梯度会逐层指数级增大(爆炸) │
│ 如果每层对梯度的放大系数 < 1,梯度会逐层指数级缩小(消失) │
│ │
│ 梯度裁剪解决方案: │
│ 限制梯度的范数,防止爆炸 │
│ │
└─────────────────────────────────────────────────────────────┘
梯度裁剪方法 #
python
import torch
import torch.nn as nn

# --- Clip by total norm ---
x = torch.randn(10, requires_grad=True)
y = (x ** 2).sum()
y.backward()
print(f"裁剪前梯度范数: {x.grad.norm()}")
# clip_grad_norm_ clips the gradients IN PLACE and returns the total norm
# as it was BEFORE clipping. The original code assigned that scalar back
# to x.grad, destroying the clipped gradient — never capture the return
# value into .grad.
torch.nn.utils.clip_grad_norm_([x], max_norm=1.0)
print(f"裁剪后梯度范数: {x.grad.norm()}")

# --- Clip element-wise by value ---
x = torch.randn(10, requires_grad=True)
y = (x ** 2).sum()
y.backward()
torch.nn.utils.clip_grad_value_([x], clip_value=0.5)
print(f"裁剪后梯度: {x.grad}")
在训练中使用 #
python
import torch
import torch.nn as nn
import torch.optim as optim

# One training step showing where clipping sits in the loop.
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

data = torch.randn(32, 10)
target = torch.randn(32, 1)

optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
# Clip AFTER backward (gradients exist) and BEFORE step (gradients applied).
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
二阶导数 #
python
import torch

# create_graph=True builds a graph for the gradient itself,
# which is what enables differentiating it again.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 3
grad = torch.autograd.grad(y, x, create_graph=True)[0]  # 3x^2 = 12
print(f"一阶导数: {grad}")
grad2 = torch.autograd.grad(grad, x)[0]                 # 6x = 12
print(f"二阶导数: {grad2}")

# Third-order derivative of x^4.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 4
grad = torch.autograd.grad(y, x, create_graph=True)[0]    # 4x^3 = 32
grad2 = torch.autograd.grad(grad, x, create_graph=True)[0]  # 12x^2 = 48
grad3 = torch.autograd.grad(grad2, x)[0]                  # 24x = 48
print(f"一阶导数: {grad}")
print(f"二阶导数: {grad2}")
print(f"三阶导数: {grad3}")
实用技巧 #
禁用梯度计算 #
python
import torch

# @torch.no_grad() turns off graph building for the whole function body.
x = torch.tensor([2.0], requires_grad=True)


@torch.no_grad()
def inference(x):
    return x ** 2


y = inference(x)
print(y.requires_grad)  # False

# inference_mode() is the stricter, faster variant of no_grad().
x = torch.tensor([2.0], requires_grad=True)
with torch.inference_mode():
    y = x ** 2
print(y.requires_grad)  # False
梯度清零 #
python
import torch

x = torch.tensor([2.0], requires_grad=True)
# .zero_() clears an existing accumulated gradient in place
# (guarded because .grad is None before the first backward).
if x.grad is not None:
    x.grad.zero_()
y = x ** 2
y.backward()
print(x.grad)  # 2x = 4

# Setting .grad to None also resets it (and releases the memory).
x.grad = None
y = x ** 3
y.backward()
print(x.grad)  # 3x^2 = 12
获取中间梯度 #
python
import torch

# Only leaf tensors keep .grad by default; intermediates discard it.
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
z = x * y
w = z ** 2
w.backward()
print(f"dw/dx: {x.grad}")  # 2z*y = 36
print(f"dw/dy: {y.grad}")  # 2z*x = 24

# retain_grad() asks an intermediate tensor to keep its gradient too.
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
z = x * y
z.retain_grad()
w = z ** 2
w.backward()
print(f"dw/dz: {z.grad}")  # 2z = 12
常见问题 #
问题 1:梯度为 None #
python
import torch

# Without requires_grad=True no graph is built, so backward() RAISES
# (there is no leaf requiring grad). The original demo called backward()
# unguarded here and crashed before ever reaching the print — wrap it
# like the other troubleshooting sections do.
x = torch.tensor([2.0])
y = x ** 2
try:
    y.backward()
except RuntimeError as e:
    print(f"错误: {e}")
print(x.grad)  # None — nothing was tracked

# Fix: mark the leaf tensor as requiring gradients.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
y.backward()
print(x.grad)  # 2x = 4
问题 2:多次 backward 报错 #
python
import torch

# The graph is freed after backward(); calling it a second time raises.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
y.backward()
print(x.grad)
try:
    y.backward()
except RuntimeError as e:
    print(f"错误: {e}")

# retain_graph=True keeps the graph alive; note the gradients then
# accumulate across the two calls (4 -> 8).
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
y.backward(retain_graph=True)
print(x.grad)
y.backward()
print(x.grad)
问题 3:inplace 操作 #
python
import torch

# In-place ops on tensors that autograd still needs can raise a
# version-counter error. NOTE(review): whether this particular `y += 1`
# actually errors depends on what pow's backward saved — treat in-place
# edits on graph tensors as unsafe in general; verify on your version.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
try:
    y += 1
    y.backward()
except RuntimeError as e:
    print(f"错误: {e}")

# Safe alternative: out-of-place addition creates a new tensor.
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2
y = y + 1
y.backward()
print(x.grad)  # 2x = 4
下一步 #
现在你已经掌握了 PyTorch 自动求导的核心概念,接下来学习 神经网络模块,开始构建你的第一个神经网络!
最后更新:2026-03-29