PyTorch 卷积神经网络 #
CNN 简介 #
卷积神经网络(Convolutional Neural Network, CNN)是一类专门用于处理具有网格结构数据的神经网络,在图像识别、目标检测等领域取得了巨大成功。
text
┌─────────────────────────────────────────────────────────────┐
│ CNN 核心思想 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 传统全连接网络的问题: │
│ - 参数量巨大:224×224×3 = 150,528 个输入 │
│ - 忽略空间结构 │
│ - 无法处理平移不变性 │
│ │
│ CNN 的解决方案: │
│ - 局部连接:只关注局部区域 │
│ - 权值共享:同一滤波器扫描整张图 │
│ - 层次特征:从低级到高级特征 │
│ │
│ CNN 架构: │
│ 输入图像 ──► 卷积层 ──► 池化层 ──► 卷积层 ──► 池化层 │
│ ──► 全连接层 ──► 输出 │
│ │
└─────────────────────────────────────────────────────────────┘
卷积层 #
卷积操作原理 #
text
┌─────────────────────────────────────────────────────────────┐
│ 卷积操作示意 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 输入图像 (5×5) 卷积核 (3×3) 输出特征图 (3×3) │
│ ┌─┬─┬─┬─┬─┐ ┌─┬─┬─┐ ┌─┬─┬─┐ │
│ │1│1│1│0│0│ │1│0│1│ │4│3│4│ │
│ ├─┼─┼─┼─┼─┤ ├─┼─┼─┤ ├─┼─┼─┤ │
│ │0│1│1│1│0│ * │0│1│0│ = │2│4│3│ │
│ ├─┼─┼─┼─┼─┤ ├─┼─┼─┤ ├─┼─┼─┤ │
│ │0│0│1│1│1│ │1│0│1│ │2│3│4│ │
│ ├─┼─┼─┼─┼─┤ └─┴─┴─┘ └─┴─┴─┘ │
│ │0│0│1│1│0│ │
│ ├─┼─┼─┼─┼─┤ 计算方式: │
│ │0│1│1│0│0│ 1×1+1×0+1×1+ │
│ └─┴─┴─┴─┴─┘ 0×0+1×1+1×0+ │
│ 0×1+0×0+1×1 = 4 │
│ │
└─────────────────────────────────────────────────────────────┘
nn.Conv2d 参数 #
python
import torch
import torch.nn as nn
# Every nn.Conv2d hyper-parameter spelled out (all but the channel counts
# and kernel size are the defaults).
conv = nn.Conv2d(
    in_channels=3,        # channels of the incoming feature map (e.g. RGB)
    out_channels=64,      # number of learned filters = output channels
    kernel_size=3,        # 3x3 spatial window
    stride=1,             # slide one pixel per step
    padding=0,            # no zero-padding around the border
    dilation=1,           # dense (non-dilated) kernel
    groups=1,             # every output channel sees every input channel
    bias=True,            # one additive bias per filter
    padding_mode='zeros'  # what to pad with when padding > 0
)
# Weight is (out_channels, in_channels, kH, kW); bias is (out_channels,).
print(f"权重形状: {conv.weight.shape}")
print(f"偏置形状: {conv.bias.shape}")
输出尺寸计算 #
text
┌─────────────────────────────────────────────────────────────┐
│ 输出尺寸公式 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 输出尺寸 = floor((W - K + 2P) / S) + 1 │
│ │
│ 其中: │
│ - W: 输入尺寸 │
│ - K: 卷积核大小 │
│ - P: 填充大小 │
│ - S: 步幅 │
│ │
│ 示例: │
│ 输入: 224×224, 卷积核: 3×3, 填充: 1, 步幅: 1 │
│ 输出: (224 - 3 + 2×1) / 1 + 1 = 224 │
│ │
│ 输入: 224×224, 卷积核: 3×3, 填充: 0, 步幅: 2 │
│   输出: floor((224 - 3 + 0) / 2) + 1 = 111                  │
│ │
└─────────────────────────────────────────────────────────────┘
python
import torch
import torch.nn as nn
# One fake RGB image in NCHW layout.
x = torch.randn(1, 3, 224, 224)

# kernel 3, pad 1, stride 1: (224 - 3 + 2) / 1 + 1 = 224 -> size preserved
conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
print(f"Padding=1, Stride=1: {conv1(x).shape}")

# kernel 3, pad 1, stride 2: floor((224 - 3 + 2) / 2) + 1 = 112 -> halved
conv2 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1)
print(f"Padding=1, Stride=2: {conv2(x).shape}")

# kernel 7, pad 3, stride 2: floor((224 - 7 + 6) / 2) + 1 = 112 -> halved
conv3 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
print(f"Padding=3, Stride=2, Kernel=7: {conv3(x).shape}")
不同填充模式 #
python
import torch
import torch.nn as nn
x = torch.randn(1, 3, 224, 224)

# padding='same' picks the padding that preserves the spatial size
# (only allowed with stride 1).
conv_same = nn.Conv2d(3, 64, kernel_size=3, padding='same')
print(f"Same padding: {conv_same(x).shape}")

# padding='valid' means no padding at all: 224 - 3 + 1 = 222.
conv_valid = nn.Conv2d(3, 64, kernel_size=3, padding='valid')
print(f"Valid padding: {conv_valid(x).shape}")
空洞卷积 #
python
import torch
import torch.nn as nn
x = torch.randn(1, 3, 224, 224)

# dilation=2 spreads the 3x3 taps apart (effective receptive field 5x5);
# padding=2 compensates, so both layers keep the 224x224 resolution.
conv_normal = nn.Conv2d(3, 64, kernel_size=3, padding=1, dilation=1)
conv_dilated = nn.Conv2d(3, 64, kernel_size=3, padding=2, dilation=2)

print(f"普通卷积: {conv_normal(x).shape}")
print(f"空洞卷积: {conv_dilated(x).shape}")
分组卷积 #
python
import torch
import torch.nn as nn
x = torch.randn(1, 64, 56, 56)

# groups=1: full connectivity         -> 64*128*3*3 + 128 weights
# groups=2: two independent halves    -> weight count cut in half
# groups=64 with out=in: depthwise    -> one 3x3 filter per channel
conv_normal = nn.Conv2d(64, 128, kernel_size=3, padding=1, groups=1)
conv_grouped = nn.Conv2d(64, 128, kernel_size=3, padding=1, groups=2)
conv_depthwise = nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=64)

def _param_count(module):
    # Total number of learnable scalars in a module.
    return sum(p.numel() for p in module.parameters())

print(f"普通卷积参数: {_param_count(conv_normal)}")
print(f"分组卷积参数: {_param_count(conv_grouped)}")
print(f"深度卷积参数: {_param_count(conv_depthwise)}")
池化层 #
最大池化 #
python
import torch
import torch.nn as nn
x = torch.randn(1, 64, 56, 56)

# 2x2 window, stride 2: 56 -> 28.
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
print(f"MaxPool 输出: {maxpool(x).shape}")

# 3x3 window, stride 2, pad 1: floor((56 + 2 - 3) / 2) + 1 = 28 as well.
maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
print(f"MaxPool (k=3, s=2, p=1): {maxpool(x).shape}")
平均池化 #
python
import torch
import torch.nn as nn
x = torch.randn(1, 64, 56, 56)

# Average pooling: 2x2 window, stride 2 halves each spatial dimension.
avgpool = nn.AvgPool2d(kernel_size=2, stride=2)
print(f"AvgPool 输出: {avgpool(x).shape}")

# Global average pooling: collapse each channel map to a single value.
global_avgpool = nn.AdaptiveAvgPool2d((1, 1))
print(f"Global AvgPool: {global_avgpool(x).shape}")
自适应池化 #
python
import torch
import torch.nn as nn
x = torch.randn(1, 64, 56, 56)

# Adaptive pooling: you give the OUTPUT size and PyTorch derives the
# window/stride, so any input resolution maps to a fixed 7x7 grid.
adaptive_max = nn.AdaptiveMaxPool2d((7, 7))
adaptive_avg = nn.AdaptiveAvgPool2d((7, 7))

print(f"AdaptiveMaxPool: {adaptive_max(x).shape}")
print(f"AdaptiveAvgPool: {adaptive_avg(x).shape}")
经典网络架构 #
LeNet-5 #
python
import torch
import torch.nn as nn
class LeNet5(nn.Module):
    """LeNet-5 style classifier for 1x28x28 inputs (e.g. MNIST).

    Spatial trace for a 28x28 input:
    28 -(conv 5x5, pad 2)-> 28 -(pool)-> 14 -(conv 5x5)-> 10
    -(pool)-> 5 -(conv 5x5)-> 1, leaving a 120-dim feature vector.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 120, kernel_size=5),
            nn.ReLU()
        )
        self.classifier = nn.Sequential(
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, num_classes)
        )

    def forward(self, x):
        feats = self.features(x)
        flat = feats.view(feats.size(0), -1)  # (N, 120)
        return self.classifier(flat)
# Smoke test: one fake grayscale digit through the network.
model = LeNet5()
x = torch.randn(1, 1, 28, 28)
print(f"LeNet-5 输出: {model(x).shape}")
AlexNet #
python
import torch
import torch.nn as nn
class AlexNet(nn.Module):
    """AlexNet for 3x224x224 inputs (torchvision-style layout).

    Three pieces: a convolutional feature extractor, an adaptive average
    pool that fixes the feature map at 6x6 regardless of input size, and
    a dropout-regularized fully connected classifier.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = nn.Sequential(
            # 224 -> 55 (11x11 conv, stride 4) -> 27 (pool)
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # 27 -> 27 (5x5 conv, pad 2) -> 13 (pool)
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # three 3x3 convs at 13x13, then a final pool down to 6x6
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        feats = self.avgpool(self.features(x))
        return self.classifier(torch.flatten(feats, 1))
VGG #
python
import torch
import torch.nn as nn
class VGGBlock(nn.Module):
    """One VGG stage: num_convs (3x3 conv + ReLU) pairs, then a 2x2 max-pool.

    The first conv maps in_channels -> out_channels; the rest keep
    out_channels. The trailing pool halves the spatial resolution.
    """

    def __init__(self, in_channels, out_channels, num_convs):
        super().__init__()
        layers = []
        c_in = in_channels
        for _ in range(num_convs):
            layers += [
                nn.Conv2d(c_in, out_channels, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
            ]
            c_in = out_channels  # subsequent convs are channel-preserving
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        return self.block(x)
class VGG16(nn.Module):
    """VGG-16: five pooling stages of stacked 3x3 convs + a 3-layer MLP head.

    Channel plan 3 -> 64 -> 128 -> 256 -> 512 -> 512 with 2+2+3+3+3 = 13
    convolutions; each stage halves the spatial resolution.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = nn.Sequential(
            VGGBlock(3, 64, 2),
            VGGBlock(64, 128, 2),
            VGGBlock(128, 256, 3),
            VGGBlock(256, 512, 3),
            VGGBlock(512, 512, 3),
        )
        # Fix the feature map at 7x7 so the classifier input size is constant.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        pooled = self.avgpool(self.features(x))
        return self.classifier(torch.flatten(pooled, 1))
ResNet 残差块 #
python
import torch
import torch.nn as nn
class BasicBlock(nn.Module):
    """ResNet basic residual block: two 3x3 convs plus a shortcut.

    The shortcut is the identity by default; when the stride or channel
    count changes, it becomes a 1x1 conv + BatchNorm projection so the
    addition shapes match. Convs carry no bias because each is followed
    by a BatchNorm with its own learnable shift.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        else:
            self.shortcut = nn.Sequential()  # identity mapping

    def forward(self, x):
        residual = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + residual)
class ResNet(nn.Module):
    """Configurable ResNet backbone (ImageNet layout).

    Args:
        block: residual block class, called as block(in_ch, out_ch, stride).
        num_blocks: blocks per stage, e.g. [2, 2, 2, 2] for ResNet-18.
        num_classes: width of the final linear classifier.
    """

    def __init__(self, block, num_blocks, num_classes=1000):
        super().__init__()
        self.in_channels = 64  # running channel count consumed by _make_layer
        # Stem: 7x7/2 conv + 3x3/2 max-pool -> 4x total downsampling.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Four stages; stages 2-4 halve the resolution via stride 2.
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        # Global average pooling + linear classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """Stack num_blocks residual blocks; only the first may downsample."""
        layers = []
        for s in [stride] + [1] * (num_blocks - 1):
            layers.append(block(self.in_channels, out_channels, s))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = torch.flatten(self.avgpool(x), 1)
        return self.fc(x)
def resnet18(num_classes=1000):
    """Build a ResNet-18: four stages of two BasicBlocks each."""
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)
# Smoke test: a 10-class ResNet-18 on one fake 224x224 RGB image.
model = resnet18(10)
x = torch.randn(1, 3, 224, 224)
print(f"ResNet-18 输出: {model(x).shape}")
批归一化 #
python
import torch
import torch.nn as nn
class ConvBlock(nn.Module):
    """Conv -> BatchNorm -> ReLU, the standard CNN building trio.

    bias=False on the conv because the following BatchNorm contributes
    its own learnable shift, making a conv bias redundant.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
        super().__init__()
        # kernel_size // 2 keeps the spatial size for odd kernels at stride 1.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size,
                              stride=stride, padding=kernel_size // 2, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))
# Smoke test: a batch of 32 feature maps, 64 -> 128 channels, size kept.
block = ConvBlock(64, 128)
x = torch.randn(32, 64, 56, 56)
out = block(x)
print(f"ConvBlock 输出: {out.shape}")
完整 CNN 示例 #
python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
class CNN(nn.Module):
    """Small VGG-style CNN for 1x28x28 images (MNIST-sized inputs).

    Two conv stages (28 -> 14 -> 7 spatially, 32 then 64 channels), each
    regularized with BatchNorm and spatial Dropout, followed by a
    BatchNorm + Dropout MLP head.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            # Stage 1: two 3x3 convs at 32 channels, then pool 28 -> 14.
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(0.25),
            # Stage 2: two 3x3 convs at 64 channels, then pool 14 -> 7.
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(0.25),
        )
        self.classifier = nn.Sequential(
            nn.Linear(64 * 7 * 7, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        feats = self.features(x)
        flat = feats.view(feats.size(0), -1)  # (N, 64*7*7)
        return self.classifier(flat)
# Prefer the GPU when one is available, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)

# Preprocessing: tensor conversion plus fixed mean/std normalization
# (constants commonly used for MNIST).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
def train(model, dataloader, criterion, optimizer, device, epoch):
    """Run one training epoch.

    Args:
        model: network to optimize (switched to train mode here).
        dataloader: iterable of (data, target) batches.
        criterion: loss function, e.g. nn.CrossEntropyLoss.
        optimizer: optimizer holding the model's parameters.
        device: device the batches are moved to.
        epoch: epoch index, used only for progress printing.

    Returns:
        (mean batch loss, accuracy in percent) over the epoch.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)
        # Standard step: clear grads, forward, loss, backward, update.
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # Track running statistics for the epoch summary.
        running_loss += loss.item()
        predicted = output.argmax(dim=1)
        total += target.size(0)
        correct += (predicted == target).sum().item()
        if batch_idx % 100 == 0:  # periodic progress line
            print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}, '
                  f'Acc: {100.*correct/total:.2f}%')
    return running_loss / len(dataloader), 100. * correct / total
# Train for five epochs (numbered 1..5) and report per-epoch statistics.
for epoch in range(1, 6):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device, epoch)
    print(f'Epoch {epoch} 完成, Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%')
下一步 #
现在你已经掌握了 PyTorch 卷积神经网络的核心概念,接下来学习 循环神经网络,了解序列数据处理!
最后更新:2026-03-29