PyTorch 卷积神经网络 #

CNN 简介 #

卷积神经网络(Convolutional Neural Network, CNN)是一类专门用于处理具有网格结构数据的神经网络,在图像识别、目标检测等领域取得了巨大成功。

text
┌─────────────────────────────────────────────────────────────┐
│                    CNN 核心思想                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  传统全连接网络的问题:                                       │
│  - 参数量巨大:224×224×3 = 150,528 个输入                   │
│  - 忽略空间结构                                             │
│  - 无法处理平移不变性                                       │
│                                                             │
│  CNN 的解决方案:                                            │
│  - 局部连接:只关注局部区域                                  │
│  - 权值共享:同一滤波器扫描整张图                            │
│  - 层次特征:从低级到高级特征                                │
│                                                             │
│  CNN 架构:                                                  │
│  输入图像 ──► 卷积层 ──► 池化层 ──► 卷积层 ──► 池化层       │
│       ──► 全连接层 ──► 输出                                  │
│                                                             │
└─────────────────────────────────────────────────────────────┘

卷积层 #

卷积操作原理 #

text
┌─────────────────────────────────────────────────────────────┐
│                    卷积操作示意                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  输入图像 (5×5)        卷积核 (3×3)       输出特征图 (3×3)  │
│  ┌─┬─┬─┬─┬─┐         ┌─┬─┬─┐           ┌─┬─┬─┐            │
│  │1│1│1│0│0│         │1│0│1│           │4│3│4│            │
│  ├─┼─┼─┼─┼─┤         ├─┼─┼─┤           ├─┼─┼─┤            │
│  │0│1│1│1│0│    *    │0│1│0│    =      │2│4│3│            │
│  ├─┼─┼─┼─┼─┤         ├─┼─┼─┤           ├─┼─┼─┤            │
│  │0│0│1│1│1│         │1│0│1│           │2│3│4│            │
│  ├─┼─┼─┼─┼─┤         └─┴─┴─┘           └─┴─┴─┘            │
│  │0│0│1│1│0│                                               │
│  ├─┼─┼─┼─┼─┤         计算方式:                            │
│  │0│1│1│0│0│         1×1+1×0+1×1+                         │
│  └─┴─┴─┴─┴─┘         0×0+1×1+1×0+                         │
│                      0×1+0×0+1×1 = 4                       │
│                                                             │
└─────────────────────────────────────────────────────────────┘

nn.Conv2d 参数 #

python
import torch
import torch.nn as nn

# A 2-D convolution: 3 input channels -> 64 learned filters, 3x3 window.
# Every remaining argument is spelled out at its default value so the full
# nn.Conv2d signature is visible at a glance.
conv = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=0,
                 dilation=1, groups=1, bias=True, padding_mode='zeros')

# weight: (out_channels, in_channels / groups, kH, kW); bias: (out_channels,)
print(f"权重形状: {conv.weight.shape}")
print(f"偏置形状: {conv.bias.shape}")

输出尺寸计算 #

text
┌─────────────────────────────────────────────────────────────┐
│                    输出尺寸公式                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  输出尺寸 = floor((W - K + 2P) / S) + 1                     │
│                                                             │
│  其中:                                                      │
│  - W: 输入尺寸                                              │
│  - K: 卷积核大小                                            │
│  - P: 填充大小                                              │
│  - S: 步幅                                                  │
│                                                             │
│  示例:                                                      │
│  输入: 224×224, 卷积核: 3×3, 填充: 1, 步幅: 1               │
│  输出: (224 - 3 + 2×1) / 1 + 1 = 224                        │
│                                                             │
│  输入: 224×224, 卷积核: 3×3, 填充: 0, 步幅: 2               │
│  输出: floor((224 - 3 + 0) / 2) + 1 = 111                   │
│                                                             │
└─────────────────────────────────────────────────────────────┘
python
import torch
import torch.nn as nn

# Dummy ImageNet-sized batch: (N, C, H, W) = (1, 3, 224, 224).
x = torch.randn(1, 3, 224, 224)

# Three instances of the output-size formula floor((W - K + 2P) / S) + 1.
conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)   # 224 -> 224
conv2 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1)   # 224 -> 112
conv3 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)   # 224 -> 112

print(f"Padding=1, Stride=1: {conv1(x).shape}")
print(f"Padding=1, Stride=2: {conv2(x).shape}")
print(f"Padding=3, Stride=2, Kernel=7: {conv3(x).shape}")

不同填充模式 #

python
import torch
import torch.nn as nn

# One RGB image at 224x224.
x = torch.randn(1, 3, 224, 224)

# 'same' pads so the output keeps the input's spatial size (requires stride 1);
# 'valid' adds no padding, so a 3x3 kernel trims one pixel from each border.
conv_same = nn.Conv2d(3, 64, kernel_size=3, padding='same')
conv_valid = nn.Conv2d(3, 64, kernel_size=3, padding='valid')

print(f"Same padding: {conv_same(x).shape}")    # 224x224
print(f"Valid padding: {conv_valid(x).shape}")  # 222x222

空洞卷积 #

python
import torch
import torch.nn as nn

# One RGB image at 224x224.
x = torch.randn(1, 3, 224, 224)

# Baseline dense 3x3 convolution; padding=1 preserves the spatial size.
conv_normal = nn.Conv2d(3, 64, kernel_size=3, padding=1, dilation=1)
# dilation=2 spreads the 3x3 taps two pixels apart (5x5 receptive field);
# padding=2 compensates so the spatial size is still unchanged.
conv_dilated = nn.Conv2d(3, 64, kernel_size=3, padding=2, dilation=2)

print(f"普通卷积: {conv_normal(x).shape}")
print(f"空洞卷积: {conv_dilated(x).shape}")

分组卷积 #

python
import torch
import torch.nn as nn

# A mid-network feature map: 64 channels at 56x56.
x = torch.randn(1, 64, 56, 56)


def _param_count(module):
    # Total number of learnable scalars (weights + biases).
    return sum(p.numel() for p in module.parameters())


# groups splits the channels into independent convolutions:
#   groups=1  -> ordinary dense convolution
#   groups=2  -> two half-width convolutions (roughly half the weights)
#   groups=64 -> depthwise: one 3x3 filter per input channel
conv_normal = nn.Conv2d(64, 128, kernel_size=3, padding=1, groups=1)
conv_grouped = nn.Conv2d(64, 128, kernel_size=3, padding=1, groups=2)
conv_depthwise = nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=64)

print(f"普通卷积参数: {_param_count(conv_normal)}")
print(f"分组卷积参数: {_param_count(conv_grouped)}")
print(f"深度卷积参数: {_param_count(conv_depthwise)}")

池化层 #

最大池化 #

python
import torch
import torch.nn as nn

# A mid-network feature map: 64 channels at 56x56.
x = torch.randn(1, 64, 56, 56)

# Non-overlapping 2x2 windows halve each spatial dimension: 56 -> 28.
maxpool = nn.MaxPool2d(2, 2)
print(f"MaxPool 输出: {maxpool(x).shape}")

# Overlapping 3x3 windows with stride 2 and padding 1 also give
# floor((56 - 3 + 2) / 2) + 1 = 28.
maxpool = nn.MaxPool2d(3, 2, 1)
print(f"MaxPool (k=3, s=2, p=1): {maxpool(x).shape}")

平均池化 #

python
import torch
import torch.nn as nn

# A mid-network feature map: 64 channels at 56x56.
x = torch.randn(1, 64, 56, 56)

# Averaging over 2x2 windows halves both spatial dimensions: 56 -> 28.
avgpool = nn.AvgPool2d(2, 2)
print(f"AvgPool 输出: {avgpool(x).shape}")

# Global average pooling collapses each channel map to a single 1x1 value.
global_avgpool = nn.AdaptiveAvgPool2d(1)
print(f"Global AvgPool: {global_avgpool(x).shape}")

自适应池化 #

python
import torch
import torch.nn as nn

# A mid-network feature map: 64 channels at 56x56.
x = torch.randn(1, 64, 56, 56)

# Adaptive pooling derives kernel/stride automatically so ANY input size
# maps to the requested output size (a single int means a square target).
adaptive_max = nn.AdaptiveMaxPool2d(7)
adaptive_avg = nn.AdaptiveAvgPool2d(7)

print(f"AdaptiveMaxPool: {adaptive_max(x).shape}")
print(f"AdaptiveAvgPool: {adaptive_avg(x).shape}")

经典网络架构 #

LeNet-5 #

python
import torch
import torch.nn as nn

class LeNet5(nn.Module):
    """LeNet-5 classifier for 28x28 single-channel images (e.g. MNIST).

    Three conv stages reduce the input to a 120-dim vector (1x1 spatial),
    then a small fully connected head produces the class scores.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        feature_layers = [
            nn.Conv2d(1, 6, kernel_size=5, padding=2),  # 28x28 -> 28x28
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),      # -> 14x14
            nn.Conv2d(6, 16, kernel_size=5),            # -> 10x10
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),      # -> 5x5
            nn.Conv2d(16, 120, kernel_size=5),          # -> 1x1
            nn.ReLU(),
        ]
        self.features = nn.Sequential(*feature_layers)
        self.classifier = nn.Sequential(
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, num_classes),
        )

    def forward(self, x):
        feats = self.features(x)
        flat = feats.view(feats.size(0), -1)  # (N, 120, 1, 1) -> (N, 120)
        return self.classifier(flat)

# Smoke test: one 28x28 grayscale image; expect a (1, 10) score tensor.
model = LeNet5()
x = torch.randn(1, 1, 28, 28)
print(f"LeNet-5 输出: {model(x).shape}")

AlexNet #

python
import torch
import torch.nn as nn

class AlexNet(nn.Module):
    """AlexNet image classifier (Krizhevsky et al., 2012).

    Expects (N, 3, H, W) input; the adaptive pool fixes the classifier's
    input at 256 feature maps of 6x6 regardless of H and W.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        # Per conv layer: (out_channels, kernel, stride, padding, pool_after).
        conv_cfg = [
            (64, 11, 4, 2, True),
            (192, 5, 1, 2, True),
            (384, 3, 1, 1, False),
            (256, 3, 1, 1, False),
            (256, 3, 1, 1, True),
        ]
        layers = []
        in_ch = 3
        for out_ch, k, s, p, pool in conv_cfg:
            layers.append(nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=s, padding=p))
            layers.append(nn.ReLU(inplace=True))
            if pool:
                layers.append(nn.MaxPool2d(kernel_size=3, stride=2))
            in_ch = out_ch
        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        feats = self.features(x)
        pooled = self.avgpool(feats)
        return self.classifier(torch.flatten(pooled, 1))

VGG #

python
import torch
import torch.nn as nn

class VGGBlock(nn.Module):
    """One VGG stage: num_convs (3x3 conv + ReLU) pairs, then a 2x2 max-pool.

    Same-padding convs keep the spatial size; the final pool halves it.
    Only the first conv changes the channel count (in_channels -> out_channels).
    """

    def __init__(self, in_channels, out_channels, num_convs):
        super().__init__()
        modules = []
        channels = in_channels
        for _ in range(num_convs):
            modules.append(nn.Conv2d(channels, out_channels,
                                     kernel_size=3, padding=1))
            modules.append(nn.ReLU(inplace=True))
            channels = out_channels  # subsequent convs are out -> out
        modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.block = nn.Sequential(*modules)

    def forward(self, x):
        return self.block(x)

class VGG16(nn.Module):
    """VGG-16: five VGGBlock stages (13 convs total) plus a 3-layer head.

    Each stage halves the spatial size; the adaptive pool fixes the head's
    input at 512 feature maps of 7x7.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        # Per stage: (in_channels, out_channels, num_convs).
        stage_cfg = [
            (3, 64, 2),
            (64, 128, 2),
            (128, 256, 3),
            (256, 512, 3),
            (512, 512, 3),
        ]
        self.features = nn.Sequential(*[VGGBlock(*cfg) for cfg in stage_cfg])
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        feats = self.features(x)
        pooled = self.avgpool(feats)
        return self.classifier(torch.flatten(pooled, 1))

ResNet 残差块 #

python
import torch
import torch.nn as nn

class BasicBlock(nn.Module):
    """ResNet basic residual block: two 3x3 convs plus a skip connection.

    When stride != 1 or the channel count changes, the skip path becomes a
    1x1 projection (conv + BN) so the elementwise addition shapes match;
    otherwise it is the identity (an empty Sequential).
    Convs carry no bias because each is followed by BatchNorm.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        residual = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x)))))
        # Add the (possibly projected) skip path, then the final activation.
        return self.relu(residual + self.shortcut(x))

class ResNet(nn.Module):
    """Generic ResNet backbone for (N, 3, H, W) input.

    `block` is a factory called as block(in_channels, out_channels, stride);
    `num_blocks` gives the block count for each of the four stages.
    The final Linear assumes the last stage emits 512 channels (basic blocks).
    """

    def __init__(self, block, num_blocks, num_classes=1000):
        super().__init__()
        self.in_channels = 64  # running channel count consumed by _make_layer

        # Stem: 7x7/2 conv then 3x3/2 max-pool (224 -> 56 spatially).
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four stages; every stage after the first downsamples by 2.
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """Stack num_blocks blocks; only the first may downsample."""
        blocks = []
        for i in range(num_blocks):
            blocks.append(block(self.in_channels, out_channels,
                                stride if i == 0 else 1))
            self.in_channels = out_channels
        return nn.Sequential(*blocks)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        pooled = torch.flatten(self.avgpool(x), 1)  # (N, 512)
        return self.fc(pooled)

def resnet18(num_classes=1000):
    """ResNet-18: two BasicBlocks in each of the four stages ([2, 2, 2, 2])."""
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)

# Smoke test: one ImageNet-sized input; expect a (1, 10) score tensor.
model = resnet18(10)
x = torch.randn(1, 3, 224, 224)
print(f"ResNet-18 输出: {model(x).shape}")

批归一化 #

python
import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """The standard Conv -> BatchNorm -> ReLU unit.

    The conv has bias=False because BatchNorm's learned shift makes a conv
    bias redundant; padding=kernel_size//2 keeps the spatial size whenever
    stride=1 (for odd kernel sizes).
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size,
                              stride=stride, padding=kernel_size // 2,
                              bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

# Smoke test: a batch of 32 feature maps; spatial size is preserved (stride=1).
block = ConvBlock(64, 128)
x = torch.randn(32, 64, 56, 56)
out = block(x)
print(f"ConvBlock 输出: {out.shape}")

完整 CNN 示例 #

python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

class CNN(nn.Module):
    """Small VGG-style CNN for 28x28 single-channel input (e.g. MNIST).

    Two conv stages (each: conv-BN-ReLU x2, 2x2 max-pool, 2-D dropout)
    reduce 28x28 -> 7x7 with 64 channels, then a fully connected head with
    BatchNorm and dropout produces the class scores.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        def conv_bn_relu(cin, cout):
            # One 3x3 same-padding conv followed by BatchNorm and ReLU.
            return [nn.Conv2d(cin, cout, kernel_size=3, padding=1),
                    nn.BatchNorm2d(cout),
                    nn.ReLU(inplace=True)]

        self.features = nn.Sequential(
            *conv_bn_relu(1, 32),
            *conv_bn_relu(32, 32),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 28 -> 14
            nn.Dropout2d(0.25),

            *conv_bn_relu(32, 64),
            *conv_bn_relu(64, 64),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 14 -> 7
            nn.Dropout2d(0.25),
        )

        self.classifier = nn.Sequential(
            nn.Linear(64 * 7 * 7, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        feats = self.features(x)
        return self.classifier(feats.view(feats.size(0), -1))

# Train on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)

# Standard MNIST normalization (mean/std computed over the training set).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# NOTE(review): download=True fetches MNIST over the network on first run.
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Cross-entropy loss (expects raw logits) + Adam with the common 1e-3 LR.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, dataloader, criterion, optimizer, device, epoch):
    """Run one training epoch over `dataloader`.

    Returns (mean per-batch loss, accuracy in percent over the whole epoch).
    Prints the current loss and running accuracy every 100 batches; `epoch`
    is only used for that log line.
    """
    model.train()  # enable dropout / use batch statistics in BatchNorm
    total_loss = 0.0
    n_correct = 0
    n_seen = 0

    for batch_idx, (data, target) in enumerate(dataloader):
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predicted = output.argmax(dim=1)
        n_seen += target.size(0)
        n_correct += (predicted == target).sum().item()

        if batch_idx % 100 == 0:
            print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}, '
                  f'Acc: {100.*n_correct/n_seen:.2f}%')

    return total_loss / len(dataloader), 100. * n_correct / n_seen

# Train for 5 epochs; each call to train() is one full pass over the data.
for epoch in range(1, 6):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device, epoch)
    print(f'Epoch {epoch} 完成, Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%')

下一步 #

现在你已经掌握了 PyTorch 卷积神经网络的核心概念,接下来学习 循环神经网络,了解序列数据处理!

最后更新:2026-03-29