多层感知机（MLP）

多层感知机（Multi-Layer Perceptron，MLP）是最基础的神经网络结构，由多个全连接层堆叠而成。

一、基本概念

1.1 什么是 MLP？

MLP 是一种前馈神经网络，由以下部分组成：

输入层：接收特征
隐藏层：提取特征（可以有多个）
输出层：输出结果

关键点：在相邻的线性层之间必须添加非线性激活函数，否则多个线性层的叠加仍然等价于单个线性层。

import torch
import torch.nn as nn
 
# Wrong: no activation function between the linear layers
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.Linear(256, 128),
    nn.Linear(128, 10)
)
# Consecutive linear layers compose into one linear map, so this stack is
# equivalent to nn.Linear(784, 10)

# Correct: a ReLU follows each hidden linear layer; the output layer stays
# linear
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 10)
)

1.2 为什么需要激活函数？

# 没有激活函数
# y = W2 * (W1 * x) = (W2 * W1) * x = W * x
# 多层线性变换仍然等价于单层
 
# 有激活函数
# y = W2 * relu(W1 * x)
# 引入非线性后，网络才能逼近复杂的非线性函数（隐藏层足够宽时可逼近任意连续函数）

二、MLP 实现

2.1 使用 Sequential

import torch.nn as nn
 
# Simple MLP: 784 -> 256 -> 128 -> 10, with a ReLU after each hidden layer
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 10)
)

# Forward pass
# NOTE: torch.randn needs `import torch` in scope; this snippet's own import
# line only brings in torch.nn.
x = torch.randn(32, 784)  # batch_size=32
output = model(x)          # shape: [32, 10]

2.2 自定义 Module

import torch
import torch.nn as nn
 
class MLP(nn.Module):
    """Fully connected classifier with two hidden layers, ReLU and dropout.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width shared by both hidden layers.
        num_classes: Number of scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names double as state_dict prefixes, so they stay fixed.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the raw output-layer scores for the batch ``x``."""
        # Each hidden stage is linear -> ReLU -> dropout.
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        # The output layer is left without an activation.
        return self.fc3(hidden)


# Usage
model = MLP(input_size=784, hidden_size=256, num_classes=10)
output = model(torch.randn(32, 784))

2.3 使用 ModuleList

class FlexibleMLP(nn.Module):
    """MLP whose depth and widths are given by a list of layer sizes.

    Args:
        layer_sizes: ``[input_size, hidden1, hidden2, ..., output_size]``.
            At least two entries are required so that an output layer exists.
        dropout_prob: Dropout probability applied after every hidden
            activation. Defaults to 0.2.

    Raises:
        ValueError: If ``layer_sizes`` has fewer than two entries.
    """

    def __init__(self, layer_sizes, dropout_prob=0.2):
        super().__init__()

        layer_sizes = list(layer_sizes)
        # With fewer than two sizes no linear layer can be built, and
        # forward() would fail on an empty ModuleList; reject it up front.
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes needs at least an input and an output size, "
                f"got {layer_sizes!r}"
            )

        # One linear layer per consecutive (in_features, out_features) pair.
        self.layers = nn.ModuleList([
            nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:])
        ])

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        # Hidden layers: linear -> ReLU -> dropout.
        for layer in self.layers[:-1]:
            x = self.dropout(self.relu(layer(x)))
        # No activation on the output layer.
        return self.layers[-1](x)


# Usage
model = FlexibleMLP([784, 512, 256, 128, 10])

三、完整训练示例

3.1 MNIST 分类

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
 
# Preprocessing: convert each image to a tensor, then normalize it with
# mean 0.1307 and std 0.3081 (presumably the MNIST statistics -- confirm)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Load the data; only the training split requests a download into ./data
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('./data', train=False, transform=transform)

# Training batches are shuffled; test batches keep the dataset order
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)
 
# 模型定义
class MLP(nn.Module):
    """Classifier for 28x28 images: flatten, then a 784-512-256-10 MLP.

    Both hidden layers are followed by ReLU and dropout with p=0.2; the
    output layer has no activation.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Positions inside the Sequential (0, 3 and 6 for the linear layers)
        # determine the state_dict keys, so the order is kept as-is.
        stack = [
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 10),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten each sample of ``x`` and return its 10 output scores."""
        return self.layers(self.flatten(x))
 
# Instantiate the model defined above
model = MLP()

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
 
# 训练函数
def train(model, train_loader, optimizer, criterion, epoch):
    """Run one optimization pass over ``train_loader``.

    Args:
        model: Network to optimize; put into training mode first.
        train_loader: Iterable of ``(data, target)`` batches that also
            supports ``len()`` and has a ``dataset`` attribute (both are used
            for the progress message).
        optimizer: Optimizer that updates the model parameters.
        criterion: Callable mapping ``(output, target)`` to a scalar loss.
        epoch: Epoch number; only used in the progress message.

    Returns:
        Tuple of the mean per-batch loss and the fraction of samples whose
        argmax prediction equals the target.
    """
    model.train()
    loss_sum = 0
    num_hits = 0
    num_seen = 0

    for step, batch in enumerate(train_loader):
        data, target = batch

        # One update: clear old gradients, forward, backward, step.
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        loss_sum += batch_loss
        predicted = output.argmax(dim=1)
        num_hits += (predicted == target).sum().item()
        num_seen += target.size(0)

        # Progress line every 100 batches, starting with the first batch.
        if step % 100 == 0:
            print(f'Train Epoch: {epoch} [{step * len(data)}/{len(train_loader.dataset)} '
                  f'({100. * step / len(train_loader):.0f}%)]\tLoss: {batch_loss:.6f}')

    return loss_sum / len(train_loader), num_hits / num_seen
 
# 测试函数
def test(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` and print a summary line.

    Args:
        model: Network to evaluate; put into evaluation mode first.
        test_loader: Iterable of ``(data, target)`` batches that also
            supports ``len()`` and has a ``dataset`` attribute.
        criterion: Callable mapping ``(output, target)`` to a scalar loss.

    Returns:
        Tuple ``(test_loss, accuracy)``: the sum of the per-batch criterion
        values divided by the number of batches, and the number of correct
        argmax predictions divided by ``len(test_loader.dataset)``.
    """
    model.eval()
    test_loss = 0
    correct = 0
    
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1)
            correct += (pred == target).sum().item()
    
    # Average over batches, not samples: each criterion value is added once
    # per batch above.
    test_loss /= len(test_loader)
    accuracy = correct / len(test_loader.dataset)
    
    print(f'\nTest set: Average loss: {test_loss:.4f}, '
          f'Accuracy: {correct}/{len(test_loader.dataset)} ({100. * accuracy:.2f}%)\n')
    
    return test_loss, accuracy
 
# 训练循环
# Train for 10 epochs (numbered 1..10), evaluating on the test set after each
for epoch in range(1, 11):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, epoch)
    test_loss, test_acc = test(model, test_loader, criterion)
    
    # Per-epoch summary of both splits
    print(f'Epoch {epoch}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, '
          f'Test Loss={test_loss:.4f}, Test Acc={test_acc:.4f}')

四、常用技巧

4.1 Batch Normalization

class MLPWithBN(nn.Module):
    """Two-hidden-layer MLP with batch normalization before each ReLU.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width shared by both hidden layers.
        num_classes: Number of scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        # Hidden stages apply linear -> batch norm -> ReLU, in that order.
        hidden = self.relu(self.bn1(self.fc1(x)))
        hidden = self.relu(self.bn2(self.fc2(hidden)))
        return self.fc3(hidden)

4.2 Dropout

class MLPWithDropout(nn.Module):
    """Two-hidden-layer MLP with dropout after each hidden ReLU.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width shared by both hidden layers.
        num_classes: Number of scores produced per sample.
        dropout_prob: Probability passed to both dropout layers.
    """

    def __init__(self, input_size, hidden_size, num_classes, dropout_prob=0.5):
        super().__init__()
        stages = (
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_size, num_classes),
        )
        # Register each stage under its position so the submodule names are
        # "0" .. "6", as with positional Sequential arguments.
        self.layers = nn.Sequential()
        for position, stage in enumerate(stages):
            self.layers.add_module(str(position), stage)

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        scores = self.layers(x)
        return scores

4.3 残差连接

class ResidualBlock(nn.Module):
    """Two linear layers wrapped in a skip connection.

    Computes ``relu(fc2(relu(fc1(x))) + x)``; input and output both have
    ``hidden_size`` features.

    Args:
        hidden_size: Feature width of the block.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum."""
        branch = self.fc2(self.relu(self.fc1(x)))
        # Residual connection: add the untouched input back before the ReLU.
        return self.relu(branch + x)
 
class ResidualMLP(nn.Module):
    """MLP made of an input projection, residual blocks and an output layer.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Feature width used by every residual block.
        num_classes: Number of scores produced per sample.
        num_blocks: How many ``ResidualBlock`` modules to stack.
    """

    def __init__(self, input_size, hidden_size, num_classes, num_blocks=3):
        super().__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.blocks = nn.ModuleList()
        for _ in range(num_blocks):
            self.blocks.append(ResidualBlock(hidden_size))
        self.output_layer = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        hidden = self.relu(self.input_layer(x))
        for block in self.blocks:
            hidden = block(hidden)
        # No activation after the output layer.
        return self.output_layer(hidden)

4.4 权重初始化

import torch.nn.init as init
 
class MLP(nn.Module):
    """One-hidden-layer MLP with explicit weight initialization.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width of the hidden layer.
        num_classes: Number of scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

        # Replace the default parameter values right after construction.
        self._init_weights()

    def _init_weights(self):
        """Re-initialize both layers in place; biases are set to zero."""
        hidden, output = self.fc1, self.fc2

        # He (Kaiming) initialization for the layer that feeds the ReLU.
        init.kaiming_normal_(hidden.weight, mode='fan_in', nonlinearity='relu')
        init.zeros_(hidden.bias)

        # Xavier initialization for the output layer.
        init.xavier_normal_(output.weight)
        init.zeros_(output.bias)

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        return self.fc2(torch.relu(self.fc1(x)))

五、超参数选择

5.1 网络结构

# 隐藏层大小：通常是输入大小的 2-4 倍或与输入相同
hidden_size = 256  # or 512, 1024

# Number of hidden layers: usually 2-5
num_layers = 3

# Advice: start from a simple model and add complexity step by step
layer_configs = [
    [784, 10],              # 1 linear layer
    [784, 256, 10],         # 2 linear layers
    [784, 256, 128, 10],    # 3 linear layers
]

5.2 训练参数

# 学习率
learning_rate = 0.001  # Adam default
# learning_rate = 0.01  # SGD

# Batch size
batch_size = 64  # or 128, 256, 512

# Dropout
dropout_prob = 0.2  # usually 0.2-0.5

# Weight decay (L2 regularization)
weight_decay = 0.0001

六、模型评估

6.1 模型复杂度

def count_parameters(model):
    """Count the parameter elements of ``model``.

    Args:
        model: Object whose ``parameters()`` yields tensors that provide
            ``numel()`` and ``requires_grad``.

    Returns:
        Tuple ``(total_params, trainable_params)``; the second value only
        counts parameters with ``requires_grad`` set.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params
 
# Report the parameter counts of an MLP built with the three-argument
# constructor (input size, hidden size, number of classes)
model = MLP(784, 256, 10)
total, trainable = count_parameters(model)
print(f"Total parameters: {total:,}")
print(f"Trainable parameters: {trainable:,}")

6.2 可视化

from torch.utils.tensorboard import SummaryWriter
 
# Event files are written under runs/mlp_experiment
writer = SummaryWriter('runs/mlp_experiment')

# Log the model graph
# NOTE(review): MLP() is called without arguments, which matches the
# flattening MNIST model from section 3.1, not the three-argument variants --
# confirm which class is in scope.
model = MLP()
writer.add_graph(model, torch.randn(1, 784))

# Log loss and accuracy once per epoch
# NOTE: `epochs` and the four metric values are expected to come from the
# elided training code.
for epoch in range(epochs):
    # ... training code ...
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Loss/test', test_loss, epoch)
    writer.add_scalar('Accuracy/train', train_acc, epoch)
    writer.add_scalar('Accuracy/test', test_acc, epoch)

writer.close()

七、常见问题

7.1 过拟合

# Symptom: high accuracy on the training set, low accuracy on the test set
# Remedies:
# 1. Dropout
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.5),  # add Dropout
    nn.Linear(256, 10)
)

# 2. Weight decay
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

# 3. Early stopping: stop after `patience` consecutive epochs without a new
#    best validation loss
best_val_loss = float('inf')
patience = 10
counter = 0

# NOTE: `epochs` and `val_loss` are expected to come from the elided
# training/validation code.
for epoch in range(epochs):
    # ... training and validation ...
    
    if val_loss < best_val_loss:
        # New best: remember it, checkpoint the weights, reset the counter
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping!")
            break

# 4. Data augmentation
transform = transforms.Compose([
    transforms.RandomRotation(10),
    transforms.ToTensor(),
])

7.2 欠拟合

# Symptom: accuracy is low on both the training set and the test set
# Remedies:
# 1. Increase model capacity
model = nn.Sequential(
    nn.Linear(784, 512),  # wider hidden layer
    nn.ReLU(),
    nn.Linear(512, 512),
    nn.ReLU(),
    nn.Linear(512, 10)
)

# 2. Train for longer
epochs = 50

# 3. Reduce regularization
dropout_prob = 0.1  # lower dropout
weight_decay = 0.0  # remove weight decay

# 4. Use a better optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

7.3 梯度消失/爆炸

# Remedies:
# 1. Use Batch Normalization
# 2. Use a suitable activation function (ReLU)
# 3. Use a suitable initialization
# NOTE: `layer` is a placeholder for one of the model's linear layers; it is
# not defined in this snippet.
init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')

# 4. Gradient clipping (typically called after backward() and before
#    optimizer.step())
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

八、MLP vs 其他模型

模型	优点	缺点	适用场景
MLP	简单、快速	不能处理空间结构	表格数据、简单分类
CNN	提取空间特征	需要更多计算	图像、视频
RNN/LSTM	处理序列	训练慢	文本、时间序列
Transformer	并行、长距离依赖	需要大量数据	NLP、CV

参考资源

06-Linear - 全连接层
07-激活函数 - 激活函数详解
08-损失函数 - 损失函数
09-优化器 - 优化器
13-模型训练完整流程 - 完整训练示例

CS Notes

导航

10-多层感知机-MLP

多层感知机（MLP）

一、基本概念

1.1 什么是 MLP？

1.2 为什么需要激活函数？

二、MLP 实现

2.1 使用 Sequential

2.2 自定义 Module

2.3 使用 ModuleList

三、完整训练示例

3.1 MNIST 分类

四、常用技巧

4.1 Batch Normalization

4.2 Dropout

4.3 残差连接

4.4 权重初始化

五、超参数选择

5.1 网络结构

5.2 训练参数

六、模型评估

6.1 模型复杂度

6.2 可视化

七、常见问题

7.1 过拟合

7.2 欠拟合

7.3 梯度消失/爆炸

八、MLP vs 其他模型

参考资源

关系图谱

目录

反向链接