"mat1 and mat2 shapes cannot be multiplied (30x50176 and 768x768)" error when building a Vision Transformer from scratch in PyTorch

0 votes
1 answer
32 views
Asked 2025-04-14 18:33

I have just started building a Vision Transformer from scratch in PyTorch. When I run the training loop I get the error below. I know it is caused by a shape mismatch, but I don't know how to deal with it. Here is the code:

import torch
import torchvision
from torchvision import transforms

from google.colab import drive
drive.mount('/content/gdrive')

import zipfile

zip_ref = zipfile.ZipFile('/content/gdrive/MyDrive/dataset/data9k.zip', 'r')
zip_ref.extractall("/content/dataset")
zip_ref.close()

import os
from torchvision import datasets

data_dir = 'dataset/datasets'

train_dataset = datasets.ImageFolder(root=os.path.join(data_dir, 'train'),
                                      transform=transforms.Compose([
                                          transforms.Resize((224, 224)),
                                          transforms.ToTensor(),
                                          transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                                               std=[0.5, 0.5, 0.5])
                                      ]))

patch_size = 16
in_channels = 3
embed_dim = 768
num_heads = 8
num_layers = 12
num_classes = 4
cls_token = 0
epochs = 5

class PatchEmbedding(torch.nn.Module):
    def __init__(self, patch_size, in_channels, embed_dim):
        super().__init__()
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.projection = torch.nn.Linear(patch_size**2 * in_channels, embed_dim)

    def forward(self, x):
        B, C, H, W = x.shape
        x = x.reshape(B, C, H // self.patch_size, W // self.patch_size, self.patch_size, self.patch_size)
        x = x.permute(0, 1, 4, 2, 5, 3)
        x = x.flatten(2)
        x = self.projection(x)
        return x

class MultiHeadAttention(torch.nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.q_proj = torch.nn.Linear(embed_dim, embed_dim)
        self.k_proj = torch.nn.Linear(embed_dim, embed_dim)
        self.v_proj = torch.nn.Linear(embed_dim, embed_dim)
        self.out_proj = torch.nn.Linear(embed_dim, embed_dim)

    def forward(self, q, k, v):
        B, T, E = q.shape
        q = self.q_proj(q).reshape(B, T, self.num_heads, self.head_dim)
        k = self.k_proj(k).reshape(B, T, self.num_heads, self.head_dim)
        v = self.v_proj(v).reshape(B, T, self.num_heads, self.head_dim)
        attn = torch.einsum('bhnd,bhnd->bhn', q, k) / self.head_dim**0.5
        attn = attn.softmax(dim=2)
        out = torch.einsum('bhn,bhnd->bhnd', attn, v)
        out = out.reshape(B, T, E)
        out = self.out_proj(out)
        return out

class TransformerEncoder(torch.nn.Module):
    def __init__(self, embed_dim, num_heads, num_layers):
        super().__init__()
        self.layers = torch.nn.ModuleList([
            torch.nn.Sequential(
                MultiHeadAttention(embed_dim, num_heads),
                torch.nn.LayerNorm(embed_dim),
                torch.nn.Linear(embed_dim, embed_dim),
                torch.nn.GELU(),
                torch.nn.Dropout(0.1),
            )
            for _ in range(num_layers)
        ])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class VisionTransformer(torch.nn.Module):
    def __init__(self, patch_size, in_channels, embed_dim, num_heads, num_layers, num_classes):
        super().__init__()
        self.patch_embed = PatchEmbedding(patch_size, in_channels, embed_dim)
        self.encoder = TransformerEncoder(embed_dim, num_heads, num_layers)
        self.classifier = torch.nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.encoder(x)
        x = x[:, cls_token]
        x = self.classifier(x)
        return x

model = VisionTransformer(patch_size, in_channels, embed_dim, num_heads, num_layers, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

from torch.utils.data import DataLoader

dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)

for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Epoch: {}, Batch: {}, Loss: {:.4f}'.format(
                epoch, batch_idx, loss.item()))

I got the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-49-fcd52f81880e> in <cell line: 1>()
      2     for batch_idx, (data, target) in enumerate(dataloader):
      3         data, target = data.to(device), target.to(device)
----> 4         output = model(data)
      5         loss = criterion(output, target)
      6         optimizer.zero_grad()

8 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py in forward(self, input)
    112 
    113     def forward(self, input: Tensor) -> Tensor:
--> 114         return F.linear(input, self.weight, self.bias)
    115 
    116     def extra_repr(self) -> str:

RuntimeError: mat1 and mat2 shapes cannot be multiplied (30x50176 and 768x768)

How can I solve this problem?

1 Answer

1

Check the shape of your data after each layer and you will see where it stops matching your linear layer. The linear layer expects an input dimension of 768, but it is receiving 50176.
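Concretely, the mismatch happens inside PatchEmbedding: after the permute and flatten(2), the tensor has shape (10, 3, 50176) (batch, channels, H*W), so self.projection, a Linear(768, 768), receives 50176 features instead of 768. That is the "30x50176 and 768x768" pair in the error, since Linear folds the leading batch and channel dimensions into 10 * 3 = 30 rows. Below is a minimal, self-contained sketch (same hyperparameters as in the question, but a random dummy batch instead of your dataset) of one way to rearrange the reshape/permute so that each 16x16x3 patch becomes a single 768-dimensional vector and dimension 1 holds the 196 patch tokens:

import torch

# Same hyperparameters as in the question
patch_size, in_channels, embed_dim = 16, 3, 768
projection = torch.nn.Linear(patch_size**2 * in_channels, embed_dim)

x = torch.randn(10, in_channels, 224, 224)   # dummy batch shaped like one dataloader batch
B, C, H, W = x.shape

# Split H and W into (grid, patch) pairs: (10, 3, 14, 16, 14, 16)
x = x.reshape(B, C, H // patch_size, patch_size, W // patch_size, patch_size)
# Move the 14x14 patch grid in front of the per-patch pixels: (10, 14, 14, 3, 16, 16)
x = x.permute(0, 2, 4, 1, 3, 5)
# Collapse the grid into one token dimension and each patch into one feature vector:
# (10, 196, 3*16*16) = (10, 196, 768)
x = x.reshape(B, -1, C * patch_size * patch_size)

print(x.shape)               # torch.Size([10, 196, 768])
print(projection(x).shape)   # torch.Size([10, 196, 768]), no shape error

You can confirm the diagnosis in your current code by printing x.shape right before self.projection(x); it should show torch.Size([10, 3, 50176]). With the tokens in dimension 1 and the 768-dimensional features last, the projection and the later LayerNorm(embed_dim) layers receive the input size they expect.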
