"mat1 and mat2 shapes cannot be multiplied (30x50176 and 768x768)" error when building a Vision Transformer from scratch in PyTorch
I have just started building a Vision Transformer from scratch in PyTorch. When I run the training code I get the error below. I know it is caused by a shape mismatch, but I don't know how to fix it. Here is the code:
import torch
import torchvision
from torchvision import transforms
from google.colab import drive
drive.mount('/content/gdrive')
import zipfile
zip_ref = zipfile.ZipFile('/content/gdrive/MyDrive/dataset/data9k.zip', 'r')
zip_ref.extractall("/content/dataset")
zip_ref.close()
import os
from torchvision import datasets
data_dir = 'dataset/datasets'
train_dataset = datasets.ImageFolder(root=os.path.join(data_dir, 'train'),
                                     transform=transforms.Compose([
                                         transforms.Resize((224, 224)),
                                         transforms.ToTensor(),
                                         transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                                              std=[0.5, 0.5, 0.5])
                                     ]))
patch_size = 16
in_channels = 3
embed_dim = 768
num_heads = 8
num_layers = 12
num_classes = 4
cls_token = 0
epochs = 5
class PatchEmbedding(torch.nn.Module):
    def __init__(self, patch_size, in_channels, embed_dim):
        super().__init__()
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.projection = torch.nn.Linear(patch_size**2 * in_channels, embed_dim)

    def forward(self, x):
        B, C, H, W = x.shape
        x = x.reshape(B, C, H // self.patch_size, W // self.patch_size, self.patch_size, self.patch_size)
        x = x.permute(0, 1, 4, 2, 5, 3)
        x = x.flatten(2)
        x = self.projection(x)
        return x
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.q_proj = torch.nn.Linear(embed_dim, embed_dim)
        self.k_proj = torch.nn.Linear(embed_dim, embed_dim)
        self.v_proj = torch.nn.Linear(embed_dim, embed_dim)
        self.out_proj = torch.nn.Linear(embed_dim, embed_dim)

    def forward(self, q, k, v):
        B, T, E = q.shape
        q = self.q_proj(q).reshape(B, T, self.num_heads, self.head_dim)
        k = self.k_proj(k).reshape(B, T, self.num_heads, self.head_dim)
        v = self.v_proj(v).reshape(B, T, self.num_heads, self.head_dim)
        attn = torch.einsum('bhnd,bhnd->bhn', q, k) / self.head_dim**0.5
        attn = attn.softmax(dim=2)
        out = torch.einsum('bhn,bhnd->bhnd', attn, v)
        out = out.reshape(B, T, E)
        out = self.out_proj(out)
        return out
class TransformerEncoder(torch.nn.Module):
    def __init__(self, embed_dim, num_heads, num_layers):
        super().__init__()
        self.layers = torch.nn.ModuleList([
            torch.nn.Sequential(
                MultiHeadAttention(embed_dim, num_heads),
                torch.nn.LayerNorm(embed_dim),
                torch.nn.Linear(embed_dim, embed_dim),
                torch.nn.GELU(),
                torch.nn.Dropout(0.1),
            )
            for _ in range(num_layers)
        ])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
class VisionTransformer(torch.nn.Module):
    def __init__(self, patch_size, in_channels, embed_dim, num_heads, num_layers, num_classes):
        super().__init__()
        self.patch_embed = PatchEmbedding(patch_size, in_channels, embed_dim)
        self.encoder = TransformerEncoder(embed_dim, num_heads, num_layers)
        self.classifier = torch.nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.encoder(x)
        x = x[:, cls_token]
        x = self.classifier(x)
        return x
model = VisionTransformer(patch_size, in_channels, embed_dim, num_heads, num_layers, num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Epoch: {}, Batch: {}, Loss: {:.4f}'.format(
                epoch, batch_idx, loss.item()))
I get the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-49-fcd52f81880e> in <cell line: 1>()
2 for batch_idx, (data, target) in enumerate(dataloader):
3 data, target = data.to(device), target.to(device)
----> 4 output = model(data)
5 loss = criterion(output, target)
6 optimizer.zero_grad()
8 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py in forward(self, input)
112
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
115
116 def extra_repr(self) -> str:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (30x50176 and 768x768)
How can I fix this?
1 Answer
Check your tensor shapes after each layer and you will see where they stop matching your linear layers. The Linear in your PatchEmbedding expects an input whose last dimension is 768, but it is receiving 50176.
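One quick way to inspect this is to register a forward hook on every submodule so each layer prints the shape it produces. This is just a minimal debugging sketch; the helper name print_shapes is mine, not part of your code:

import torch

def print_shapes(model, x):
    # Register a hook on every submodule that prints its output shape.
    hooks = []
    for name, module in model.named_modules():
        def hook(mod, inp, out, name=name):
            if isinstance(out, torch.Tensor):
                print(f'{name}: {tuple(out.shape)}')
        hooks.append(module.register_forward_hook(hook))
    try:
        model(x)
    except RuntimeError as e:
        print(e)  # the failure happens right after the last shape printed
    finally:
        for h in hooks:
            h.remove()

print_shapes(model, torch.randn(10, 3, 224, 224).to(device))  # one dummy batch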
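The mismatch itself is produced in PatchEmbedding.forward. With a batch of ten 224x224 RGB images, your reshape/permute/flatten(2) leaves the tensor with shape (10, 3, 50176): the channel axis and the two patch-size axes are never grouped into one 768-wide patch vector, so F.linear flattens the leading dimensions into a (30 x 50176) matrix (30 = 10 * 3) and multiplies it against the (768 x 768) projection weight, which is exactly the error message. What you want out of the patch embedding is (batch, num_patches, patch_size**2 * in_channels) = (10, 196, 768). One way to rewrite the forward, as a sketch assuming 224x224 inputs with patch_size = 16 and in_channels = 3:

def forward(self, x):
    B, C, H, W = x.shape
    p = self.patch_size
    # Split each spatial axis into (number of patches, patch size):
    x = x.reshape(B, C, H // p, p, W // p, p)
    # Group the two patch-grid axes together, and the pixels of one patch together:
    x = x.permute(0, 2, 4, 1, 3, 5)                   # (B, H//p, W//p, C, p, p)
    x = x.reshape(B, (H // p) * (W // p), C * p * p)  # (10, 196, 768)
    x = self.projection(x)                            # (10, 196, embed_dim)
    return x

Be aware that this only clears the error you asked about. The next thing that will fail is the encoder: MultiHeadAttention.forward takes three arguments (q, k, v), but torch.nn.Sequential passes a single tensor, so you will need a custom block that calls the attention as attn(x, x, x). Also, the model never prepends a CLS token or adds positional embeddings, so x[:, cls_token] just selects the first patch.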