Computer vision neural network in PyTorch

Hi everyone, I have a fairly long question here about why my convolutional neural network (CNN) is not working. For context, I am working in Jupyter Notebooks and have already imported all the necessary libraries.

I have spent a long time writing and debugging this. Right now I have the following code and output:

# The first dimension of every tensor we work with is the batch size
# Input size: B x 3 x 240 x 360
# After a convolution it becomes B x 16 x height x width
# (batch size, feature dimension, height, width)

class SceneClassificationCNN(nn.Module):
    def __init__(self, img_height=240, img_width=360):
        super(SceneClassificationCNN, self).__init__()
        nn.Flatten()

        # Convolutional layers

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        
        # Pooling layer

        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self._flat_features_size = self._get_conv_output_size(img_height, img_width)

        # Fully connected layers
        # Assuming input images are resized to 128x128, the feature map size is 16x16 after convolutions and pooling

        self.fc1 = nn.Linear(in_features=self._flat_features_size , out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=5)  # Assuming 5 classes

        #STOP HARD CODING NOTE TO SELF
        #For self.fc1 find how to calculate the size of the flattened image in place of the 120 and 180
        
    def _get_conv_output_size(self, img_height, img_width):

        # Simulate forward pass through the convolutional and pooling layers
        # without considering the actual data (using a dummy tensor)

        dummy_input = torch.zeros(1, 3, img_height, img_width)
        with torch.no_grad():
            dummy_output = self.pool(functional.relu(self.conv1(dummy_input)))
            dummy_output = self.pool(functional.relu(self.conv2(dummy_output)))
            dummy_output = self.pool(functional.relu(self.conv3(dummy_output)))
        print("Dummy output size:",dummy_output.size())
        return int(np.prod(dummy_output.size()[1:]))
    

    def forward(self, x):

        # Apply convolutions and pooling

        x = self.pool(functional.relu(self.conv1(x)))
        x = self.pool(functional.relu(self.conv2(x)))
        x = self.pool(functional.relu(self.conv3(x)))
        
        # Flatten the output for the fully connected layers
        #x = x.view(x.size(0), -1) #Change the shape
        
        # Apply fully connected layers with ReLU activations

        x = functional.relu(self.fc1(x))
        x = functional.relu(self.fc2(x))
        
        # Final layer without ReLU as it's going into a softmax (done by CrossEntropyLoss)

        x = self.fc3(x)

        #x = x.view(-1, self._flat_features_size)

        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = SceneClassificationCNN().to(device)
print(model)
optimizer = optim.Adam(model.parameters(), lr=0.001)  

# Learning rate can be adjusted
# I am used to the Adam optimizer

The output of this code is:

Using device: cpu
Dummy output size: torch.Size([1, 64, 30, 45])
SceneClassificationCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=86400, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=5, bias=True)
)

Although I have this CNN model in my code, it kept throwing errors, so I switched to NeuralNetwork_Conv; unfortunately, that gives me the same or similar problems.

I then rewrote the code as a class called NeuralNetwork_Conv:

class NeuralNetwork_Conv(nn.Module):
    def __init__(self):
        super(NeuralNetwork_Conv, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(2, 2)

        # Placeholder for the linear layers - adjust the size accordingly

        self.fc1 = nn.Linear(64 * 30 * 45, 512)  

    def forward(self, x):
        x = self.conv1(x)
        print("After conv1:", x.size())
        x = self.pool1(x)
        print("After pool1:", x.size())
        
        x = self.conv2(x)
        print("After conv2:", x.size())
        x = self.pool2(x)
        print("After pool2:", x.size())
        
        x = self.conv3(x)
        print("After conv3:", x.size())
        x = self.pool3(x)
        print("After pool3:", x.size())
        
        x = x.view(-1, 64 * 30 * 45)  
        x = self.fc1(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = NeuralNetwork_Conv().to(device)
print(model)
optimizer = optim.Adam(model.parameters(), lr=0.001) 

# Test the forward pass with a dummy input to observe sizes

dummy_input = torch.randn(1, 3, 240, 360).to(device)  # Example input size, adjust as necessary
model(dummy_input)

Here are the two outputs I get from the code above:

Using device: cpu
NeuralNetwork_Conv(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=86400, out_features=512, bias=True)
)
After conv1: torch.Size([1, 16, 240, 360])
After pool1: torch.Size([1, 16, 120, 180])
After conv2: torch.Size([1, 32, 120, 180])
After pool2: torch.Size([1, 32, 60, 90])
After conv3: torch.Size([1, 64, 60, 90])
After pool3: torch.Size([1, 64, 30, 45])

And here is the other output:

tensor([[-6.6583e-02,  7.8844e-02, -1.4249e-01,  6.1643e-02, -7.0408e-02,
          4.2068e-02, -4.0336e-02, -1.9950e-01, -7.1389e-02, -1.3796e-01,
          9.3257e-02,  1.9345e-01,  4.6931e-03,  3.6373e-01, -1.1949e-01,
         -1.6199e-01,  1.3884e-01,  2.8671e-02,  5.3386e-02,  6.5578e-02,
          1.8842e-01, -5.9504e-02, -1.8027e-01, -8.8520e-02, -2.0062e-02,
         -1.5458e-01, -1.1620e-01,  2.5340e-02,  1.2343e-01, -1.9141e-01,
          3.7330e-02, -1.5329e-02,  4.5059e-02,  1.3514e-01, -1.5895e-02,
         -1.0641e-01,  1.5021e-02,  6.7946e-02, -1.0390e-01, -5.0331e-02,
          1.2298e-02, -1.4691e-01,  2.8895e-01, -3.1915e-02,  7.0279e-02,
          5.5356e-02, -6.9813e-02,  3.4496e-02,  2.0414e-02,  2.7333e-02,
         -1.3051e-02, -3.4270e-02,  2.5378e-01, -5.9996e-02, -1.2498e-01,
         -1.7903e-02, -1.8139e-02, -1.8555e-01,  3.4394e-01, -7.8244e-02,
         -5.8675e-02, -2.3361e-02, -1.1856e-01,  1.4826e-02,  3.0094e-01,
         -1.0646e-01,  1.1270e-01, -1.1333e-01,  1.5593e-02, -8.6904e-02,
          2.7338e-02, -2.3950e-01,  1.0691e-01, -1.0705e-01,  8.4953e-02,
          3.5296e-02,  1.2546e-01,  5.4389e-02,  1.8928e-01,  1.1250e-01,
         -1.0872e-02,  1.6038e-02,  7.3459e-02, -1.6390e-02,  4.3814e-02,
         -4.8063e-02,  4.9660e-02, -4.8125e-02, -4.4857e-02,  6.9098e-03,
          1.5525e-01,  2.5897e-01,  1.1529e-01,  3.8345e-02,  1.3227e-01,
         -9.4112e-02, -1.5524e-01, -4.4205e-02, -1.4851e-01,  1.4226e-01,
         -1.9989e-01,  9.4048e-02,  3.0361e-01,  3.6938e-02, -1.9737e-01,
         -4.3552e-02, -2.6977e-01,  4.0547e-02, -2.5311e-01,  1.7712e-01,
          1.1568e-01,  7.7897e-02,  1.3156e-01, -3.6716e-02, -1.8810e-01,
         -4.5886e-02, -1.1666e-01, -2.7934e-02,  4.2246e-01,  6.5305e-02,
          6.8844e-02,  2.9094e-01,  1.5930e-01,  2.3336e-01, -1.0987e-01,
          2.6732e-01, -7.9896e-02,  3.4161e-01,  2.8448e-01,  2.7811e-01,
         -2.8356e-01, -1.4066e-01,  1.7005e-01, -1.6456e-01, -1.2238e-01,
          7.5385e-02, -8.2953e-02,  5.1057e-02, -1.2672e-02,  9.9285e-02,
          2.3554e-02, -1.4845e-03, -3.4829e-02,  3.6016e-01,  2.6877e-01,
          1.3761e-01, -6.4589e-02,  4.6149e-02,  4.3486e-02, -1.7645e-01,
          6.2570e-02, -7.3389e-02, -1.4512e-02, -1.4758e-01, -2.1920e-01,
          1.7140e-02, -3.2040e-02,  1.0936e-01,  8.7229e-02,  8.8445e-02,
         -1.9883e-01,  2.1703e-01, -1.7173e-01,  2.7462e-01,  1.2990e-01,
          1.5235e-01,  1.9545e-02,  1.1720e-02,  1.9312e-01,  5.9859e-02,
         -8.0287e-02,  1.0102e-01, -3.9349e-01,  1.4303e-01, -1.2415e-01,
         -8.5000e-03,  1.5991e-01, -3.3937e-01, -1.2435e-01,  1.8382e-01,
          4.4852e-02,  1.4590e-01,  1.8493e-01,  6.3306e-02,  6.4110e-02,
          1.8389e-02, -7.8453e-02, -3.3310e-02,  1.2160e-03, -1.4169e-01,
         -3.0171e-01,  1.4754e-01, -1.0948e-01,  9.7101e-02,  2.1271e-02,
          3.7804e-02, -8.1400e-04, -1.0619e-01, -3.3092e-02,  7.3220e-02,
         -3.5862e-03,  4.1970e-02,  1.5572e-02,  1.8815e-02,  8.1993e-02,
          1.2919e-01, -5.1420e-02,  4.7143e-02, -1.0359e-01, -1.0286e-01,
          1.6066e-02,  9.2730e-02, -5.8958e-02, -5.0492e-03,  1.0211e-01,
         -5.6311e-02, -1.2885e-01, -2.0300e-02,  1.7353e-02, -1.2325e-01,
         -5.6017e-02, -2.7655e-01, -9.5620e-02, -2.2155e-02,  2.1870e-01,
          2.3230e-01, -3.0554e-05,  1.1196e-01, -1.3114e-01,  8.9631e-02,
         -1.3647e-01, -4.4391e-02, -1.9639e-02, -1.0989e-01,  1.0549e-01,
         -4.2130e-02, -4.3476e-01,  1.6702e-01, -3.2339e-02,  2.6870e-01,
         -4.5465e-02,  2.5549e-01,  1.0326e-01,  9.0097e-02, -2.4702e-02,
          1.8878e-01, -3.2149e-01, -5.7085e-03,  4.5387e-04,  4.2761e-02,
          1.9696e-01,  2.3717e-01,  1.1287e-01, -2.7148e-01, -2.7781e-01,
          9.4704e-02,  1.1562e-02,  3.0118e-02,  2.2923e-01,  1.9578e-02,
         -2.5807e-01, -1.8138e-01, -2.7152e-01, -2.2853e-02, -2.0886e-02,
          1.4479e-01,  1.0336e-01,  1.4169e-01,  1.0363e-01,  1.0341e-01,
         -4.4898e-02,  6.4268e-02, -9.0907e-02, -3.9338e-01, -7.2397e-02,
         -8.9581e-03,  1.4663e-01,  8.2005e-02,  2.8790e-01, -1.3653e-02,
         -1.2523e-01, -1.6508e-01, -1.5815e-01,  5.4398e-02,  1.4715e-01,
         -2.4775e-01, -2.0883e-01, -1.9139e-02,  8.6897e-02,  6.9590e-02,
          7.5575e-02, -7.3747e-02, -1.4537e-02, -2.0008e-01, -1.5837e-01,
          3.1999e-01, -4.9494e-02,  7.5654e-02, -1.5142e-01, -6.2533e-02,
         -2.1078e-01,  6.5765e-02, -4.1795e-02, -1.0766e-03,  8.2803e-02,
          1.0490e-02, -1.6136e-02, -1.2969e-01, -1.2275e-01, -4.1438e-03,
          2.3278e-01,  7.1578e-02,  2.1670e-01,  4.9445e-02, -1.9228e-01,
          2.2035e-01, -8.2743e-02, -2.4098e-01, -6.4845e-02, -2.6872e-02,
          1.2220e-02, -8.4460e-03,  1.8067e-01, -3.4670e-01, -5.8514e-03,
         -3.3068e-01, -1.2138e-01,  9.7391e-02,  1.0332e-01,  6.1555e-02,
          3.6117e-02,  5.2334e-02, -3.7863e-02,  1.3306e-01, -4.3676e-02,
          2.2926e-01, -1.3721e-01,  2.1945e-01,  2.4629e-01, -1.4614e-01,
         -1.6835e-01, -1.1987e-01, -1.4246e-01, -1.5590e-01,  3.7477e-02,
         -6.6188e-02,  4.5236e-02, -2.4773e-01, -1.5510e-01,  1.9009e-01,
         -2.2781e-02, -7.3059e-02,  1.8916e-01,  1.3230e-01,  4.3261e-02,
          4.6859e-02, -4.9147e-01,  5.7847e-02, -1.4865e-01,  5.6057e-03,
         -2.0584e-01,  1.8284e-01, -2.1993e-01,  9.4927e-02, -5.8686e-03,
         -2.8759e-02, -3.9250e-02, -2.2812e-01,  2.1872e-01, -1.2176e-01,
         -1.0005e-01,  1.0752e-01,  4.7323e-02,  6.5562e-02,  1.0710e-01,
         -1.3260e-02, -2.4615e-02, -4.3050e-02, -8.8642e-02, -1.0462e-01,
         -1.7764e-01,  2.9332e-02, -8.1576e-03,  3.0675e-01, -2.8505e-02,
          3.3142e-02,  8.2312e-02,  9.8706e-03,  4.1091e-05, -1.1159e-01,
         -2.8545e-02, -1.7685e-01, -1.1629e-01, -5.1686e-02,  3.2084e-01,
          1.8053e-01,  8.7533e-02, -2.5515e-01, -1.8066e-01,  2.7144e-02,
         -1.4081e-02,  3.9089e-01,  3.0634e-02,  7.5020e-02, -8.9916e-02,
          3.2381e-01, -2.6234e-01, -3.0246e-01, -4.7191e-02,  1.4518e-02,
         -2.3411e-01,  1.5552e-02, -4.8474e-01, -1.2143e-01,  3.4133e-01,
          6.5519e-02, -1.6780e-02,  1.1881e-01, -8.4260e-02, -5.1083e-02,
          3.3583e-02,  2.8465e-01,  1.0361e-01, -3.9145e-02, -1.7361e-01,
          3.6966e-02, -1.5067e-01, -6.7469e-02, -1.1069e-01, -1.0071e-02,
          1.4910e-02,  2.9178e-02, -6.6805e-02, -2.0691e-01,  1.3508e-01,
         -1.4120e-01, -2.4821e-01,  1.4179e-01, -1.3962e-02, -8.2849e-02,
          3.3553e-02, -1.4514e-02,  3.2578e-01,  1.3230e-01, -1.4331e-01,
          1.3124e-01,  1.5074e-01, -1.1892e-01,  9.2938e-02, -2.2535e-01,
          1.2779e-01,  2.1662e-01,  1.0649e-01, -4.0648e-02,  2.7221e-01,
         -3.0154e-01,  1.2282e-01, -5.9368e-02,  3.2799e-02,  1.5916e-01,
          1.9251e-02,  6.4609e-02, -6.8788e-02, -9.1684e-02,  8.4554e-02,
         -2.5653e-01, -1.7326e-02, -5.9050e-03,  3.8589e-02, -2.9320e-01,
          2.7190e-01, -1.9769e-01, -5.7632e-02, -1.2614e-01, -1.4066e-01,
          1.3740e-01, -7.1185e-02,  1.1509e-01,  3.1625e-02, -3.8954e-03,
         -1.9939e-01,  1.2890e-01, -2.6241e-01, -4.1080e-02,  1.1835e-01,
          3.9253e-03,  1.3970e-01,  1.3278e-02,  1.5135e-01, -4.2120e-02,
          5.5194e-03,  8.3199e-03,  2.9458e-02, -1.9543e-01,  2.7979e-02,
          8.2800e-02,  2.0363e-01,  9.0182e-02,  1.2844e-01, -6.9860e-02,
         -1.9727e-01,  2.9260e-01, -1.2800e-01, -9.0905e-02, -2.9702e-01,
         -2.5170e-02,  2.0275e-02, -8.5073e-02,  7.5998e-02, -8.6023e-02,
          4.3336e-02,  2.0305e-01]], grad_fn=<AddmmBackward0>)

Up to this point everything looks fine, until I run my training code:

#Consider flattening

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss, correct = 0, 0

    for batch, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        # Compute prediction and loss

        pred = model(x)
        loss = loss_fn(pred, y)
        train_loss += loss.item()
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()

        # Backpropagation

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= num_batches
    correct /= size
    print(f"Train Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")

def validate_model(validation_dataloader, model, loss_fn):
    size = len(validation_dataloader.dataset)
    num_batches = len(validation_dataloader)
    model.eval()  
    val_loss, correct = 0, 0
    with torch.no_grad():
        for x, y in validation_dataloader:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            val_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    val_loss /= num_batches
    correct /= size
    print(f"Validation Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {val_loss:>8f} \n")
    return 100*correct

best_val_accuracy = 0
model_path = 'best_model.pth'  
epochs = 50
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
for epoch in range(epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    val_accuracy = validate_model(valid_dataloader, model, loss_fn)
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), model_path)
        print(f"New best model saved at epoch {epoch+1} with validation accuracy: {best_val_accuracy:.2f}%")

print("Training and validation completed. Best model saved to 'best_model.pth'")

#Change the predictions to accuracy

This is the output I currently get (this is the part that does run):

Epoch 1
-------------------------------
After conv1: torch.Size([32, 16, 60, 90])
After pool1: torch.Size([32, 16, 30, 45])
After conv2: torch.Size([32, 32, 30, 45])
After pool2: torch.Size([32, 32, 15, 22])
After conv3: torch.Size([32, 64, 15, 22])
After pool3: torch.Size([32, 64, 7, 11])

And I am getting a Runtime Error each time:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[515], line 50
     48 for epoch in range(epochs):
     49     print(f"Epoch {epoch+1}\n-------------------------------")
---> 50     train_loop(train_dataloader, model, loss_fn, optimizer)
     51     val_accuracy = validate_model(valid_dataloader, model, loss_fn)
     52     if val_accuracy > best_val_accuracy:

Cell In[515], line 11
      8 x, y = x.to(device), y.to(device)
     10 # Compute prediction and loss
---> 11 pred = model(x)
     12 loss = loss_fn(pred, y)
     13 train_loss += loss.item()

File ~\AppData\Roaming\Python\Python311\site-packages\torch\nn\modules\module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~\AppData\Roaming\Python\Python311\site-packages\torch\nn\modules\module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
...
---> 29 x = x.view(-1, 64 * 30 * 45)  # Adjust this based on the actual size after pool3
     30 x = self.fc1(x)
     31 return x

RuntimeError: shape '[-1, 86400]' is invalid for input of size 157696

The problem is probably with the shapes, or that mat1 and mat2 cannot be multiplied. Can anyone help me? I can provide more details if needed. Also, sorry for the long post; I just wanted to give as much information as possible.

Thanks for your help and attention.

1 Answer

I think the final flattening step is using the wrong dimensions. The line x.view(-1, 64 * 30 * 45) assumes each sample can be flattened to 86400 features, i.e. a (B, 86400) tensor, but the error message shows that the tensor you are actually flattening only contains 157696 elements in total.
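
As a quick sanity check, those numbers line up with the shapes printed during training (the batch size of 32 below is read straight off that printed output, nothing else):

# Shape printed after pool3 during training: [32, 64, 7, 11]
batch, channels, height, width = 32, 64, 7, 11
per_sample = channels * height * width   # 64 * 7 * 11 = 4928 features per image
total = batch * per_sample               # 32 * 4928 = 157696, the size in the error
expected = 64 * 30 * 45                  # 86400, what x.view(-1, 64 * 30 * 45) assumes

So after pool3 each image only has 4928 features, not the 86400 the view expects.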

Try this modification to get the dimensions right:

...
        x = self.pool3(x)
        print("After pool3:", x.size())

        #Modifications start here.
        #Get the dimensions of x
        B, C, H, W = x.shape
        
        #Calculate the flattened size
        flattened_length = C * H * W
        
        #Reshape to flat
        x = x.view(-1, flattened_length)
  
        x = self.fc1(x)
        return x

This also means fc1 needs to be configured with an input size that matches the flattened x; based on the sizes you printed, that is 64 * 7 * 11 = 4928 features per sample rather than the 86400 it currently expects.
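
If you would rather not hardcode that number at all, here is one way to let PyTorch work it out for you. This is only a minimal sketch, not your exact model: it keeps your layer sizes but uses nn.Flatten and nn.LazyLinear (which infers its in_features from the first batch it sees), and the 60x90 dummy size below is just a guess read off your printed shapes, so substitute whatever size your DataLoader really produces.

import torch
import torch.nn as nn

class NeuralNetwork_Conv(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(2, 2)
        self.flatten = nn.Flatten()     # flattens (B, C, H, W) to (B, C*H*W), keeping the batch dimension
        self.fc1 = nn.LazyLinear(512)   # in_features is inferred on the first forward pass

    def forward(self, x):
        x = self.pool1(self.conv1(x))
        x = self.pool2(self.conv2(x))
        x = self.pool3(self.conv3(x))
        x = self.flatten(x)
        return self.fc1(x)

# Lazy layers have no parameters until they see data, so run one dummy
# batch through the model before creating the optimizer:
model = NeuralNetwork_Conv()
with torch.no_grad():
    model(torch.zeros(1, 3, 60, 90))    # use the height/width your DataLoader actually produces
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Alternatively, keep a plain nn.Linear and compute its in_features with a dummy forward pass, much like the _get_conv_output_size method in your SceneClassificationCNN, as long as the dummy input uses the same height and width as your real training images. Either way, flattening with a batch-preserving call (nn.Flatten or x.view(x.size(0), -1)) avoids the shape error you are seeing.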
