RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x32 and 400x120)

class Net(nn.Module):
    """Small CNN classifier for 3x32x32 images (e.g. CIFAR-10) with 10 classes.

    Every convolution uses a 5x5 kernel with no padding, so it trims 4 pixels
    from both height and width; every 2x2 max-pool halves them. For a 32x32
    input the feature maps therefore evolve as:

        input             3 x 32 x 32
        conv1 -> pool     6 x 28 x 28 -> 6 x 14 x 14
        conv2 -> pool    16 x 10 x 10 -> 16 x 5 x 5
        conv3            32 x 1 x 1   (no pooling afterwards)

    Flattening the conv3 output yields 32 features per sample, which is the
    input size the first fully connected layer must accept.
    """

    def __init__(self):
        super().__init__()
        # nn.Conv2d(in_channels, out_channels, kernel_size)
        self.conv1 = nn.Conv2d(3, 6, 5)
        # nn.MaxPool2d(kernel_size, stride): keep the max of each 2x2 window,
        # halving height and width.
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.conv3 = nn.Conv2d(16, 32, 5)
        # Dropout2d zeroes whole channels with probability 0.2 during
        # training to help prevent overfitting.
        self.drop = nn.Dropout2d(p=0.2)
        # nn.Linear(in_features, out_features). in_features of the first
        # linear layer must equal channels * height * width of the tensor
        # that gets flattened, i.e. 32 * 1 * 1 after conv3.
        self.fc1 = nn.Linear(32 * 1 * 1, 120)
        # Extra hidden layer: its input matches fc1's output (120) and its
        # output matches fc2's input (120).
        self.fc1_5 = nn.Linear(120, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return class scores of shape (batch, 10) for x of shape (batch, 3, 32, 32)."""
        # ReLU (negative values -> 0) is the activation applied after every
        # hidden layer; pooling follows the first two convolutions only.
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Drop some feature channels after the 3rd conv to reduce overfitting.
        x = self.drop(F.relu(self.conv3(x)))
        # Flatten all dimensions except batch: (batch, 32, 1, 1) -> (batch, 32).
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc1_5(x))
        x = F.relu(self.fc2(x))
        # No ReLU on the last layer: it outputs the raw class scores.
        x = self.fc3(x)
        return x

I am using images from CIFAR10, which are of size 3x32x32. When I ran the code before, it failed with the error above because the input size of the self.fc1 linear layer did not match the output of the self.conv3 layer I had added.

I'm also not sure what to write for self.fc1_5. Can someone explain to me how this actually works, and what the solution is? Thank you!

I have added an extra convolutional layer: self.conv3 = nn.Conv2d(16, 32, 5). The lines under the TODO comments are where I'm stuck. I changed the fc1 line to self.fc1 = nn.Linear(16 * 16 * 5, 120); before, it was self.fc1 = nn.Linear(16 * 5 * 5, 120).

Solution

When you create a CNN for classification with a fixed input size, it's easy to figure out the size of your image by the time it has progressed through your CNN layers. Since we start with images of size [32,32] (channels are unimportant for now):

def __init__(self):
    """Define the layers, annotated with the feature-map size after each one.

    The sizes assume a 32x32 input image and a forward pass that applies the
    pooling layer after conv1 and conv2 only (not after conv3).
    """
    super().__init__()
    # nn.Conv2d(in_channels, out_channels, kernel_size)
    # A 5x5 kernel without padding loses 2 px on each side: 32x32 -> 28x28.
    self.conv1 = nn.Conv2d(3, 6, 5) # size 28x28
    # nn.MaxPool2d(kernel_size, stride): take the max value of each 2x2
    # window, which halves the height and width.
    self.pool = nn.MaxPool2d(2, 2) # size 14x14 after pooling the conv1 output
    self.conv2 = nn.Conv2d(6, 16, 5) # size 10x10 -> 5x5 after pooling
    # Third convolution: 5x5 input with a 5x5 kernel leaves a single pixel.
    self.conv3 = nn.Conv2d(16, 32, 5) # size 1x1
    # Dropout2d zeroes whole channels with probability 0.2 during training
    # to help prevent overfitting.
    self.drop = nn.Dropout2d(p=0.2)
    # nn.Linear(in_features, out_features): in_features must equal
    # height * width * channels of the flattened conv3 output.
    self.fc1 = nn.Linear(1 * 1 * 32, 120)
    self.fc1_5 = nn.Linear(120,120) # matches the output size of fc1 and input size of fc2

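The size loss from each convolution can be avoided by using padding of (K-1)//2, where K=kernel_size (this holds for odd K with stride 1). For example, nn.Conv2d(3, 6, 5, padding=2) keeps a 32x32 input at 32x32.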