class Net(nn.Module):
def __init__(self):
super().__init__()
#(input channel, output channel, kenel size)
#channel is a dimension of a tensor which is a container that can house data in N dimensions (matrices)
self.conv1 = nn.Conv2d(3, 6, 5)
#shrink the image stack by pooling(kernel size, stride(shift)) and take max value per window
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
#TODO: add conv3
self.conv3 = nn.Conv2d(16, 32, 5)
#drop layer deletes 20% of the feautures to help prevent overfitting
self.drop = nn.Dropout2d(p=0.2)
#linear predicts the output as a linear function of inputs
#(output channels, height, width, batch size
#TODO:
self.fc1 = nn.Linear(16 * 16 * 5, 120)
#TODO:
self.fc1_5 = nn.Linear()
#layer(size of input, size of output)
#Linear layer=Fully connected layer
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
#F.ReLUs change negative values to 0. Apply to all stack of images.
#they are activation functions. We apply it after each liner layer.
#only used in hidden layers.
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
#Select some feautures to drop after 3rd conv to prevent overfitting
x = self.drop(F.relu(self.conv3(x)))
x = torch.flatten(x, 1) # flatten all dimensions except batch into 1-D
x = F.relu(self.fc1(x))
#TODO: add fc1_5
x = F.relu(self.fc1_5(x))
x = F.relu(self.fc2(x))
#Feed to Fully connected layer to predict class
x = self.fc3(x) # no relu b/c it's a last layer.
return x
I am using images from CIFAR10 which are of size 3x32x32. When I ran the code before, it stopped because self.fc1 linear layer size did not work with self.conv3 I've added.
I'm also not sure what to write for self.fc1_5. Can someone explain me how this is actually working and the solution as well? Thank you!
I have added an extra convolutional layer and you can see it is self.conv3 = nn.Conv2d(16, 32, 5). Lines under the TODO are where I'm stuck at. I updated the line to: self.fc1 = nn.Linear(16 * 16 * 5, 120) before, it was: self.fc1 = nn.Linear(16 * 5 * 5, 120).
When you create a CNN for classification with a fixed input size, it's easy to figure out the size of your image by the time it has progressed through your CNN layers. Since we start with images of size [32,32]
(channels are unimportant for now):
def __init__(self):
super().__init__()
#(input channel, output channel, kenel size)
#channel is a dimension of a tensor which is a container that can house data in N dimensions (matrices)
self.conv1 = nn.Conv2d(3, 6, 5) # size 28x28 - lose 2 px from each side with a kernel of size 5
#shrink the image stack by pooling(kernel size, stride(shift)) and take max value per window
self.pool = nn.MaxPool2d(2, 2) # size 14x14 - max pooling with K=2 halves the image size
self.conv2 = nn.Conv2d(6, 16, 5) # size 10x10 -> 5x5 after pooling
#TODO: add conv3
self.conv3 = nn.Conv2d(16, 32, 5) # size 1x1
#drop layer deletes 20% of the feautures to help prevent overfitting
self.drop = nn.Dropout2d(p=0.2)
#linear predicts the output as a linear function of inputs
#(output channels, height, width, batch size
self.fc1 = nn.Linear(1 * 1 * 32, 120)
self.fc1_5 = nn.Linear(120,120) # matches the output size of fc1 and input size of fc2
The CNN size losses can be negated by using padding of (K-1)//2
, where K=kernel_size
.