To get a better understanding of CNNs, I implemented one without using any framework. The network structure is shown below; forward propagation works, and backward propagation succeeds until it returns to the first convolutional layer.
network = [
    Conv(input_shape=(128,96,96,3), kernel_shape=(4,4), num_kernels=40, stride=2, optimizer=GD_M, conv_mode="valid", kernel_initializer="he_normal", name="C1"),
    ReLU(name="R1"),
    ValidMaxPooling(3, 2, name="MP1"),
    Conv(input_shape=(128,23,23,40), kernel_shape=(2,2), num_kernels=105, stride=1, optimizer=GD_M, conv_mode="valid", kernel_initializer="he_normal", name="C2"),
    ReLU(name="R2"),
    ValidMaxPooling(4, 2, name="MP2"),
    Conv(input_shape=(128,10,10,105), kernel_shape=(2,2), num_kernels=158, stride=1, optimizer=GD_M, conv_mode="valid", kernel_initializer="he_normal", name="C3"),
    ReLU(name="R3"),
    Conv(input_shape=(128,9,9,158), kernel_shape=(2,2), num_kernels=158, stride=1, optimizer=GD_M, conv_mode="valid", kernel_initializer="he_normal", name="C4"),
    ReLU(name="R4"),
    Conv(input_shape=(128,8,8,158), kernel_shape=(2,2), num_kernels=105, stride=1, optimizer=GD_M, conv_mode="valid", kernel_initializer="he_normal", name="C5"),
    ReLU(name="R5"),
    ValidMaxPooling(3, 2, name="MP3"),
    Flatten(name="F1"),
    Dense(input_neurons=945, output_neurons=945, optimizer=GD_M, name="D1"),
    ReLU(name="R6"),
    Dense(input_neurons=945, output_neurons=475, optimizer=GD_M, name="D2"),
    ReLU(name="R7"),
    Dense(input_neurons=475, output_neurons=8, optimizer=GD_M, name="D3"),
    Softmax(name="S1"),
]
While the gradient with respect to the input of the first Conv layer does not need to be passed back to a prior layer, the weights and biases still need to be updated. However, I seem to be having trouble with the feature map/input dimensions in this layer, and only this layer. The input to the network is a batch of 128 RGB images of size 96 x 96, and the kernel size is (4,4).
The Conv layer's backward method is:
def backward(self, de_dy):
    de_db = de_dy
    de_dk_store = np.zeros(shape=(len(self.input), *self.kernels.shape))
    de_dx_store = np.zeros(shape=self.input.shape)
    for b in range(self.batch_size):
        for k in range(self.num_kernels):
            for i in range(self.kernels.shape[1]):
                de_dk_store[b, k, i] = correlate2d(self.input[b, i], de_dy[k], "valid")
                de_dx_store[b, i] += convolve2d(de_dy[k], self.kernels[k, i], "full")
    de_dk_avg = np.mean(de_dk_store, axis=0)
    de_dx_avg = np.mean(de_dx_store, axis=0)
    self.kernels = self.optimizer.apply_optimizer(self.name + ":K", self.kernels, de_dk_avg)
    self.biases = self.optimizer.apply_optimizer(self.name + ":B", self.biases, de_db)
    return de_dx_avg
and the specific error is:
de_dk_store[b,k,i] = correlate2d(self.input[b, i], de_dy[k], "valid")
ValueError: could not broadcast input array from shape (50,50) into shape (4,4)
i.e. correlate2d(self.input[b, i], de_dy[k], "valid") produces an array of shape (50,50) and tries to assign it to a (4,4) slot.
A potential issue could be the stride, as this is the only layer with a stride greater than one.
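For reference, a quick shape check (a small sketch, with the sizes taken from C1's configuration above) reproduces the mismatch and shows why only the stride-2 layer fails:

# Sketch of the shape arithmetic for C1 (values assumed from the configuration above)
H, K, S = 96, 4, 2          # input size, kernel size, stride
out = (H - K) // S + 1      # 47: spatial size of C1's output, i.e. of de_dy[k]
valid = H - out + 1         # 50: size of correlate2d(input, de_dy[k], "valid")
print(out, valid)           # 47 50 -> a (50,50) result cannot fit a (4,4) slot
# With stride 1, out would be H - K + 1 and the "valid" correlation would come out
# exactly kernel-sized, which is why the stride-1 Conv layers backpropagate without error.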
The forward method of the layer is as follows:
def forward(self, input_array): # batch, channels, height, width
    self.input = input_array
    self.batch_size, self.channels, input_height, input_width = input_array.shape
    self.output_height = (input_height - self.pool_size) // self.stride + 1
    self.output_width = (input_width - self.pool_size) // self.stride + 1
    pooled_array = np.zeros((self.batch_size, self.channels, self.output_height, self.output_width))
    self.gradient_indexes = np.zeros(shape=(self.batch_size * self.channels * self.output_height * self.output_width, 4), dtype=int)
    z = 0
    for b in range(self.batch_size):
        for c in range(self.channels):
            m = input_array[b][c]
            for i in range(self.output_height): # rows
                for j in range(self.output_width): # columns
                    patch = m[i*self.stride:i*self.stride+self.pool_size, j*self.stride:j*self.stride+self.pool_size]
                    pooled_array[b, c, i, j] = np.max(patch)
                    max_index_in_patch = np.unravel_index(np.argmax(patch, axis=None), patch.shape)
                    max_index_in_input = [b, c, int(i*self.stride) + int(max_index_in_patch[0]), int(j*self.stride) + int(max_index_in_patch[1])]
                    self.gradient_indexes[z] = max_index_in_input
                    z += 1
    return pooled_array
The problem was that my backward method wasn't designed to handle a stride greater than 1 being applied in the forward pass. To resolve the issue, de_dy had to be dilated before performing the correlate2d operation.
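As a small illustration (a toy example, separate from the layer code), dilating for a stride of 2 just means inserting a row and column of zeros between every pair of neighbouring gradient elements:

import numpy as np

g = np.arange(1, 10).reshape(3, 3)                            # toy 3x3 gradient
dilated = np.zeros((2 * g.shape[0] - 1, 2 * g.shape[1] - 1))  # 5x5 for stride 2
dilated[::2, ::2] = g                                         # place values at even positions
# dilated:
# [[1. 0. 2. 0. 3.]
#  [0. 0. 0. 0. 0.]
#  [4. 0. 5. 0. 6.]
#  [0. 0. 0. 0. 0.]
#  [7. 0. 8. 0. 9.]]

For C1 this turns the 47 x 47 gradient into a 93 x 93 array, and the "valid" correlation against the 96 x 96 input then produces the expected (4,4) kernel gradient.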
The updated backward method is as follows:
def backward(self, de_dy):
    de_db = de_dy
    de_dk_store = np.zeros(shape=(len(self.input), *self.kernels.shape))
    de_dx_store = np.zeros(shape=self.input.shape)
    if self.input_layer == True: # Hardcoded to deal with first layer
        for b in range(self.batch_size):
            for k in range(self.num_kernels):
                for i in range(self.kernels.shape[1]):
                    dilated_matrix = np.zeros((2 * de_dy[k].shape[0] - 1, 2 * de_dy[k].shape[1] - 1))
                    dilated_matrix[::2, ::2] = de_dy[k]
                    de_dk_store[b, k, i] = correlate2d(self.input[b, i], dilated_matrix, "valid")
        de_dk_avg = np.mean(de_dk_store, axis=0)
        self.kernels = self.optimizer.apply_optimizer(self.name + ":K", self.kernels, de_dk_avg)
        self.biases = self.optimizer.apply_optimizer(self.name + ":B", self.biases, de_db)
        return
    for b in range(self.batch_size):
        for k in range(self.num_kernels):
            for i in range(self.kernels.shape[1]):
                de_dk_store[b, k, i] = correlate2d(self.input[b, i], de_dy[k], "valid")
                de_dx_store[b, i] += convolve2d(de_dy[k], self.kernels[k, i], "full")
    de_dk_avg = np.mean(de_dk_store, axis=0)
    de_dx_avg = np.mean(de_dx_store, axis=0)
    self.kernels = self.optimizer.apply_optimizer(self.name + ":K", self.kernels, de_dk_avg)
    self.biases = self.optimizer.apply_optimizer(self.name + ":B", self.biases, de_db)
    return de_dx_avg
Some of the code within 'if self.input_layer == True:' is hardcoded for that first layer (the dilation assumes a stride of 2), so it won't generalize to layers with different strides.
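A more general version (a sketch I haven't wired into the layer, assuming a single stride value shared by both spatial dimensions) would derive the dilation from the layer's stride instead of hardcoding the factor of 2:

def dilate(grad, stride):
    # Insert (stride - 1) zeros between neighbouring gradient elements, giving an
    # array of size (n - 1) * stride + 1 along each spatial dimension.
    if stride == 1:
        return grad
    h, w = grad.shape
    dilated = np.zeros(((h - 1) * stride + 1, (w - 1) * stride + 1))
    dilated[::stride, ::stride] = grad
    return dilated

With stride=2 this reproduces the hardcoded behaviour above, and with stride=1 it returns the gradient unchanged, so the same backward path could in principle serve both branches.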
For further information regarding the de_dy dilation, I'd recommend this article: https://medium.com/@mayank.utexas/backpropagation-for-convolution-with-strides-fb2f2efc4faa
The same author also has an article addressing the calculation of de_dx with strides > 1, but since I was only dealing with the input layer, I only needed the derivatives of the weights.
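For completeness, my understanding of that approach (an untested sketch, reusing the hypothetical dilate helper above and assuming, as for C1, that the strided output tiles the input exactly) is that de_dx uses the same dilation before the usual full convolution:

from scipy.signal import convolve2d
import numpy as np

de_dy_k = np.random.randn(47, 47)   # C1's output gradient for one kernel (toy values)
kernel = np.random.randn(4, 4)      # one (4,4) kernel channel (toy values)
# Dilation restores the positions the stride skipped, so the full convolution recovers
# an array of the input's spatial size: (out - 1) * stride + kernel = 46 * 2 + 4 = 96.
de_dx = convolve2d(dilate(de_dy_k, 2), kernel, "full")
print(de_dx.shape)                  # (96, 96)
# If the stride did not tile the input exactly, the result would need zero-padding
# on the bottom/right edges to match the input shape.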