I don't understand this. When I hardcode my script, it converges very well, but the softcoded version, given the same structure and learning rate, converges very slowly and then simply stops converging at some point.
Here is the softcoded version:
def BCE_loss(Y_hat, Y):
    # Binary cross-entropy averaged over the m examples (columns of Y_hat);
    # the 1e-5 guards against log(0)
    m = Y_hat.shape[1]
    cost = (-1 / m) * (np.dot(Y, np.log(Y_hat+1e-5).T) + np.dot(1-Y, np.log(1-Y_hat+1e-5).T))
    cost = np.squeeze(cost)
    return cost

def BCE_loss_backward(Y_hat, Y):
    # Derivative of the BCE loss w.r.t. the output activation
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1-Y, 1-Y_hat))
    return dA_prev

def gradient(dZ, A_prev):
    dW = np.dot(dZ, A_prev.T) * (1 / A_prev.shape[1])
    db = np.sum(dZ, axis=1, keepdims=True) * (1 / A_prev.shape[1])
    return dW, db

def update(W, b, dW, db, learning_rate):
    W -= learning_rate * dW
    b -= learning_rate * db
    return W, b
for i in range(epochs+1):
    ## Forward pass
    for l in range(1, L):
        if l==L-1:
            if out_dim==1:
                grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
                grads_GD['A'+str(l)] = sigmoid(grads_GD['Z'+str(l)])
            else:
                grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
                grads_GD['A'+str(l)] = softmax(grads_GD['Z'+str(l)])
        else:
            grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
            grads_GD['A'+str(l)] = relu(grads_GD['Z'+str(l)])

    ## Compute cost
    if out_dim==1:
        cost_GD = BCE_loss(grads_GD['A'+str(L-1)], Y)
        cost_list_GD.append(cost_GD)
    else:
        cost_GD = CE_loss(grads_GD['A'+str(L-1)], Y)
        cost_list_GD.append(cost_GD)

    ## Print cost
    if i % print_num == 0:
        print(f"Cost for gradient descent optimizer after epoch {i}: {cost_GD: .4f}")
    elif cost_GD < cost_lim or i == epochs:
        last_epoch_GD = i
        print(f"Cost for gradient descent optimizer after epoch {i}: {cost_GD: .4f}")
        break
    else:
        continue  # skips everything below (backward pass and update) for this epoch

    ## Backward pass
    if out_dim==1:
        grads_GD['dA'+str(L-1)] = BCE_loss_backward(grads_GD['A'+str(L-1)], Y)
        grads_GD['dZ'+str(L-1)] = sigmoid_backward(grads_GD['dA'+str(L-1)], grads_GD['Z'+str(L-1)])
    else:
        grads_GD['dA'+str(L-1)] = CE_loss_backward(grads_GD['A'+str(L-1)], Y)
        grads_GD['dZ'+str(L-1)] = softmax_backward(grads_GD['dA'+str(L-1)], grads_GD['Z'+str(L-1)])
    grads_GD['dW'+str(L-1)], grads_GD['db'+str(L-1)] = gradient(grads_GD['dZ'+str(L-1)], grads_GD['A'+str(L-2)])
    for l in reversed(range(1, L-1)):
        grads_GD['dA'+str(l)] = linear_backward(params_GD['W'+str(l+1)], grads_GD['dZ'+str(l+1)])
        grads_GD['dZ'+str(l)] = relu_backward(grads_GD['dA'+str(l)], grads_GD['Z'+str(l)])
        grads_GD['dW'+str(l)], grads_GD['db'+str(l)] = gradient(grads_GD['dZ'+str(l)], grads_GD['A'+str(l-1)])

    ## Update parameters
    for l in range(1, L):
        params_GD['W'+str(l)], params_GD['b'+str(l)] = update(params_GD['W'+str(l)], params_GD['b'+str(l)], grads_GD['dW'+str(l)], grads_GD['db'+str(l)], learning_rate)
And here is the hardcoded version:
def cost_function(Y, A4, N, epsilon):
    cost = (-1 / N) * np.sum(np.multiply(Y, np.log(A4 + epsilon)) + np.multiply(1 - Y, np.log(1 - A4 + epsilon)))
    return cost
for i in range(epochs):
    ## Forward pass
    Z1_GD = np.dot(W1_GD, X) + b1_GD
    A1_GD = np.maximum(0, Z1_GD)
    Z2_GD = np.dot(W2_GD, A1_GD) + b2_GD
    A2_GD = np.maximum(0, Z2_GD)
    Z3_GD = np.dot(W3_GD, A2_GD) + b3_GD
    A3_GD = np.maximum(0, Z3_GD)
    Z4_GD = np.dot(W4_GD, A3_GD) + b4_GD
    A4_GD = class_layer(Z4_GD)

    ## Backward pass
    dZ4_GD = A4_GD - Y
    dW4_GD = np.dot(dZ4_GD, A3_GD.T) * (1. / A3_GD.shape[1])
    db4_GD = np.sum(dZ4_GD, axis=1, keepdims=True) * (1. / A3_GD.shape[1])
    dA3_GD = np.dot(W4_GD.T, dZ4_GD)
    dZ3_GD = np.array(dA3_GD, copy=True)
    dZ3_GD[Z3_GD <= 0] = 0
    dW3_GD = np.dot(dZ3_GD, A2_GD.T) * (1. / A2_GD.shape[1])
    db3_GD = np.sum(dZ3_GD, axis=1, keepdims=True) * (1. / A2_GD.shape[1])
    dA2_GD = np.dot(W3_GD.T, dZ3_GD)
    dZ2_GD = np.array(dA2_GD, copy=True)
    dZ2_GD[Z2_GD <= 0] = 0
    dW2_GD = np.dot(dZ2_GD, A1_GD.T) * (1. / A1_GD.shape[1])
    db2_GD = np.sum(dZ2_GD, axis=1, keepdims=True) * (1. / A1_GD.shape[1])
    dA1_GD = np.dot(W2_GD.T, dZ2_GD)
    dZ1_GD = np.array(dA1_GD, copy=True)
    dZ1_GD[Z1_GD <= 0] = 0
    dW1_GD = np.dot(dZ1_GD, X.T) * (1. / X.shape[1])
    db1_GD = np.sum(dZ1_GD, axis=1, keepdims=True) * (1. / X.shape[1])

    ## Update parameters
    W1_GD = W1_GD - learning_rate * dW1_GD
    b1_GD = b1_GD - learning_rate * db1_GD
    W2_GD = W2_GD - learning_rate * dW2_GD
    b2_GD = b2_GD - learning_rate * db2_GD
    W3_GD = W3_GD - learning_rate * dW3_GD
    b3_GD = b3_GD - learning_rate * db3_GD
    W4_GD = W4_GD - learning_rate * dW4_GD
    b4_GD = b4_GD - learning_rate * db4_GD

    ## Compute cost
    cost_GD = cost_function(Y, A4_GD, N, epsilon)
    cost_GD = np.squeeze(cost_GD)
    cost_list_GD.append(cost_GD)
I suppose something went wrong during softcoding.
I solved it myself. Apparently, the "else: continue" branch in the print-cost section caused the algorithm to do the backward pass only once; after that, it just kept looping through the forward pass. Can anyone explain the reason for this behavior?
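Here is a minimal, stand-alone sketch of what I think was going on (the loop bound, the modulus, and the print strings are made up purely for illustration and are not from my actual script):

for i in range(6):
    # the "forward pass" part runs every epoch
    print("forward pass, epoch", i)
    if i % 3 == 0:          # plays the role of `i % print_num == 0`
        print("printing cost at epoch", i)
    elif i == 5:            # plays the role of the cost_lim / last-epoch check
        print("stopping at epoch", i)
        break               # leaves the loop entirely
    else:
        continue            # jumps straight to the next iteration of the for loop
    # only reached when the first branch was taken
    print("backward pass and update, epoch", i)

If I run this, "backward pass and update" is printed only for epochs 0 and 3, i.e. only when the first if branch falls through; every epoch that hits the bare continue skips the rest of the loop body entirely, which matches what I saw in my training loop.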