I don't understand this. When I hardcode my script, it converges very well, but the softcoded version, given the same structure and learning rate, converges very slowly and then simply stops converging at some point.
Here is the softcoded version:
def BCE_loss(Y_hat, Y):
    # Binary cross-entropy averaged over the m examples (columns of Y_hat);
    # the 1e-5 guards against log(0)
    m = Y_hat.shape[1]
    cost = (-1 / m) * (np.dot(Y, np.log(Y_hat+1e-5).T) + np.dot(1-Y, np.log(1-Y_hat+1e-5).T))
    cost = np.squeeze(cost)
    return cost

def BCE_loss_backward(Y_hat, Y):
    # Derivative of the BCE loss w.r.t. the output activation
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1-Y, 1-Y_hat))
    return dA_prev

def gradient(dZ, A_prev):
    dW = np.dot(dZ, A_prev.T) * (1 / A_prev.shape[1])
    db = np.sum(dZ, axis=1, keepdims=True) * (1 / A_prev.shape[1])
    return dW, db

def update(W, b, dW, db, learning_rate):
    W -= learning_rate * dW
    b -= learning_rate * db
    return W, b
for i in range(epochs+1):
    ## Forward pass
    for l in range(1, L):
        if l==L-1:
            if out_dim==1:
                grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
                grads_GD['A'+str(l)] = sigmoid(grads_GD['Z'+str(l)])
            else:
                grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
                grads_GD['A'+str(l)] = softmax(grads_GD['Z'+str(l)])
        else:
            grads_GD['Z'+str(l)] = linear(params_GD['W'+str(l)], grads_GD['A'+str(l-1)], params_GD['b'+str(l)])
            grads_GD['A'+str(l)] = relu(grads_GD['Z'+str(l)])

    ## Compute cost
    if out_dim==1:
        cost_GD = BCE_loss(grads_GD['A'+str(L-1)], Y)
        cost_list_GD.append(cost_GD)
    else:
        cost_GD = CE_loss(grads_GD['A'+str(L-1)], Y)
        cost_list_GD.append(cost_GD)

    ## Print cost
    if i % print_num == 0:
        print(f"Cost for gradient descent optimizer after epoch {i}: {cost_GD: .4f}")
    elif cost_GD < cost_lim or i == epochs:
        last_epoch_GD = i
        print(f"Cost for gradient descent optimizer after epoch {i}: {cost_GD: .4f}")
        break
    else:
        continue  # skips everything below (backward pass and update) for this epoch

    ## Backward pass
    if out_dim==1:
        grads_GD['dA'+str(L-1)] = BCE_loss_backward(grads_GD['A'+str(L-1)], Y)
        grads_GD['dZ'+str(L-1)] = sigmoid_backward(grads_GD['dA'+str(L-1)], grads_GD['Z'+str(L-1)])
    else:
        grads_GD['dA'+str(L-1)] = CE_loss_backward(grads_GD['A'+str(L-1)], Y)
        grads_GD['dZ'+str(L-1)] = softmax_backward(grads_GD['dA'+str(L-1)], grads_GD['Z'+str(L-1)])
    grads_GD['dW'+str(L-1)], grads_GD['db'+str(L-1)] = gradient(grads_GD['dZ'+str(L-1)], grads_GD['A'+str(L-2)])
    for l in reversed(range(1, L-1)):
        grads_GD['dA'+str(l)] = linear_backward(params_GD['W'+str(l+1)], grads_GD['dZ'+str(l+1)])
        grads_GD['dZ'+str(l)] = relu_backward(grads_GD['dA'+str(l)], grads_GD['Z'+str(l)])
        grads_GD['dW'+str(l)], grads_GD['db'+str(l)] = gradient(grads_GD['dZ'+str(l)], grads_GD['A'+str(l-1)])

    ## Update parameters
    for l in range(1, L):
        params_GD['W'+str(l)], params_GD['b'+str(l)] = update(params_GD['W'+str(l)], params_GD['b'+str(l)], grads_GD['dW'+str(l)], grads_GD['db'+str(l)], learning_rate)
And here is the hardcoded version:
def cost_function(Y, A4, N, epsilon):
    cost = (-1 / N) * np.sum(np.multiply(Y, np.log(A4 + epsilon)) + np.multiply(1 - Y, np.log(1 - A4 + epsilon)))
    return cost
for i in range(epochs):
    ## Forward pass
    Z1_GD = np.dot(W1_GD, X) + b1_GD
    A1_GD = np.maximum(0, Z1_GD)
    Z2_GD = np.dot(W2_GD, A1_GD) + b2_GD
    A2_GD = np.maximum(0, Z2_GD)
    Z3_GD = np.dot(W3_GD, A2_GD) + b3_GD
    A3_GD = np.maximum(0, Z3_GD)
    Z4_GD = np.dot(W4_GD, A3_GD) + b4_GD
    A4_GD = class_layer(Z4_GD)

    ## Backward pass
    dZ4_GD = A4_GD - Y
    dW4_GD = np.dot(dZ4_GD, A3_GD.T) * (1. / A3_GD.shape[1])
    db4_GD = np.sum(dZ4_GD, axis=1, keepdims=True) * (1. / A3_GD.shape[1])
    dA3_GD = np.dot(W4_GD.T, dZ4_GD)
    dZ3_GD = np.array(dA3_GD, copy=True)
    dZ3_GD[Z3_GD <= 0] = 0
    dW3_GD = np.dot(dZ3_GD, A2_GD.T) * (1. / A2_GD.shape[1])
    db3_GD = np.sum(dZ3_GD, axis=1, keepdims=True) * (1. / A2_GD.shape[1])
    dA2_GD = np.dot(W3_GD.T, dZ3_GD)
    dZ2_GD = np.array(dA2_GD, copy=True)
    dZ2_GD[Z2_GD <= 0] = 0
    dW2_GD = np.dot(dZ2_GD, A1_GD.T) * (1. / A1_GD.shape[1])
    db2_GD = np.sum(dZ2_GD, axis=1, keepdims=True) * (1. / A1_GD.shape[1])
    dA1_GD = np.dot(W2_GD.T, dZ2_GD)
    dZ1_GD = np.array(dA1_GD, copy=True)
    dZ1_GD[Z1_GD <= 0] = 0
    dW1_GD = np.dot(dZ1_GD, X.T) * (1. / X.shape[1])
    db1_GD = np.sum(dZ1_GD, axis=1, keepdims=True) * (1. / X.shape[1])

    ## Update parameters
    W1_GD = W1_GD - learning_rate * dW1_GD
    b1_GD = b1_GD - learning_rate * db1_GD
    W2_GD = W2_GD - learning_rate * dW2_GD
    b2_GD = b2_GD - learning_rate * db2_GD
    W3_GD = W3_GD - learning_rate * dW3_GD
    b3_GD = b3_GD - learning_rate * db3_GD
    W4_GD = W4_GD - learning_rate * dW4_GD
    b4_GD = b4_GD - learning_rate * db4_GD

    ## Compute cost
    cost_GD = cost_function(Y, A4_GD, N, epsilon)
    cost_GD = np.squeeze(cost_GD)
    cost_list_GD.append(cost_GD)
I suppose something went wrong during softcoding.
I solved it myself. Apparently, the "else: continue" branch in the print-cost section caused the algorithm to do the backward pass only once; after that, it just kept looping through the forward pass. Can anyone explain the reason for this behavior?
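Here is a minimal, stand-alone sketch of what I think was going on (the loop bound, the modulus, and the print strings are made up purely for illustration and are not from my actual script):

for i in range(6):
    # the "forward pass" part runs every epoch
    print("forward pass, epoch", i)
    if i % 3 == 0:          # plays the role of `i % print_num == 0`
        print("printing cost at epoch", i)
    elif i == 5:            # plays the role of the cost_lim / last-epoch check
        print("stopping at epoch", i)
        break               # leaves the loop entirely
    else:
        continue            # jumps straight to the next iteration of the for loop
    # only reached when the first branch was taken
    print("backward pass and update, epoch", i)

If I run this, "backward pass and update" is printed only for epochs 0 and 3, i.e. only when the first if branch falls through; every epoch that hits the bare continue skips the rest of the loop body entirely, which matches what I saw in my training loop.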