Tags: python, neural-network, backpropagation

Neural network XOR gate classification


I've written a simple neural network that should predict the XOR gate function. I think I've used the math correctly, but the loss doesn't go down and remains near 0.6. Can anyone help me find the reason why?

import numpy as np
import matplotlib.pyplot as plt

train_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
train_Y = np.array([[0,1,1,0]])
test_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
test_Y = np.array([[0,1,1,0]])

learning_rate = 0.1   # note: the update below uses a hard-coded 0.03 instead
S = 5

def sigmoid(z):
    return 1/(1+np.exp(-z))

def sigmoid_derivative(z):
    return sigmoid(z)*(1-sigmoid(z))

S0, S1, S2 = 2, 5, 1   # layer sizes: 2 inputs, 5 hidden units, 1 output
m = 4                  # number of training examples

w1 = np.random.randn(S1, S0) * 0.01
b1 = np.zeros((S1, 1))
w2 = np.random.randn(S2, S1) * 0.01
b2 = np.zeros((S2, 1))

for i in range(1000000):
    # forward pass
    Z1 = np.dot(w1, train_X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(w2, A1) + b2
    A2 = sigmoid(Z2)

    # cross-entropy cost, averaged over the m examples
    J = np.sum(-train_Y * np.log(A2) + (train_Y-1) * np.log(1-A2)) / m

    # backpropagation
    dZ2 = A2 - train_Y
    dW2 = np.dot(dZ2, A1.T) / m
    dB2 = np.sum(dZ2, axis = 1, keepdims = True) / m
    dZ1 = np.dot(w2.T, dZ2) * sigmoid_derivative(Z1)
    dW1 = np.dot(dZ1, train_X.T) / m
    dB1 = np.sum(dZ1, axis = 1, keepdims = True) / m

    # gradient-descent update (with the hard-coded step size 0.03)
    w1 = w1 - dW1 * 0.03
    w2 = w2 - dW2 * 0.03
    b1 = b1 - dB1 * 0.03
    b2 = b2 - dB2 * 0.03

    print(J)

Solution

  • I think your dZ2 is not correct, since you do not multiply it by the derivative of the sigmoid.

    For the XOR problem, if you inspect the outputs, the 1's are slightly higher than 0.5 and the 0's are slightly lower. I believe this is because the search has reached a plateau and is therefore progressing very slowly. I tried RMSProp, which converged to almost 0 very fast; since RMSProp scales each step by a running RMS of the gradient, even the tiny gradients on a plateau still produce reasonably sized updates. I also tried a pseudo second-order algorithm, RProp, which converged almost immediately (I used iRProp-; a rough sketch of that update is included after the code below). I am showing the plot for RMSProp below.

    (Plot: training error vs. iteration for RMSProp.)

    Also, the final output of the network is now

    [[1.67096234e-06 9.99999419e-01 9.99994158e-01 6.87836337e-06]]

    Rounding this gives

    array([[0., 1., 1., 0.]])

    But I would highly recommend performing gradient checking to be sure that the analytical gradients match the ones computed numerically (a minimal sketch of such a check is included after the code below). Also see Andrew Ng's Coursera lecture on gradient checking.

    I am adding the modified code with the RMSProp implementation below.

    #!/usr/bin/python3
    
    import numpy as np
    import matplotlib.pyplot as plt
    
    train_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
    train_Y = np.array([[0,1,1,0]])
    test_X = np.array([[0,0],[0,1],[1,0],[1,1]]).T
    test_Y = np.array([[0,1,1,0]])
    
    learning_rate = 0.1
    S = 5
    
    def sigmoid(z):
        return 1/(1+np.exp(-z))
    
    def sigmoid_derivative(z):
        return sigmoid(z)*(1-sigmoid(z))
    
    S0, S1, S2 = 2, 5, 1
    m = 4
    
    w1 = np.random.randn(S1, S0) * 0.01
    b1 = np.zeros((S1, 1))
    w2 = np.random.randn(S2, S1) * 0.01
    b2 = np.zeros((S2, 1))
    
    # RMSProp variables
    dWsqsum1 = np.zeros_like(w1)
    dWsqsum2 = np.zeros_like(w2)
    dBsqsum1 = np.zeros_like(b1)
    dBsqsum2 = np.zeros_like(b2)
    alpha = 0.9
    lr = 0.01
    
    err_vec = []
    
    for i in range(20000):
        Z1 = np.dot(w1, train_X) + b1
        A1 = sigmoid(Z1)
        Z2 = np.dot(w2, A1) + b2
        A2 = sigmoid(Z2)
    
        J = np.sum(-train_Y * np.log(A2) + (train_Y-1) * np.log(1-A2)) / m
    
        dZ2 = (A2 - train_Y) * sigmoid_derivative(Z2)
        dW2 = np.dot(dZ2, A1.T) / m
        dB2 = np.sum(dZ2, axis = 1, keepdims = True) / m
        dZ1 = np.dot(w2.T, dZ2) * sigmoid_derivative(Z1)
        dW1 = np.dot(dZ1, train_X.T) / m
        dB1 = np.sum(dZ1, axis = 1, keepdims = True) / m
    
        # RMSProp update: exponential moving average of the squared gradients
        dWsqsum1 = alpha * dWsqsum1 + (1 - alpha) * np.square(dW1)
        dWsqsum2 = alpha * dWsqsum2 + (1 - alpha) * np.square(dW2)
        dBsqsum1 = alpha * dBsqsum1 + (1 - alpha) * np.square(dB1)
        dBsqsum2 = alpha * dBsqsum2 + (1 - alpha) * np.square(dB2)
    
    
        # scale each step by the root of the running average (small epsilon for stability)
        w1 = w1 - (lr * dW1 / (np.sqrt(dWsqsum1) + 10e-10))
        w2 = w2 - (lr * dW2 / (np.sqrt(dWsqsum2) + 10e-10))
        b1 = b1 - (lr * dB1 / (np.sqrt(dBsqsum1) + 10e-10))
        b2 = b2 - (lr * dB2 / (np.sqrt(dBsqsum2) + 10e-10))
    
        print(J)
        err_vec.append(J)
    
    
    # final forward pass with the trained weights
    Z1 = np.dot(w1, train_X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(w2, A1) + b2
    A2 = sigmoid(Z2)
    
    print("\n", A2)
    
    plt.plot(np.array(err_vec))
    plt.show()
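    
    Here is a minimal sketch of such a numerical check (my own illustration, not part of the code above; the helper names numerical_gradient and cost are made up for this example, and cost simply recomputes the cross-entropy loss from the current w1, b1, w2, b2):
    
    def numerical_gradient(cost, param, eps=1e-7):
        # central-difference estimate of d(cost)/d(param), one entry at a time
        grad = np.zeros_like(param)
        it = np.nditer(param, flags=['multi_index'])
        while not it.finished:
            idx = it.multi_index
            orig = param[idx]
            param[idx] = orig + eps
            cost_plus = cost()
            param[idx] = orig - eps
            cost_minus = cost()
            param[idx] = orig              # restore the original value
            grad[idx] = (cost_plus - cost_minus) / (2 * eps)
            it.iternext()
        return grad
    
    def cost():
        # recompute the cross-entropy cost with the current parameters
        A1 = sigmoid(np.dot(w1, train_X) + b1)
        A2 = sigmoid(np.dot(w2, A1) + b2)
        return np.sum(-train_Y * np.log(A2) + (train_Y - 1) * np.log(1 - A2)) / m
    
    # Example: compare against the analytical dW1 computed at the same parameter values
    # num_dW1 = numerical_gradient(cost, w1)
    # rel_err = np.linalg.norm(num_dW1 - dW1) / (np.linalg.norm(num_dW1) + np.linalg.norm(dW1))
    # a relative error around 1e-7 or smaller means the analytical gradient looks correct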
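    
    And for completeness, here is a rough sketch of an iRProp- style update (just an illustration with the usual Rprop constants, not the exact code I ran; it would replace the RMSProp update inside the training loop, keeping one step-size array and one previous-gradient array per parameter):
    
    def irprop_minus(param, grad, prev_grad, step,
                     eta_plus=1.2, eta_minus=0.5, step_max=50.0, step_min=1e-6):
        # the sign of this product tells whether the gradient changed direction
        change = prev_grad * grad
        # grow the step size where the sign stayed the same, shrink it where it flipped
        step = np.where(change > 0, np.minimum(step * eta_plus, step_max), step)
        step = np.where(change < 0, np.maximum(step * eta_minus, step_min), step)
        # iRProp-: drop the gradient where the sign flipped, so no update is made there
        grad = np.where(change < 0, 0.0, grad)
        # move each parameter by its own step size, using only the sign of the gradient
        param = param - np.sign(grad) * step
        return param, grad, step
    
    # usage for one parameter (same pattern for w2, b1, b2):
    # step_w1, prev_dW1 = np.full_like(w1, 0.1), np.zeros_like(w1)      # once, before the loop
    # w1, prev_dW1, step_w1 = irprop_minus(w1, dW1, prev_dW1, step_w1)  # inside the loop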