python machine-learning deep-learning backpropagation

Output vector of the final layer of a neural net for a classification problem is stuck at 0.5

The output layer is stuck at the vector [0.5, 0.5]. Can anyone help me understand whether there is a problem with the code?

The neural net I'm trying to train is an XOR gate, so the output vector should be close to the one-hot vector representing the correct class (0 or 1 in this case), but after all epochs the output vector still stays at [0.5, 0.5].

class Backpropogation:
    """Fully connected sigmoid network trained with per-sample backpropagation.

    Layers are numbered from 0 (the input) to ``num_layers - 1`` (the output).
    Weights, biases, the cached pre-activations / activations of the latest
    forward pass and all gradients live in dicts keyed by layer index. Every
    vector is a column vector of shape ``(layer_size, 1)``.
    """

    def setupWeightsBiases(self):
        """Create a random weight matrix and bias column for layers 1..num_layers-1."""
        # NOTE(review): `rnd` is not defined in this snippet; presumably it is
        # `numpy.random` imported elsewhere in the file - confirm.
        for i in range(1, self.num_layers):
            self.weights_dict[i] = rnd.rand(self.layer_spec[i], self.layer_spec[i - 1])
            self.bias_dict[i] = rnd.rand(self.layer_spec[i], 1)

    def __init__(self, hidden_layer_neurons_tuple, train_data, num_output_classes, output_layer_func='sigmoid'):
        """Store the training set and build the randomly initialised network.

        Args:
            hidden_layer_neurons_tuple: number of neurons in each hidden layer.
            train_data: pair ``(inputs, labels)``. ``inputs`` is an array with
                one sample per first-axis entry; ``labels[i]`` is the integer
                class index of sample ``i``.
            num_output_classes: size of the output layer.
            output_layer_func: accepted for compatibility; currently unused
                (the output layer is always sigmoid).
        """
        self.train_input = train_data[0]
        self.input_layer_size = self.train_input[0].size

        # Flatten each sample and transpose so that every column is one sample.
        self.train_input = self.train_input.reshape(self.train_input.shape[0], self.input_layer_size).T

        self.output_layer_size = num_output_classes
        self.train_output = train_data[1]
        print(self.train_output.shape)

        num_hidden_layer = len(hidden_layer_neurons_tuple)
        self.hidden_layer_neurons_tuple = hidden_layer_neurons_tuple
        # Layer sizes from input to output, e.g. (2, 2, 2, 2) for hidden (2, 2).
        self.layer_spec = tuple(
            [self.input_layer_size] + list(hidden_layer_neurons_tuple) + [num_output_classes]
        )

        self.num_layers = num_hidden_layer + 2
        self.train_data = train_data
        self.activation_layer_gradient_dict = {}
        self.preactivation_layer_gradient_dict = {}
        self.weights_gradient_dict = {}
        self.bias_gradient_dict = {}
        self.curr_input = None
        self.curr_output = None
        self.weights_dict = {}
        self.preactivation_layer_dict = {}
        self.activation_layer_dict = {}
        self.bias_dict = {}
        self.setupWeightsBiases()
        self.output = None
        self.output_diff = None
        self.num_output_classes = num_output_classes

    def predictClass(self):
        """Return the index of the largest output of the latest forward pass."""
        return np.argmax(self.activation_layer_dict[self.num_layers - 1])

    def forwardPropogation(self, input):
        """Run one forward pass for a single column-vector sample.

        For every layer k >= 1 this caches ``a[k] = W[k] @ h[k-1] + b[k]`` in
        ``preactivation_layer_dict`` and ``h[k] = activation(a[k])`` in
        ``activation_layer_dict``; ``h[0]`` is the input itself.
        """
        self.activation_layer_dict[0] = input

        for i in range(1, self.num_layers):
            self.preactivation_layer_dict[i] = (
                np.matmul(self.weights_dict[i], self.activation_layer_dict[i - 1])
                + self.bias_dict[i]
            )
            self.activation_layer_dict[i] = self.activationFunction(self.preactivation_layer_dict[i])
        # The output layer goes through the dedicated output function (h[L] -> y').
        self.activation_layer_dict[self.num_layers - 1] = self.outputFunction()

    def findGradients(self, index):
        """Backpropagate the error of training sample ``index``.

        Must be called right after ``forwardPropogation`` for the same sample,
        because it reads the cached activations and pre-activations.
        """
        class_label = self.train_output[index]
        output_one_hot_vector = np.zeros((self.num_output_classes, 1))
        output_one_hot_vector[class_label] = 1
        output = self.activation_layer_dict[self.num_layers - 1]
        # Gradient of the cross-entropy loss w.r.t. the output pre-activation.
        self.preactivation_layer_gradient_dict[self.num_layers - 1] = output - output_one_hot_vector

        for layer in reversed(range(1, self.num_layers)):
            self.weights_gradient_dict[layer] = np.matmul(
                self.preactivation_layer_gradient_dict[layer],
                self.activation_layer_dict[layer - 1].T)

            self.bias_gradient_dict[layer] = self.preactivation_layer_gradient_dict[layer]

            self.activation_layer_gradient_dict[layer - 1] = np.matmul(
                self.weights_dict[layer].T,
                self.preactivation_layer_gradient_dict[layer])

            # Layer 0 is the input and has no pre-activation to differentiate.
            if layer != 1:
                self.preactivation_layer_gradient_dict[layer - 1] = np.multiply(
                    self.activation_layer_gradient_dict[layer - 1],
                    self.outputFunctionDiff(layer - 1))

    def activationFunction(self, vec, type='sigmoid'):
        """Apply the element-wise activation (only 'sigmoid' is supported)."""
        if type == 'sigmoid':
            # expit(x) already equals 1 / (1 + exp(-x)). The previous
            # 1 / (1 + expit(-x)) confined every activation to (0.5, 1), so an
            # output could never approach 0.
            return expit(vec)
        else:
            print('Please select correct output function')
            exit()

    def outputFunction(self, type='sigmoid'):
        """Return the output-layer activation of the cached output pre-activation."""
        return self.activationFunction(self.preactivation_layer_dict[self.num_layers - 1], type)

    def outputFunctionDiff(self, layer, type='sigmoid'):
        """Return the activation derivative at the cached pre-activation of ``layer``."""
        if type == 'sigmoid':
            sig = self.activationFunction(self.preactivation_layer_dict[layer])
            return np.multiply(sig, 1 - sig)
        else:
            print('Please select correct output function')
            exit()

    def updateWeightsAndBiases(self, learning_rate):
        """Take one gradient-descent step on the weights and biases.

        Only trainable parameters are updated. The cached activations and
        pre-activations are results of the forward pass, not parameters, and
        are recomputed on the next ``forwardPropogation`` call.
        """
        for layer in range(1, self.num_layers):
            self.weights_dict[layer] = self.weights_dict[layer] - learning_rate * self.weights_gradient_dict[layer]
            self.bias_dict[layer] = self.bias_dict[layer] - learning_rate * self.bias_gradient_dict[layer]

    def getLoss(self, index):
        """Return the cross-entropy (in bits) of the latest output for sample ``index``."""
        # Cross-entropy is the negative log-probability of the true class.
        true_class_prob = self.activation_layer_dict[self.num_layers - 1][self.train_output[index], 0]
        return -np.log2(true_class_prob)

    def train(self, learning_rate, num_epochs):
        """Train with stochastic gradient descent, one shuffled pass per epoch.

        After each epoch the loss of the last visited sample is printed.
        """
        for curr_epoch in range(num_epochs):
            print('Evaluating at ' + str(curr_epoch))
            index_array = list(np.arange(0, self.train_input.shape[1]))
            np.random.shuffle(index_array)
            for train_data_index in index_array:
                # Keep the sample two-dimensional: shape (input_layer_size, 1).
                sample = self.train_input[:, [train_data_index]]
                self.forwardPropogation(sample)
                self.findGradients(train_data_index)
                self.updateWeightsAndBiases(learning_rate)
            print('Loss ' + str(self.getLoss(train_data_index)))

    def test(self, test_data):
        """Return the predicted class of every column of ``test_data``.

        ``test_data`` is a 2-D array of shape ``(input_layer_size, N)`` with
        one sample per column.
        """
        test_class_list = []
        for index in range(test_data.shape[1]):
            self.forwardPropogation(test_data[:, [index]])
            test_class_list.append(self.predictClass())
        return test_class_list


# Build the XOR network. This driver must live at module level: inside the
# class body the name `Backpropogation` is not bound yet.
# Call b.train(learning_rate, num_epochs) to train it.
train_data = (np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), np.array([0, 1, 1, 0]))

b = Backpropogation((2, 2), train_data, 2)

Solution

The following code (check this for implementation and this for the theory) implements a neural net with backpropagation from scratch, using a single output unit with sigmoid activation (otherwise it looks similar to your implementation).

Using this, the XOR function can be learnt with an appropriate learning rate and number of epochs (although training can sometimes get stuck at a local minimum; you can consider implementing regularizers such as drop-out). You can also convert it to a 2-output (softmax?) version like yours. Can you figure out any issue in your implementation? For example, you can look at the following pointers:

batch updates of the parameters during backpropagation instead of stochastic updates
running for enough epochs
changing the learning rate
using ReLU activation instead of sigmoid for the hidden layers (to cope with the vanishing gradient problem), etc.

from sklearn.metrics import accuracy_score, mean_squared_error

class FFSNNetwork:
    """Feed-forward sigmoid network with a single output unit.

    Samples are row vectors. ``W[k]`` has shape ``(sizes[k-1], sizes[k])`` and
    ``B[k]`` has shape ``(1, sizes[k])`` for k = 1..nh+1. Training uses
    full-batch gradient descent with the gradient averaged over the samples.
    """

    def __init__(self, n_inputs, hidden_sizes=(2,)):
        """Create the network.

        Args:
            n_inputs: number of input features.
            hidden_sizes: sequence with the number of neurons of each hidden
                layer (any list or tuple).
        """
        self.nx = n_inputs
        self.ny = 1  # number of neurons in the output layer
        # Copy into a list so tuples work and the caller's object is never shared.
        hidden_sizes = list(hidden_sizes)
        self.nh = len(hidden_sizes)
        self.sizes = [self.nx] + hidden_sizes + [self.ny]

        self.W = {}
        self.B = {}
        for i in range(self.nh + 1):
            self.W[i + 1] = np.random.rand(self.sizes[i], self.sizes[i + 1])
            self.B[i + 1] = np.random.rand(1, self.sizes[i + 1])

    def sigmoid(self, x):
        """Return the element-wise logistic sigmoid of ``x``."""
        return 1.0 / (1.0 + np.exp(-x))

    def forward_pass(self, x):
        """Propagate one sample and return the output activation, shape (1, 1).

        Pre-activations are cached in ``self.A`` and activations in ``self.H``
        (``H[0]`` is the input reshaped to a row vector).
        """
        self.A = {}
        self.H = {}
        self.H[0] = x.reshape(1, -1)
        for i in range(self.nh + 1):
            self.A[i + 1] = np.matmul(self.H[i], self.W[i + 1]) + self.B[i + 1]
            self.H[i + 1] = self.sigmoid(self.A[i + 1])
        return self.H[self.nh + 1]

    def grad_sigmoid(self, x):
        """Return the sigmoid derivative given the sigmoid *output* ``x``."""
        return x * (1 - x)

    def grad(self, x, y):
        """Compute the gradients for one sample into ``self.dW`` / ``self.dB``."""
        self.forward_pass(x)
        self.dW = {}
        self.dB = {}
        self.dH = {}
        self.dA = {}
        L = self.nh + 1
        # Gradient of the cross-entropy loss w.r.t. the output pre-activation.
        self.dA[L] = (self.H[L] - y)
        for k in range(L, 0, -1):
            self.dW[k] = np.matmul(self.H[k - 1].T, self.dA[k])
            self.dB[k] = self.dA[k]
            self.dH[k - 1] = np.matmul(self.dA[k], self.W[k].T)
            self.dA[k - 1] = np.multiply(self.dH[k - 1], self.grad_sigmoid(self.H[k - 1]))

    def fit(self, X, Y, epochs=1, learning_rate=1, initialize=True, verbose=True):
        """Train with full-batch gradient descent.

        Args:
            X: training samples, one per first-axis entry.
            Y: target value (0 or 1) for each sample.
            epochs: number of passes over the whole training set.
            learning_rate: step size of each update.
            initialize: when true, re-draw the weights from a standard normal
                distribution and zero the biases before training.
            verbose: when true, print the mean squared error after each epoch.

        Raises:
            ValueError: if ``X`` contains no samples.
        """
        # The summed gradient is averaged over the number of samples. The
        # previous code used X.shape[1], which is the number of features.
        m = len(X)
        if m == 0:
            raise ValueError('X must contain at least one training sample')

        if initialize:
            for i in range(self.nh + 1):
                self.W[i + 1] = np.random.randn(self.sizes[i], self.sizes[i + 1])
                self.B[i + 1] = np.zeros((1, self.sizes[i + 1]))

        for e in range(epochs):
            # Accumulate the per-sample gradients over the whole batch.
            dW = {}
            dB = {}
            for i in range(self.nh + 1):
                dW[i + 1] = np.zeros((self.sizes[i], self.sizes[i + 1]))
                dB[i + 1] = np.zeros((1, self.sizes[i + 1]))
            for x, y in zip(X, Y):
                self.grad(x, y)
                for i in range(self.nh + 1):
                    dW[i + 1] += self.dW[i + 1]
                    dB[i + 1] += self.dB[i + 1]

            for i in range(self.nh + 1):
                self.W[i + 1] -= learning_rate * dW[i + 1] / m
                self.B[i + 1] -= learning_rate * dB[i + 1] / m

            if verbose:
                Y_pred = self.predict(X)
                # scikit-learn's argument order is (y_true, y_pred).
                print('loss at epoch {} = {}'.format(e, mean_squared_error(Y, Y_pred)))

    def predict(self, X):
        """Return the predicted P(y = 1) for every sample in ``X``, squeezed."""
        Y_pred = [self.forward_pass(x) for x in X]
        return np.array(Y_pred).squeeze()

Now, train the network:

# Build a network with two hidden layers of two neurons each.
ffsnn = FFSNNetwork(2, [2, 2])
# XOR truth table: the four input pairs and their labels.
X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_train = np.array([0, 1, 1, 0])
ffsnn.fit(X_train, y_train, epochs=5000, learning_rate=0.15)

Next, predict with the network:

# Predicted P(y = 1) for every training sample.
y_pred_prob = ffsnn.predict(X_train)
# A sample is labelled 1 when its probability reaches the 0.5 threshold.
above_threshold = y_pred_prob >= 0.5
y_pred = above_threshold.astype("int").ravel()

# Inspect inputs, targets and predictions (example outputs shown as comments).
X_train
# array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_train
# array([0, 1, 1, 0])
y_pred_prob
# array([0.00803102, 0.99439243, 0.99097831, 0.00664639])
y_pred
# array([0, 1, 1, 0])
accuracy_score(y_train, y_pred)
# 1.0

Note that here the MSE between the true and predicted y values is used to plot the loss function; you can plot the BCE (binary cross-entropy) loss too.

Finally, the following animations show how the loss function is minimized and also how the decision boundary is learnt:

Note that in the above animation the green and red points represent the positive (label 1) and negative (label 0) training data points, respectively. Notice how they are separated by the decision boundaries during the final training epochs (darker region for the negative and lighter region for the positive data points, corresponding to XOR).

You could implement the same with high level deep learning libraries such as keras with a few lines of code:

import tensorflow as tf
from tensorflow import keras

# Functional-API model: 2 inputs -> Dense(4, relu) -> Dense(4, relu) -> 1 sigmoid unit.
# Dense is reached through `keras.layers` because this snippet only imports
# `tf` and `keras`; a bare `layers` name would raise NameError.
inputs = keras.Input(shape=(2,), name="in")
x = keras.layers.Dense(4, activation="relu", name="dense_1")(inputs)
x = keras.layers.Dense(4, activation="relu", name="dense_2")(x)
outputs = keras.layers.Dense(1, activation="sigmoid", name="out")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
# XOR truth table: the four input pairs and their labels.
X_train, y_train = np.array([[0, 0], [0, 1], [1,0], [1, 1]]), np.array([0, 1, 1, 0])
model.compile(
    optimizer=keras.optimizers.Adam(),  # Optimizer
    # Loss function to minimize
    loss=tf.keras.losses.BinaryCrossentropy(),
    # List of metrics to monitor
    metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
)

print("Fit model on training data")
# batch_size=4 puts the whole XOR set into a single batch per epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=4,
    epochs=1000)
# ...
# Epoch 371/1000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.5178 - accuracy: 0.7500
# Epoch 372/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5169 - accuracy: 0.7500
# Epoch 373/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5160 - accuracy: 1.0000
# Epoch 374/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5150 - accuracy: 1.0000
# ...

print("Evaluate")
results = model.evaluate(X_train, y_train, batch_size=4)
print("loss, acc:", results)
# loss, acc: [0.1260240525007248, 1.0]

The following figure shows the loss / accuracy during training epochs.

Finally, with keras and softmax (instead of sigmoid):

from keras.utils import to_categorical
# XOR truth table; the labels are one-hot encoded for the 2-unit softmax output.
X_train, y_train = np.array([[0, 0], [0, 1], [1,0], [1, 1]]), np.array([0, 1, 1, 0])
y_train = to_categorical(y_train, num_classes=2)
# Dense is reached through `keras.layers` because no bare `layers` name is
# imported in this snippet; using it directly would raise NameError.
inputs = keras.Input(shape=(2,), name="in")
x = keras.layers.Dense(4, activation="relu", name="dense_1")(inputs)
x = keras.layers.Dense(4, activation="relu", name="dense_2")(x)
outputs = keras.layers.Dense(2, activation="softmax", name="out")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='rmsprop', 
    loss='categorical_crossentropy',
    metrics=['acc']
)
print("Fit model on training data")
# batch_size=4 puts the whole XOR set into a single batch per epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=4,
    epochs=2000)
# Epoch 663/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3893 - acc: 0.7500
# Epoch 664/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3888 - acc: 1.0000
# Epoch 665/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3878 - acc: 1.0000
print("Evaluate")
results = model.evaluate(X_train, y_train, batch_size=4)
print("loss, acc:", results)
# loss, acc: [0.014970880933105946, 1.0]

with the following loss / accuracy convergence: