Tags: numpy, neural-network, gradient-descent, backpropagation, mlp

MLP outputting average of all training output for any input


I have tried to implement a multi-layer perceptron with sigmoid activations. Below is the code:

import numpy as np

def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))
def sigmoid_derivative(x):
    return sigmoid(x) * (1.0 - sigmoid(x))

class MLP:
    def __init__(self, layers, x_train, y_train):
        self.layers = layers
        self.inputs = x_train
        self.outputs = y_train

    def forward(self, input):
        output = input
        for layer in self.layers:
            layer.activations = output
            output = layer.feedforward(output)
        return output

    def backward(self, output, predicted):
        error = np.multiply(2 * np.subtract(output, predicted), sigmoid_derivative(predicted))
        for layer in self.layers[::-1]:
            #recursively backpropagate the error
            error = layer.backpropagate(error)
    def train(self):
        for i in range(1,500):
            predicted = self.forward(self.inputs)
            self.backward(self.outputs,predicted)
    def test(self, input):
        return self.forward(input)



class Layer:
    def __init__(self, prevNodes, selfNodes):
        self.weights = np.random.rand(prevNodes,selfNodes)
        self.biases = np.zeros(selfNodes)
        self.activations = np.array([])

    def feedforward(self, input):
        return sigmoid(np.dot(input, self.weights) + self.biases)

    def backpropagate(self, error):
        delPropagate = np.dot(error, self.weights.transpose())

        dw = np.dot(self.activations.transpose(), error)
        db = error.mean(axis=0) * self.activations.shape[0]
        self.weights = self.weights + 0.1 * dw
        self.biases = self.biases + 0.1 * db
        return np.multiply(delPropagate ,sigmoid_derivative(self.activations))

layer1 = Layer(3,4)
layer2 = Layer(4,1)

x_train = np.array([[0,0,1],[0,1,1],[1,0,1],[1,1,1]])
y_train = np.array([[0],[1],[1],[0]])
x_test = np.array([[0,0,1]])
mlp = MLP([layer1,layer2], x_train, y_train)
mlp.train()
mlp.test(x_test)

However, the problem is that the network saturates and outputs the average of all training outputs for any input. For example, in the above case the y_train average is about 0.5, and no matter what x_test value I feed to the network, the output is always around the 0.5 mark.

Where could the problem be in the code? Am I missing something in the algorithm? Help is appreciated.


Solution

  • The issue seems to be too few iterations: increasing the iteration count from 500 to 50,000 works, or raising the learning rate from 0.1 to 0.5 also works with fewer iterations. The matrix manipulations and the mathematics are otherwise consistent.
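
As a quick check, here is a minimal sketch of the first fix (simply training much longer), assuming the MLP and Layer classes from the question are already defined. Running the forward/backward steps in an external loop is equivalent to a longer train(); the np.random.seed call is only an illustrative addition for reproducible weight initialization.

import numpy as np

np.random.seed(0)  # optional: reproducible weight initialization

layer1 = Layer(3, 4)
layer2 = Layer(4, 1)

x_train = np.array([[0,0,1],[0,1,1],[1,0,1],[1,1,1]])
y_train = np.array([[0],[1],[1],[0]])

mlp = MLP([layer1, layer2], x_train, y_train)

# Roughly 100x the original 500 iterations; forward/backward are the same
# methods that train() calls internally, so this matches the longer training
# the answer suggests.
for _ in range(50000):
    predicted = mlp.forward(x_train)
    mlp.backward(y_train, predicted)

print(mlp.test(np.array([[0,0,1]])))  # should now be much closer to 0 than to 0.5
print(mlp.test(np.array([[0,1,1]])))  # should now be much closer to 1 than to 0.5

Alternatively, keeping the 500 iterations and raising the hard-coded 0.1 step size in Layer.backpropagate to 0.5 should have a similar effect: the targets simply need either more updates or larger steps to move the outputs away from the 0.5 average.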