Search code examples

XOR classification using multilayer perceptron

I want to implement a multi-layer perceptron.
I found some code on GitHub that classifies MNIST quite well (96%). However, for some reason, it does not cope with the XOR task.
I want to understand why.
Here is the code:

import random
import numpy as np

class Perceptron:
    """Fully connected feed-forward network trained with mini-batch gradient descent.

    Every layer ``k`` computes ``activation_functions[k](np.dot(weights[k], a) + biases[k])``.
    Inputs and activations are treated as column vectors of shape ``(n, 1)``.
    """

    def __init__(self, *, layer_sizes, activation_functions, cost_function_deriv):
        """Create the network with weights and biases drawn from ``np.random.randn``.

        Args:
            layer_sizes: Number of units per layer, input layer first.
            activation_functions: One callable per non-input layer. Each object
                must also expose ``deriv(a)``, which receives the layer's
                *output* ``a`` (not the pre-activation value).
            cost_function_deriv: Callable ``(output_activation, y)`` returning
                the derivative of the cost with respect to the output activation.

        Raises:
            ValueError: If the number of activation functions does not equal
                ``len(layer_sizes) - 1``.
        """
        self.layer_sizes = layer_sizes
        if len(self.layer_sizes) - 1 != len(activation_functions):
            raise ValueError("...")
        self.activation_functions = activation_functions
        self.cost_function_deriv = cost_function_deriv
        self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]

    def train(self, training_data, test_data, epochs, mini_batch_size, lr):
        """Run ``epochs`` passes of mini-batch gradient descent.

        Args:
            training_data: Sequence of ``(x, y)`` pairs.
            test_data: Sequence of ``(x, label)`` pairs scored after each epoch.
            epochs: Number of passes over ``training_data``.
            mini_batch_size: Number of samples per parameter update.
            lr: Learning rate; each step uses the batch-averaged gradient.
        """
        test_data_len = len(test_data)
        for epoch in range(epochs):
            mini_batches = [training_data[start: start + mini_batch_size]
                            for start in range(0, len(training_data), mini_batch_size)]
            for mini_batch in mini_batches:
                mb_len = len(mini_batch)
                gradient_weights = [np.zeros(w.shape) for w in self.weights]
                gradient_biases = [np.zeros(b.shape) for b in self.biases]
                # Sum the per-sample gradients over the mini-batch.
                for x, y in mini_batch:
                    delta_gradient_biases, delta_gradient_weights = self.backpropagation(np.array(x), y)
                    gradient_weights = [grad + delta for grad, delta in zip(gradient_weights, delta_gradient_weights)]
                    gradient_biases = [grad + delta for grad, delta in zip(gradient_biases, delta_gradient_biases)]
                # Step against the averaged gradient.
                step = lr / mb_len
                self.weights = [w - step * grad for w, grad in zip(self.weights, gradient_weights)]
                self.biases = [b - step * grad for b, grad in zip(self.biases, gradient_biases)]
            correct_answers = self.how_many_correct_answers(test_data)
            print(f"Epoch number {epoch}: {correct_answers}/{test_data_len} correct answers")

    def backpropagation(self, x, y):
        """Return ``(gradient_b, gradient_w)`` of the cost for one sample.

        Args:
            x: Input column vector (anything ``np.asarray`` accepts).
            y: Target, forwarded unchanged to ``cost_function_deriv``.

        Returns:
            Two lists shaped like ``self.biases`` and ``self.weights``.
        """
        x = np.asarray(x)  # ``.T`` is needed on the input below
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]

        # Forward pass: keep every layer's output, starting with the input itself.
        activations = [x]
        prev_activation = x
        for activation, b, w in zip(self.activation_functions, self.biases, self.weights):
            current_activation = activation(np.dot(w, prev_activation) + b)
            activations.append(current_activation)
            prev_activation = current_activation

        # Output layer error.
        delta = self.cost_function_deriv(activations[-1], y) * self.activation_functions[-1].deriv(activations[-1])
        gradient_b[-1] = delta
        gradient_w[-1] = np.dot(delta, activations[-2].T)

        # Backward pass over the hidden layers, counted from the end.
        for i in range(2, len(self.layer_sizes)):
            layer_output = activations[-i]
            # activations[-i] was produced by activation_functions[-i], so that
            # is the derivative needed here (not the following layer's).
            act_der = self.activation_functions[-i].deriv(layer_output)
            delta = np.dot(self.weights[-i + 1].T, delta) * act_der
            gradient_b[-i] = delta
            gradient_w[-i] = np.dot(delta, activations[-i - 1].T)
        return gradient_b, gradient_w

    def feedforward(self, a):
        """Propagate input ``a`` through every layer and return the output activation."""
        for activation, b, w in zip(self.activation_functions, self.biases, self.weights):
            a = activation(np.dot(w, a) + b)
        return a

    def how_many_correct_answers(self, test_data):
        """Count samples whose arg-max output equals the label.

        Prints ``y_predict y`` for every sample as a side effect.
        """
        k = 0
        for x, y in test_data:
            y_predict = np.argmax(self.feedforward(x))
            print(y_predict, y)
            k += int(y_predict == y)
        return k

from copy import deepcopy
import numpy as np
from perceptron import Perceptron

class Sigmoid:
    """Logistic activation bundled with its derivative.

    Note that ``deriv`` takes the sigmoid's *output*, not the value that was
    fed into the sigmoid.
    """
    # Presumably the range of values the activation can produce -- confirm with users of this attribute.
    out_min_max = [0, 1]

    def __call__(self, x):
        """Return ``1 / (1 + exp(-x))`` for ``x``."""
        denominator = 1. + np.exp(-x)
        return 1. / denominator

    def deriv(self, y):
        """Return ``y * (1 - y)``, where ``y`` is an already-computed sigmoid output."""
        complement = 1. - y
        return y * complement

def cost_function_derivative(y_predict, y_true_label):
    """Return ``y_predict`` minus the one-hot encoding of ``y_true_label``.

    The one-hot target has the same shape as ``y_predict`` and holds ``1.0``
    at index ``y_true_label`` (along the first axis) and zeros elsewhere.
    """
    one_hot_target = np.zeros(y_predict.shape)
    one_hot_target[y_true_label] = 1.0
    difference = y_predict - one_hot_target
    return difference

def main():
    training_data = np.asarray([[[[0], [0]], 0],
                                [[[0], [1]], 1],
                                [[[1], [0]], 1],
                                [[[1], [1]], 0]])
    layer_sizes = [2, 8, 2]
    model = Perceptron(layer_sizes=layer_sizes,
                       activation_functions=[Sigmoid(), Sigmoid()],

if __name__ == '__main__':

The final output in format 'y_predict y_true' (after each epoch):
0 0
0 1
0 1
0 0

If I remove random.shuffle(training_data), the output becomes:
1 0
0 1
1 1
0 0
But the predictions are still not the expected 0 1 1 0.


  • I figured it out. It requires the following.

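    # random.shuffle(training_data) -- comment this line out
    # (the standard-library random.shuffle swaps elements in place; on a multi-dimensional NumPy array the rows are views, so the swap can duplicate samples instead of reordering them -- np.random.shuffle is the NumPy-aware alternative)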

    And it's better to do this:


    The result in most cases is obtained after ~1000 epochs:
    0 0
    1 1
    1 1
    0 0