Search code examples
pythonneural-networkadditionbackpropagation

Adding numbers using Neural network + back propagation


I am trying to use neural networks to add numbers together, but I can't seem to get the back propagation function working.

enter image description here

This is what the neural network looks like, where W1 = x1, W2 = x2, W3 = y1, W4 = y2, W5 = z1 and W6 = z2. This is my code so far:

from random import randint,random ,uniform
import numpy as np 

class Data:
    """Training-data container mapping (num1, num2) pairs to their sum.

    Because keys are random pairs, duplicate draws collapse onto the same
    key, so the final dictionary may hold fewer than ``limit`` entries.
    """

    def __init__(self, limit):
        self.limit = limit
        # Per-instance dict: the original used a class-level attribute,
        # which was silently shared (and accumulated) across all instances.
        self.data_dict = {}

    '''creates data but beware that the limit may not be the same as the size of the dictionary'''
    def create_data(self):
        for _ in range(self.limit):
            num1 = randint(0, 100)
            num2 = randint(0, 100)
            self.data_dict[(num1, num2)] = num1 + num2

''' you compare the error with every test in the data set and find weights that minimise the error'''
class Neural:
    def __init__(self,data):
        self.x1 = uniform(-1,1) 
        self.x2 = uniform(-1,1) 
        self.y1 = uniform(-1,1) 
        self.y2 = uniform(-1,1) 
        self.z1 = uniform(-1,1) 
        self.z2 = uniform(-1,1) 
        self.data=data
    
    def relu(self,number):
        return max(0,number)
    
    def sigmoid(self,number):
         return 1/(1 + np.exp(-number))
        
    '''weighted summation with activation function to compute output '''
    def compute_output(self,num1,num2):
        hidden_layer_input1 = self.sigmoid((num1 * self.x1) + (num2 * self.y1))
        hidden_layer_input2 = self.sigmoid((num1 * self.x2) + (num2 * self.y2))
        return ((hidden_layer_input1*self.z1) + (hidden_layer_input2 * self.z2))
            
    '''mean swaured error error between the actual output with the output generated by the algorithm '''
    def compare_ouput(self,data):
        '''actually,better to find error between all tests. add all the errors up'''
        error = 0
        for key in data.data_dict:
            error += abs(data.data_dict[key] - self.compute_output(key[0],key[1])) **2
        return error/len(data.data_dict)
       # return abs(actual - self.compute_output(num1,num2))

    '''TODO function that changes the weight depending on the errors using gradient descent'''
    '''first make it random'''
    '''next perhaps change weights for each test and average out the adjustments for each weight'''
    def random_back_propagation(self):
        error = 100000
        while error>0.1:
            self.x1 = random() 
            self.x2 = random()
            self.y1 = random()
            self.y2 = random()
            self.z1 = random()
            self.z2 = random()
            error = self.compare_ouput(self.data)
            print(error)
        print(self.compute_output(140,15))     
        
        '''learning rate is the amount the weights are updated during training'''
    def back_propagation(self, learning_rate):
        
       for _ in range(1000):
            for key in self.data.data_dict:
            
                num1, num2 = key
                target = self.data.data_dict[key]
                
                hidden_layer1_output = self.sigmoid((num1 * self.x1) + (num2 * self.y1))
                hidden_layer2_output = self.sigmoid((num1 * self.x2) + (num2 * self.y2))
                
                output = ((hidden_layer1_output * self.z1) + (hidden_layer2_output * self.z2))
                
                error = target - output
                #check if you are happpy with the error    
                
                output_unit1  = output * (1 - output) * (error)
                hidden_unit1 = hidden_layer1_output * (1 - hidden_layer1_output) * self.z1 * output_unit1
                hidden_unit2 = hidden_layer2_output * (1 - hidden_layer2_output) * self.z2 * output_unit1
                
                self.z1 += (learning_rate * output * hidden_layer1_output)
                self.z2 += (learning_rate * output * hidden_layer2_output) 
                
                self.x2 += (learning_rate * hidden_unit2 * num1)
                self.x1 += (learning_rate * hidden_unit1 * num1)
                self.y1 += (learning_rate * hidden_unit1 * num2) 
                self.y2 += (learning_rate * hidden_unit2 * num2)
                               
                print(self.x1,self.x2,self.y1,self.y2,self.z1,self.z2)
            print(num1,num2,self.compute_output(num1,num2))        
   
# Build a training set of (pair -> sum) examples and train the network.
data = Data(200)
data.create_data()

neural = Neural(data)
neural.back_propagation(0.01)

# Quick sanity check on a pair the network may not have seen.
print(neural.compute_output(15,7))

I tried changing the learning rate, the number of iterations, and the number of items in the data set, but I'm not sure if the problem is with finding the correct values to use or if my function is just incorrect.


Solution

  • There are some problems with your current approach. I'll try to elaborate in the following sections.

    Loss function

    The loss function you are currently using is error = target - output. At first glance it seems to work, because if target and output are equal, error will be 0. But remember that the network will try to minimize the loss function, and in this case one of the ways to do it is by generating an extremely large output, so that error will be negative.

    I suggest using something like an MSE (Mean Squared Error): error = (target - output) ** 2. That way, if the network wants to minimize it, it will have no choice but to try to make target and output equal.

    Gradient updates

    Currently you are updating your parameters with the positive gradient self.z1 += (learning_rate * gradient). The gradient points to the direction of greatest increase of the loss function. We don't want to increase the loss. We want to go in the direction of greatest decrease, so we use the negative gradient self.z1 -= (learning_rate * gradient).

    The sigmoid function

    You are using a sigmoid as the activation function for some of your hidden units. Here's the sigmoid graph: The sigmoid function

    Notice that when X > 6 or X < -6, the sigmoid basically flatlines. This flatline translates into an extremely small gradient, practically 0, and that means the parameters update extremely slowly or not at all. The network is saturated.

    The data that you are feeding to the network consists of numbers that are larger than what neural networks normally work with. Consider that one of your hidden units had its weights randomly initialized to 0.5 and 0.3. Now you feed the network with the numbers you wish to sum: 50 and 20. The computation at the hidden unit will be sigmoid(50 * 0.5 + 20 * 0.3) which is sigmoid(31), and its derivative is effectively 0.

    If you still want to use the sigmoid, I recommend that you preprocess your input data so that the network won't receive such large numbers. For example, you can divide every input by 100, as the largest possible integer in the training data is 100.

    Putting it all together

    Below is the code with the above mentioned problems fixed. As it uses a different loss function, all of the gradients are different (feel free to double check as I'm not that confident in calculus). I also added a print to keep track of the loss across epochs.

    from random import randint, random, uniform
    import numpy as np
    
    
    class Data:
        """Training-data container mapping (num1, num2) pairs to their sum.

        Because keys are random pairs, duplicate draws collapse onto the
        same key, so the final dictionary may hold fewer than ``limit``
        entries.
        """

        def __init__(self, limit):
            self.limit = limit
            # Per-instance dict: the original used a class-level attribute,
            # which was silently shared across all Data instances.
            self.data_dict = {}

        def create_data(self):
            """Populate data_dict with up to ``limit`` random addition examples."""
            for _ in range(self.limit):
                num1 = randint(0, 100)
                num2 = randint(0, 100)
                self.data_dict[(num1, num2)] = num1 + num2
    
    
    """ you compare the error with every test in the data set and find weights that minimise the error"""
    
    
    class Neural:
        """A 2-2-1 feed-forward network trained to approximate addition.

        Inputs and targets are rescaled by 1/100 so the sigmoid hidden
        units stay out of their saturated (flat-gradient) region.
        """

        def __init__(self, data):
            # Weights start uniformly random in [-1, 1].
            self.x1, self.x2 = uniform(-1, 1), uniform(-1, 1)
            self.y1, self.y2 = uniform(-1, 1), uniform(-1, 1)
            self.z1, self.z2 = uniform(-1, 1), uniform(-1, 1)
            self.data = data

        def relu(self, number):
            """Rectified linear unit (kept for experimentation; unused here)."""
            return number if number > 0 else 0

        def sigmoid(self, number):
            """Logistic activation squashing any real number into (0, 1)."""
            return 1 / (1 + np.exp(-number))

        def compute_output(self, num1, num2):
            """Forward pass: scale inputs down, run the hidden layer, scale back up."""
            a, b = num1 / 100, num2 / 100
            h1 = self.sigmoid(a * self.x1 + b * self.y1)
            h2 = self.sigmoid(a * self.x2 + b * self.y2)
            return 100 * ((h1 * self.z1) + (h2 * self.z2))

        def compare_ouput(self, data):
            """Mean squared error of the network over every example in *data*.

            (Name kept as-is — "ouput" — for backward compatibility.)
            """
            total = 0
            for (a, b), expected in data.data_dict.items():
                total += abs(expected - self.compute_output(a, b)) ** 2
            return total / len(data.data_dict)

        def random_back_propagation(self):
            """Pure random search: re-roll all weights until the MSE < 0.1."""
            error = 100000
            while error > 0.1:
                self.x1, self.x2 = random(), random()
                self.y1, self.y2 = random(), random()
                self.z1, self.z2 = random(), random()
                error = self.compare_ouput(self.data)
                print(error)
            print(self.compute_output(140, 15))

        def back_propagation(self, learning_rate):
            """Per-example gradient descent on the squared error.

            learning_rate scales every weight-update step; 1000 epochs are run
            over the whole data set, printing the mean loss per epoch.
            """
            for epoch in range(1000):
                errors = []
                for (raw1, raw2), raw_target in self.data.data_dict.items():

                    # Rescale everything into roughly [0, 1] so the sigmoids
                    # stay away from their flat, zero-gradient tails.
                    num1, num2 = raw1 / 100, raw2 / 100
                    target = raw_target / 100

                    h1 = self.sigmoid((num1 * self.x1) + (num2 * self.y1))
                    h2 = self.sigmoid((num1 * self.x2) + (num2 * self.y2))
                    output = (h1 * self.z1) + (h2 * self.z2)

                    # Squared-error loss for this example.
                    error = (target - output) ** 2

                    # Chain rule for the input-to-hidden weights, e.g. for x1:
                    # d(err)/d(x1) = -2(target - output) * z1 * h1(1 - h1) * num1
                    # (sigmoid derivative is h * (1 - h)); subtract to descend.
                    self.x1 -= learning_rate * -2 * (target - output) * self.z1 * h1 * (1 - h1) * num1
                    self.y1 -= learning_rate * -2 * (target - output) * self.z1 * h1 * (1 - h1) * num2
                    self.x2 -= learning_rate * -2 * (target - output) * self.z2 * h2 * (1 - h2) * num1
                    self.y2 -= learning_rate * -2 * (target - output) * self.z2 * h2 * (1 - h2) * num2

                    # Hidden-to-output weights:
                    # d(err)/d(z1) = -2(target - output) * h1
                    self.z1 -= learning_rate * -2 * (target - output) * h1
                    self.z2 -= learning_rate * -2 * (target - output) * h2

                    errors.append(error)
                print(f"Mean error: {np.mean(errors)}")
    
    
    # Generate the training examples and fit the network.
    training_data = Data(2000)
    training_data.create_data()

    net = Neural(training_data)
    net.back_propagation(0.1)

    # Show a few predictions after training.
    print("#################################PREDICTIONS############################################")
    print(f"15 + 7 = {net.compute_output(15, 7)}")
    print(f"3 + 2 = {net.compute_output(3, 2)}")
    print(f"50 + 70 = {net.compute_output(50, 70)}")