Tags: python, numpy, machine-learning, deep-learning, neural-network

Why is my binary classification network not converging?


I have created a binary classification neural network from scratch, using ReLU for the hidden layers, sigmoid for the final layer, and the binary cross-entropy loss function; I also use minibatch gradient descent. I'm struggling to understand why my network converges completely fine on smaller data sets, but just hovers around 0.7 loss on larger data sets. My code may be really poor, so if anyone can tell me where my algorithm is going wrong I'd be really grateful. Here's the code:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# saves csv file to variable df as our data frame
df = pd.read_csv('housepricedata.csv')
#print(df)
# converts our data frame into a set of arrays
dataset = df.values
print(dataset)
# first 10 columns are the inputs and last column is our output
X = np.array(dataset[:,0:10])
Y = np.array(dataset[:,10])
print(np.shape(X))
# normalise our inputs between 0 and 1
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)
#print(X_scale)
# SPLIT OUR DATA SET INTO TRAINING (70%), VALIDATION (15%), TESTING (15%).
X_train = X[:1024, :]
Y_train = Y[:1024]
X_val_and_test = X[1024:, :]
Y_val_and_test = Y[1024:]
#X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X_scale, Y, test_size=0.3)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)


class network:
    def __init__(self, X, Y):
        self.w1, self.b1 = self.generate_w_b(10,32)
        self.w2, self.b2 = self.generate_w_b(32,32)
        self.w3, self.b3 = self.generate_w_b(32,1)

    def generate_w_b(self, n_inputs, n_outup_neurons):
        weights = 0.01*np.random.randn(n_inputs,n_outup_neurons)
        biases = np.zeros((n_outup_neurons))
        return weights, biases

    def forward(self, inputs):
        self.z1 = np.dot(inputs, self.w1) +self.b1
        #print("L1 z shape: ", np.shape(self.z1))
        self.a1 = self.Relu(self.z1)
        self.z2 = np.dot(self.a1, self.w2) + self.b2
        self.a2 = self.Relu(self.z2)
        self.z3 = np.dot(self.a2, self.w3) + self.b3
        y_pred = self.Sigmoid(self.z3)

        return y_pred

    def mini_batches(self, X, Y, minibatch_size):
        m = Y.shape[0]

        permutation = list(np.random.permutation(m))
        X_shuffled = X[permutation, :]
        # Y_shuffled = Y[permutation,:] would be used for multi-class classification
        Y_shuffled = Y[permutation]
        minibatches = []

        n_minibatches = int(m / minibatch_size)
        for i in range(n_minibatches):
            X_minibatch = X_shuffled[i * minibatch_size:(i + 1) * minibatch_size, :]
            Y_minibatch = Y_shuffled[i * minibatch_size:(i + 1) * minibatch_size]
            minibatch_pair = (X_minibatch, Y_minibatch)
            minibatches.append(minibatch_pair)
        if m % minibatch_size != 0:
            Lat_X_minibatch = X_shuffled[n_minibatches * minibatch_size - 1:m, :]
            Last_Y_minibatch = Y_shuffled[n_minibatches * minibatch_size - 1:m]
            Last_minibatch_pair = (Lat_X_minibatch, Last_Y_minibatch)
            minibatches.append(Last_minibatch_pair)
        return minibatches

    def Relu(self, x):
        return np.maximum(x,0)

    def Sigmoid(self, Z):
        return 1/(1+np.exp(-Z))

    def Cross_Entropy(self, y_pred, Y):
        # clip: any value below 1e-7 becomes 1e-7 and any value above 1-1e-7 becomes 1-1e-7
        # this prevents us from encountering log(0) values, which would break the code
        y_pred = np.clip(y_pred, 1e-7, 1-1e-7)
        term_y1 = Y*np.log(y_pred+1e-7)
        term_y0 = (1-Y)*np.log(1-y_pred+1e-7)
        return -np.mean(term_y1+term_y0, axis=0)

    def Relu_Derivative(self, x):
        return np.greater(x, 0).astype(int)



    def Backpropagation(self, X, Y, y_pred,lr =0.01):
        # this is the error term for the output layer, so this is the chain rule of BCE derivative x Sigmoid derivative
        #print("Y shape: ", np.shape(Y))
        #print("Predicted output shape: ", np.shape(y_pred))
        m = Y.shape[1]
        self.error_term_output = (1/m)*(y_pred-Y)
        #print(np.shape(self.error_term_output))
        self.W3_gradient = self.a2.T.dot(self.error_term_output)

        self.errorh2 = self.error_term_output.dot(self.w3.T)
        self.error_term_hidden2 = (1/m)*(self.errorh2 * self.Relu_Derivative(self.a2))
        self.W2_gradient = self.a1.T.dot(self.error_term_hidden2)

        self.errorh1 = self.error_term_hidden2.dot(self.w2.T)
        self.error_term_hidden1 = (1/m)*(self.errorh1 * self.Relu_Derivative(self.a1))
        self.W1_gradient = X.T.dot(self.error_term_hidden1)


        self.w1 = self.w1 - lr*self.W1_gradient
        self.b1 = self.b1 - lr*self.error_term_hidden1
        self.w2 = self.w2 - lr*self.W2_gradient
        self.b2 = self.b2 - lr*self.error_term_hidden2
        self.w3 = self.w3 - lr*self.W3_gradient
        self.b3 = self.b3 - lr*self.error_term_output


    def Train(self,X, Y, epochs = 10000):
        E=[]
        m = len(X)
        for i in range(epochs):
            e=[]
            minibatches = self.mini_batches(X, Y, 32)
            j=0
            for batch in minibatches:
                j += 1
                #print("Batch: ", j)
                x = np.array(batch[0])
                y = np.reshape(np.array(batch[1]),(32, 1))

                y_pred = self.forward(x)
                #print("Actual Output: /n",y, "/n Predicted Output: /n", y_pred)
                loss = np.mean(self.Cross_Entropy(y_pred, y))
                #print("Loss: /n",loss,"/n")
                e.append(loss)
                self.Backpropagation(x,y,y_pred)
            E.append(np.mean(e))
        return E
print(np.shape(X_train))
NN = network(X_train, Y_train)
Errors = NN.Train(X_train, Y_train)

Solution

  • First, you seem to forget to use the scaled data: X_train = X[:1024, :] should be X_train = X_scale[:1024, :] (and likewise for the validation/test split). Second, the shape of the bias gradient term does not look right; for example, the last-layer bias (b3) should be a scalar, yet your update rule subtracts lr*self.error_term_output, which is lr*(y_pred-Y), a vector. The bias gradients need to be summed over the batch dimension so they match the bias shapes. I have corrected both issues in the modified version below.
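
To see why the shapes matter, here is a tiny self-contained sketch (the arrays below are made up for illustration and are not taken from your data):

    import numpy as np

    # hypothetical per-sample output errors for a mini-batch of 32, shape (32, 1)
    error_term_output = np.random.randn(32, 1)
    b3 = np.zeros((1, 1))   # output-layer bias, shape (1, 1)
    lr = 0.01

    # wrong: subtracting the raw per-sample error broadcasts b3 up to shape (32, 1)
    b3_wrong = b3 - lr * error_term_output
    print(b3_wrong.shape)   # (32, 1) -- the bias has silently changed shape

    # right: reduce the error over the batch axis first, so the gradient matches the bias
    b3_right = b3 - lr * error_term_output.sum(axis=0, keepdims=True)
    print(b3_right.shape)   # (1, 1)

Here is the full corrected version: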

    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    
    # saves csv file to variable df as our data frame
    df = pd.read_csv('housepricedata.csv')
    #print(df)
    # converts our data frame into a set of arrays
    dataset = df.values
    print(dataset)
    # first 10 columns are the inputs and last column is our output
    X = np.array(dataset[:,0:10])
    Y = np.array(dataset[:,10])
    print(np.shape(X))
    # normalise our inputs between 0 and 1
    min_max_scaler = preprocessing.MinMaxScaler()
    X_scale = min_max_scaler.fit_transform(X)
    #print(X_scale)
    # SPLIT OUR DATA SET INTO TRAINING (70%), VALIDATION (15%), TESTING (15%).
    X_train = X_scale[:1024, :]
    Y_train = Y[:1024]
    X_val_and_test = X_scale[1024:, :]
    Y_val_and_test = Y[1024:]
    #X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X_scale, Y, test_size=0.3)
    X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)
    
    
    class network:
        def __init__(self, X, Y):
            self.w1, self.b1 = self.generate_w_b(10,32)
            self.w2, self.b2 = self.generate_w_b(32,32)
            self.w3, self.b3 = self.generate_w_b(32,1)
    
        def generate_w_b(self, n_inputs, n_outup_neurons):
            weights = 0.01*np.random.randn(n_inputs,n_outup_neurons)
            biases = np.zeros((1, n_outup_neurons))
            return weights, biases
    
        def forward(self, inputs):
            self.z1 = np.dot(inputs, self.w1) +self.b1
            #print("L1 z shape: ", np.shape(self.z1))
            self.a1 = self.Relu(self.z1)
            self.z2 = np.dot(self.a1, self.w2) + self.b2
            self.a2 = self.Relu(self.z2)
            self.z3 = np.dot(self.a2, self.w3) + self.b3
            y_pred = self.Sigmoid(self.z3)
    
            return y_pred
    
        def mini_batches(self, X, Y, minibatch_size):
            m = Y.shape[0]
    
            permutation = list(np.random.permutation(m))
            X_shuffled = X[permutation, :]
            # Y_shuffled = Y[permutation,:] would be used for multi-class classification
            Y_shuffled = Y[permutation]
            minibatches = []
    
            n_minibatches = int(m / minibatch_size)
            for i in range(n_minibatches):
                X_minibatch = X_shuffled[i * minibatch_size:(i + 1) * minibatch_size, :]
                Y_minibatch = Y_shuffled[i * minibatch_size:(i + 1) * minibatch_size]
                minibatch_pair = (X_minibatch, Y_minibatch)
                minibatches.append(minibatch_pair)
            if m % minibatch_size != 0:
                Lat_X_minibatch = X_shuffled[n_minibatches * minibatch_size - 1:m, :]
                Last_Y_minibatch = Y_shuffled[n_minibatches * minibatch_size - 1:m]
                Last_minibatch_pair = (Lat_X_minibatch, Last_Y_minibatch)
                minibatches.append(Last_minibatch_pair)
            return minibatches
    
        def Relu(self, x):
            return np.maximum(x,0)
    
        def Sigmoid(self, Z):
            return 1/(1+np.exp(-Z))
    
        def Cross_Entropy(self, y_pred, Y):
            # clip: any value below 1e-7 becomes 1e-7 and any value above 1-1e-7 becomes 1-1e-7
            # this prevents us from encountering log(0) values, which would break the code
            y_pred = np.clip(y_pred, 1e-7, 1-1e-7)
            term_y1 = Y*np.log(y_pred+1e-7)
            term_y0 = (1-Y)*np.log(1-y_pred+1e-7)
            return -np.mean(term_y1+term_y0, axis=0)
    
        def Relu_Derivative(self, x):
            return np.greater(x, 0).astype(int)
    
    
        def Backpropagation(self, X, Y, y_pred,lr =0.01):
            # this is the error term for the output layer, so this is the chain rule of BCE derivative x Sigmoid derivative
            #print("Y shape: ", np.shape(Y))
            #print("Predicted output shape: ", np.shape(y_pred))
            m = Y.shape[1]
            self.error_term_output = (1/m)*(y_pred-Y)
            #print(np.shape(self.error_term_output))
            self.W3_gradient = self.a2.T.dot(self.error_term_output)
    
            self.errorh2 = self.error_term_output.dot(self.w3.T)
            self.error_term_hidden2 = self.errorh2 * self.Relu_Derivative(self.a2)
            self.W2_gradient = self.a1.T.dot(self.error_term_hidden2)
    
            self.errorh1 = self.error_term_hidden2.dot(self.w2.T)
            self.error_term_hidden1 = self.errorh1 * self.Relu_Derivative(self.a1)
            self.W1_gradient = X.T.dot(self.error_term_hidden1)
    
    
            self.w1 = self.w1 - lr*self.W1_gradient
            self.b1 = self.b1 - lr*self.error_term_hidden1.sum(axis=0, keepdims=True)
            self.w2 = self.w2 - lr*self.W2_gradient
            self.b2 = self.b2 - lr*self.error_term_hidden2.sum(axis=0, keepdims=True)
            self.w3 = self.w3 - lr*self.W3_gradient
            self.b3 = self.b3 - lr*self.error_term_output.sum(axis=0, keepdims=True)
    
    
        def Train(self, X, Y, lr=0.01, batch_size=32, epochs = 10000):
            E=[]
            m = len(X)
            for i in range(epochs):
                e=[]
                minibatches = self.mini_batches(X, Y, batch_size)
                j=0
                for batch in minibatches:
                    j += 1
                    #print("Batch: ", j)
                    x = np.array(batch[0])
                    y = np.reshape(np.array(batch[1]),(batch_size, 1))
    
                    y_pred = self.forward(x)
                    #print("Actual Output: /n",y, "/n Predicted Output: /n", y_pred)
                    loss = self.Cross_Entropy(y_pred, y).item()
                    #print("Loss: /n",loss,"/n")
                    e.append(loss)
                    self.Backpropagation(x,y,y_pred,lr=lr)
                E.append(np.mean(e))
            return E
    print(np.shape(X_train))
    NN = network(X_train, Y_train)
    Errors = NN.Train(X_train, Y_train, lr=0.01, batch_size=32, epochs=100)
    print(Errors)
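
To check convergence visually, you can plot the returned loss history with the matplotlib import that is already at the top of the script (a quick usage sketch, assuming the code above has been run):

    plt.plot(Errors)
    plt.xlabel("Epoch")
    plt.ylabel("Mean BCE loss")
    plt.title("Training loss per epoch")
    plt.show()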