python numpy machine-learning deep-learning neural-network

Why is my binary classification network not converging?

I have created a binary classification neural network from scratch, using ReLU for the hidden layers, sigmoid for the final layer, and the binary cross-entropy loss function; I also use minibatch gradient descent. I'm struggling to understand why my network converges completely fine for smaller data sets, but just hovers around a loss of 0.7 for larger data sets. My code may be really poor, so if anyone can tell me where my algorithm is going wrong I'd be really grateful. Here's the code:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Load the CSV file into the data frame df.
df = pd.read_csv('housepricedata.csv')
#print(df)
# Convert the data frame into a plain array of values.
dataset = df.values
print(dataset)
# Columns 0-9 are the inputs and column 10 is the output.
X = np.array(dataset[:,0:10])
Y = np.array(dataset[:,10])
print(np.shape(X))
# Min-max normalise the inputs; the scaled copy is stored in X_scale.
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)
#print(X_scale)
# Split by position: the first 1024 rows are the training set (intended as
# roughly 70%), the remaining rows are halved into validation and test sets.
# NOTE(review): the slices below are taken from the raw X, not from X_scale,
# so the normalised data computed above is never used by the rest of the script.
X_train = X[:1024, :]
Y_train = Y[:1024]
X_val_and_test = X[1024:, :]
Y_val_and_test = Y[1024:]
#X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X_scale, Y, test_size=0.3)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)


class network:
    """Three-layer fully connected network (10 -> 32 -> 32 -> 1) for binary classification.

    Hidden layers use ReLU, the output layer uses a sigmoid, the loss is
    binary cross-entropy, and training uses minibatch gradient descent.
    """

    def __init__(self, X, Y):
        """Create the weights and biases of the three layers.

        X and Y are accepted but not read here; the layer sizes are hard-coded.
        """
        self.w1, self.b1 = self.generate_w_b(10,32)
        self.w2, self.b2 = self.generate_w_b(32,32)
        self.w3, self.b3 = self.generate_w_b(32,1)

    def generate_w_b(self, n_inputs, n_outup_neurons):
        """Return (weights, biases) for one layer.

        Weights are standard-normal samples scaled by 0.01 with shape
        (n_inputs, n_outup_neurons); biases are a 1-D zero vector of length
        n_outup_neurons.
        """
        weights = 0.01*np.random.randn(n_inputs,n_outup_neurons)
        biases = np.zeros((n_outup_neurons))
        return weights, biases

    def forward(self, inputs):
        """Run a forward pass and return the sigmoid output.

        The pre-activations (z1, z2, z3) and hidden activations (a1, a2) are
        stored on the instance because Backpropagation reads a1 and a2.
        """
        self.z1 = np.dot(inputs, self.w1) +self.b1
        #print("L1 z shape: ", np.shape(self.z1))
        self.a1 = self.Relu(self.z1)
        self.z2 = np.dot(self.a1, self.w2) + self.b2
        self.a2 = self.Relu(self.z2)
        self.z3 = np.dot(self.a2, self.w3) + self.b3
        y_pred = self.Sigmoid(self.z3)

        return y_pred

    def mini_batches(self, X, Y, minibatch_size):
        """Shuffle X and Y with one shared permutation and return a list of (X_batch, Y_batch) pairs."""
        m = Y.shape[0]

        # One random permutation is applied to both arrays so rows stay paired.
        permutation = list(np.random.permutation(m))
        X_shuffled = X[permutation, :]
        # Y_shuffled = Y[permutation,:] this is for mult classification
        Y_shuffled = Y[permutation]
        minibatches = []

        # Full-sized batches first.
        n_minibatches = int(m / minibatch_size)
        for i in range(n_minibatches):
            X_minibatch = X_shuffled[i * minibatch_size:(i + 1) * minibatch_size, :]
            Y_minibatch = Y_shuffled[i * minibatch_size:(i + 1) * minibatch_size]
            minibatch_pair = (X_minibatch, Y_minibatch)
            minibatches.append(minibatch_pair)
        # Leftover rows go into one final, smaller batch.
        # NOTE(review): the slice starts at n_minibatches * minibatch_size - 1,
        # i.e. one row before the first unused row, so the last row of the
        # previous full batch is included again.
        if m % minibatch_size != 0:
            Lat_X_minibatch = X_shuffled[n_minibatches * minibatch_size - 1:m, :]
            Last_Y_minibatch = Y_shuffled[n_minibatches * minibatch_size - 1:m]
            Last_minibatch_pair = (Lat_X_minibatch, Last_Y_minibatch)
            minibatches.append(Last_minibatch_pair)
        return minibatches

    def Relu(self, x):
        """Element-wise max(x, 0)."""
        return np.maximum(x,0)

    def Sigmoid(self, Z):
        """Element-wise logistic function 1 / (1 + exp(-Z))."""
        return 1/(1+np.exp(-Z))

    def Cross_Entropy(self, y_pred, Y):
        """Return the binary cross-entropy of y_pred against Y, averaged over axis 0."""
        # Clipping maps anything below 1e-7 to 1e-7 and anything above 1-1e-7
        # to 1-1e-7, so the logarithms below never see 0.
        # A further 1e-7 is added inside each log on top of the clipping.
        y_pred = np.clip(y_pred, 1e-7, 1-1e-7)
        term_y1 = Y*np.log(y_pred+1e-7)
        term_y0 = (1-Y)*np.log(1-y_pred+1e-7)
        return -np.mean(term_y1+term_y0, axis=0)

    def Relu_Derivative(self, x):
        """Return 1 where x > 0 and 0 elsewhere, as an integer array."""
        return np.greater(x, 0).astype(int)



    def Backpropagation(self, X, Y, y_pred,lr =0.01):
        """Compute the gradients for one batch and apply one gradient-descent step.

        Reads the activations a1 and a2 cached by the preceding forward() call
        and overwrites w1..w3 and b1..b3.
        """
        # this is the error term for the output layer, so this is the chain rule of BCE derivative x Sigmoid derivative
        #print("Y shape: ", np.shape(Y))
        #print("Predicted output shape: ", np.shape(y_pred))
        # NOTE(review): m is the number of columns of Y. Train passes Y with
        # shape (32, 1), so m is 1 there and the (1/m) factors below do not
        # scale anything.
        m = Y.shape[1]
        self.error_term_output = (1/m)*(y_pred-Y)
        #print(np.shape(self.error_term_output))
        self.W3_gradient = self.a2.T.dot(self.error_term_output)

        # Propagate the error back through layer 3 and the second ReLU.
        self.errorh2 = self.error_term_output.dot(self.w3.T)
        self.error_term_hidden2 = (1/m)*(self.errorh2 * self.Relu_Derivative(self.a2))
        self.W2_gradient = self.a1.T.dot(self.error_term_hidden2)

        # Propagate the error back through layer 2 and the first ReLU.
        self.errorh1 = self.error_term_hidden2.dot(self.w2.T)
        self.error_term_hidden1 = (1/m)*(self.errorh1 * self.Relu_Derivative(self.a1))
        self.W1_gradient = X.T.dot(self.error_term_hidden1)


        # NOTE(review): the bias updates subtract the per-sample error terms
        # without reducing over the batch axis. The error terms have one row
        # per sample, so broadcasting turns each 1-D bias into a 2-D array with
        # one row per sample after the first update.
        self.w1 = self.w1 - lr*self.W1_gradient
        self.b1 = self.b1 - lr*self.error_term_hidden1
        self.w2 = self.w2 - lr*self.W2_gradient
        self.b2 = self.b2 - lr*self.error_term_hidden2
        self.w3 = self.w3 - lr*self.W3_gradient
        self.b3 = self.b3 - lr*self.error_term_output


    def Train(self,X, Y, epochs = 10000):
        """Train for the given number of epochs and return the list of mean batch losses, one per epoch.

        Uses a fixed minibatch size of 32 and the default learning rate of
        Backpropagation.
        """
        E=[]
        # m and j are assigned but not used by the loop below.
        m = len(X)
        for i in range(epochs):
            e=[]
            # A fresh shuffle and split for every epoch.
            minibatches = self.mini_batches(X, Y, 32)
            j=0
            for batch in minibatches:
                j += 1
                #print("Batch: ", j)
                x = np.array(batch[0])
                # Labels become a column vector so they line up with y_pred.
                # NOTE(review): the hard-coded (32, 1) shape only fits batches
                # that hold exactly 32 labels.
                y = np.reshape(np.array(batch[1]),(32, 1))

                y_pred = self.forward(x)
                #print("Actual Output: /n",y, "/n Predicted Output: /n", y_pred)
                loss = np.mean(self.Cross_Entropy(y_pred, y))
                #print("Loss: /n",loss,"/n")
                e.append(loss)
                self.Backpropagation(x,y,y_pred)
            E.append(np.mean(e))
        return E
# Print the shape of the training matrix as a sanity check.
print(np.shape(X_train))
# The constructor takes the training data but does not read it.
NN = network(X_train, Y_train)
# Train with the defaults (10000 epochs); Errors holds the mean loss of each epoch.
Errors = NN.Train(X_train, Y_train)

Solution

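First, you seem to have forgotten to use the scaled data: `X_train = X[:1024, :]` should be `X_train = X_scale[:1024, :]` (and likewise for `X_val_and_test`). Second, the shape of the bias gradient does not look right. For example, the last-layer bias (`b3`) should be a scalar; however, your update rule subtracts `lr*self.error_term_output`, which is `lr*(y_pred-Y)`, a vector with one entry per sample in the batch. The bias gradient has to be summed over the batch before it is applied. I have corrected both points in the modified version below.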

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Load the CSV file into a data frame and work on its raw values.
df = pd.read_csv('housepricedata.csv')
dataset = df.values
print(dataset)
# Columns 0-9 are the inputs and column 10 is the output label.
X = np.array(dataset[:, 0:10])
Y = np.array(dataset[:, 10])
print(np.shape(X))

# Positional split: the first n_train rows are the training set (intended as
# roughly 70%); the remaining rows are halved into validation and test sets.
n_train = 1024

# Min-max scale every input feature. The scaler is fitted on the training rows
# only, so that statistics of the validation/test rows do not leak into
# training; held-out rows may therefore fall slightly outside [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X[:n_train, :])
X_scale = min_max_scaler.transform(X)

X_train = X_scale[:n_train, :]
Y_train = Y[:n_train]
X_val_and_test = X_scale[n_train:, :]
Y_val_and_test = Y[n_train:]
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)


class network:
    """Three-layer fully connected network (10 -> 32 -> 32 -> 1) for binary classification.

    Hidden layers use ReLU, the output layer uses a sigmoid, the loss is
    binary cross-entropy, and training uses minibatch gradient descent.
    """

    def __init__(self, X, Y):
        """Create the weights and biases of the three layers.

        ``X`` and ``Y`` are accepted for interface compatibility but are not
        read; the layer sizes are fixed.
        """
        self.w1, self.b1 = self.generate_w_b(10, 32)
        self.w2, self.b2 = self.generate_w_b(32, 32)
        self.w3, self.b3 = self.generate_w_b(32, 1)

    def generate_w_b(self, n_inputs, n_outup_neurons):
        """Return ``(weights, biases)`` for one layer.

        Weights are standard-normal samples scaled by 0.01 with shape
        ``(n_inputs, n_outup_neurons)``. Biases are zeros with shape
        ``(1, n_outup_neurons)`` so they broadcast over the batch axis and
        match the ``keepdims=True`` sums used in ``Backpropagation``.
        """
        weights = 0.01 * np.random.randn(n_inputs, n_outup_neurons)
        biases = np.zeros((1, n_outup_neurons))
        return weights, biases

    def forward(self, inputs):
        """Run a forward pass on ``inputs`` and return the sigmoid output.

        The pre-activations (``z1``..``z3``) and hidden activations (``a1``,
        ``a2``) are cached on the instance for ``Backpropagation``.
        """
        self.z1 = np.dot(inputs, self.w1) + self.b1
        self.a1 = self.Relu(self.z1)
        self.z2 = np.dot(self.a1, self.w2) + self.b2
        self.a2 = self.Relu(self.z2)
        self.z3 = np.dot(self.a2, self.w3) + self.b3
        return self.Sigmoid(self.z3)

    def mini_batches(self, X, Y, minibatch_size):
        """Shuffle ``X`` and ``Y`` together and split them into minibatches.

        Returns a list of ``(X_batch, Y_batch)`` pairs. Every sample appears in
        exactly one batch; when the number of samples is not a multiple of
        ``minibatch_size`` the final batch is smaller.

        Raises:
            ValueError: if ``minibatch_size`` is smaller than 1.
        """
        if minibatch_size < 1:
            raise ValueError("minibatch_size must be at least 1")
        m = Y.shape[0]

        # One shared permutation keeps every input row paired with its label.
        permutation = np.random.permutation(m)
        X_shuffled = X[permutation, :]
        Y_shuffled = Y[permutation]

        # Stepping by the batch size yields the full batches followed by the
        # (possibly shorter) remainder, with no row repeated or skipped.
        return [
            (X_shuffled[start:start + minibatch_size, :],
             Y_shuffled[start:start + minibatch_size])
            for start in range(0, m, minibatch_size)
        ]

    def Relu(self, x):
        """Element-wise ``max(x, 0)``."""
        return np.maximum(x, 0)

    def Sigmoid(self, Z):
        """Element-wise logistic function ``1 / (1 + exp(-Z))``."""
        return 1 / (1 + np.exp(-Z))

    def Cross_Entropy(self, y_pred, Y):
        """Return the binary cross-entropy of ``y_pred`` against ``Y``, averaged over axis 0."""
        # Clipping to [1e-7, 1 - 1e-7] keeps both logarithms away from log(0),
        # so no further epsilon is needed inside them.
        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
        term_y1 = Y * np.log(y_pred)
        term_y0 = (1 - Y) * np.log(1 - y_pred)
        return -np.mean(term_y1 + term_y0, axis=0)

    def Relu_Derivative(self, x):
        """Return 1 where ``x > 0`` and 0 elsewhere, as an integer array."""
        return np.greater(x, 0).astype(int)

    def Backpropagation(self, X, Y, y_pred, lr=0.01):
        """Compute the gradients for one batch and apply one gradient-descent step.

        Must be called after ``forward`` on the same ``X``: it reads the cached
        activations ``a1`` and ``a2`` and overwrites ``w1``..``w3`` and
        ``b1``..``b3``.
        """
        # For a sigmoid output with binary cross-entropy, dLoss/dz3 simplifies
        # to (y_pred - Y).
        # NOTE(review): m is the number of label columns (1 for the column
        # vector passed by Train), not the batch size, so the gradients are
        # summed over the batch rather than averaged. Left unchanged so that
        # existing learning rates keep their meaning.
        m = Y.shape[1]
        self.error_term_output = (1 / m) * (y_pred - Y)
        self.W3_gradient = self.a2.T.dot(self.error_term_output)

        # Back through layer 3 and the second ReLU (a2 > 0 exactly where z2 > 0).
        self.errorh2 = self.error_term_output.dot(self.w3.T)
        self.error_term_hidden2 = self.errorh2 * self.Relu_Derivative(self.a2)
        self.W2_gradient = self.a1.T.dot(self.error_term_hidden2)

        # Back through layer 2 and the first ReLU.
        self.errorh1 = self.error_term_hidden2.dot(self.w2.T)
        self.error_term_hidden1 = self.errorh1 * self.Relu_Derivative(self.a1)
        self.W1_gradient = X.T.dot(self.error_term_hidden1)

        # Bias gradients are the error terms summed over the batch axis;
        # keepdims=True preserves the (1, n) bias shape.
        self.w1 = self.w1 - lr * self.W1_gradient
        self.b1 = self.b1 - lr * self.error_term_hidden1.sum(axis=0, keepdims=True)
        self.w2 = self.w2 - lr * self.W2_gradient
        self.b2 = self.b2 - lr * self.error_term_hidden2.sum(axis=0, keepdims=True)
        self.w3 = self.w3 - lr * self.W3_gradient
        self.b3 = self.b3 - lr * self.error_term_output.sum(axis=0, keepdims=True)

    def Train(self, X, Y, lr=0.01, batch_size=32, epochs=10000):
        """Train the network and return the mean minibatch loss of every epoch.

        Args:
            X: input matrix with one row per sample.
            Y: 1-D array with one label per sample.
            lr: learning rate passed to ``Backpropagation``.
            batch_size: number of samples per minibatch; the last batch of an
                epoch may be smaller.
            epochs: number of passes over the data.

        Returns:
            A list with one entry per epoch: the unweighted mean of that
            epoch's minibatch losses.
        """
        E = []
        for _ in range(epochs):
            e = []
            # The data is reshuffled and re-split at the start of every epoch.
            for x_batch, y_batch in self.mini_batches(X, Y, batch_size):
                x = np.array(x_batch)
                # Column vector of labels; -1 lets a short final batch through.
                y = np.reshape(np.array(y_batch), (-1, 1))

                y_pred = self.forward(x)
                e.append(self.Cross_Entropy(y_pred, y).item())
                self.Backpropagation(x, y, y_pred, lr=lr)
            E.append(np.mean(e))
        return E
# Show the training-set dimensions, fit the model for 100 epochs, and print
# the mean loss recorded for each epoch.
print(X_train.shape)
NN = network(X_train, Y_train)
Errors = NN.Train(X=X_train, Y=Y_train, lr=0.01, batch_size=32, epochs=100)
print(Errors)