I have created a binary classification neural network from scratch, using ReLU for the hidden layers, a sigmoid for the final layer, and the binary cross-entropy loss function. I train it with mini-batch gradient descent. I'm struggling to understand why my network converges completely fine for smaller data sets, but just hovers around a loss of 0.7 for larger data sets. My code may be really poor, so if anyone can tell me where my algorithm is going wrong I'd be really grateful. Here's the code:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Load the CSV file into the data frame df.
df = pd.read_csv('housepricedata.csv')
#print(df)
# Convert the data frame into a plain 2-D array.
dataset = df.values
print(dataset)
# Columns 0-9 are the inputs and column 10 is the output.
X = np.array(dataset[:,0:10])
Y = np.array(dataset[:,10])
print(np.shape(X))
# Rescale every input column with a min-max scaler; the result is kept in X_scale.
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)
#print(X_scale)
# Split the data set: the first 1024 rows are used for training, and the
# remaining rows are divided evenly into validation and test sets below.
# NOTE(review): the slices below index the raw X, not the X_scale computed
# above -- confirm that this is intended.
X_train = X[:1024, :]
Y_train = Y[:1024]
X_val_and_test = X[1024:, :]
Y_val_and_test = Y[1024:]
#X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X_scale, Y, test_size=0.3)
# test_size=0.5 puts half of the held-out rows in each of the two sets.
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)
class network:
    """Binary classifier with two hidden ReLU layers and a sigmoid output.

    The layer widths are hard-coded as 10 -> 32 -> 32 -> 1 and the model is
    trained with mini-batch gradient descent on binary cross-entropy.

    NOTE(review): this listing carried no indentation; the nesting shown here
    is reconstructed from the statement order and should be confirmed.
    """

    def __init__(self, X, Y):
        """Create the weights and biases; X and Y are accepted but not read."""
        self.w1, self.b1 = self.generate_w_b(10,32)
        self.w2, self.b2 = self.generate_w_b(32,32)
        self.w3, self.b3 = self.generate_w_b(32,1)

    def generate_w_b(self, n_inputs, n_outup_neurons):
        """Return (weights, biases) for one layer.

        Weights are standard-normal draws scaled by 0.01 with shape
        (n_inputs, n_outup_neurons); biases are a 1-D zero vector of length
        n_outup_neurons.
        """
        weights = 0.01*np.random.randn(n_inputs,n_outup_neurons)
        biases = np.zeros((n_outup_neurons))
        return weights, biases

    def forward(self, inputs):
        """Run a forward pass and return the sigmoid output.

        The pre-activations (z1, z2, z3) and hidden activations (a1, a2) are
        stored on the instance because Backpropagation reads a1 and a2.
        """
        self.z1 = np.dot(inputs, self.w1) +self.b1
        #print("L1 z shape: ", np.shape(self.z1))
        self.a1 = self.Relu(self.z1)
        self.z2 = np.dot(self.a1, self.w2) + self.b2
        self.a2 = self.Relu(self.z2)
        self.z3 = np.dot(self.a2, self.w3) + self.b3
        y_pred = self.Sigmoid(self.z3)
        return y_pred

    def mini_batches(self, X, Y, minibatch_size):
        """Shuffle X and Y with one permutation and return a list of (X, Y) batches."""
        m = Y.shape[0]
        permutation = list(np.random.permutation(m))
        X_shuffled = X[permutation, :]
        # Y_shuffled = Y[permutation,:] this is for mult classification
        Y_shuffled = Y[permutation]
        minibatches = []
        # Number of complete batches; any remainder is handled after the loop.
        n_minibatches = int(m / minibatch_size)
        for i in range(n_minibatches):
            X_minibatch = X_shuffled[i * minibatch_size:(i + 1) * minibatch_size, :]
            Y_minibatch = Y_shuffled[i * minibatch_size:(i + 1) * minibatch_size]
            minibatch_pair = (X_minibatch, Y_minibatch)
            minibatches.append(minibatch_pair)
        if m % minibatch_size != 0:
            # NOTE(review): the tail slice starts at
            # n_minibatches * minibatch_size - 1, which is the last row of the
            # final full batch, so that row appears in two batches -- confirm.
            Lat_X_minibatch = X_shuffled[n_minibatches * minibatch_size - 1:m, :]
            Last_Y_minibatch = Y_shuffled[n_minibatches * minibatch_size - 1:m]
            Last_minibatch_pair = (Lat_X_minibatch, Last_Y_minibatch)
            minibatches.append(Last_minibatch_pair)
        return minibatches

    def Relu(self, x):
        """Element-wise max(x, 0)."""
        return np.maximum(x,0)

    def Sigmoid(self, Z):
        """Element-wise logistic function 1 / (1 + exp(-Z))."""
        return 1/(1+np.exp(-Z))

    def Cross_Entropy(self, y_pred, Y):
        """Return the binary cross-entropy averaged along axis 0."""
        # Clipping maps anything below 1e-7 to 1e-7 and anything above 1-1e-7
        # to 1-1e-7, so that log(0) is never evaluated.
        y_pred = np.clip(y_pred, 1e-7, 1-1e-7)
        # A further 1e-7 is added inside each log on top of the clipping.
        term_y1 = Y*np.log(y_pred+1e-7)
        term_y0 = (1-Y)*np.log(1-y_pred+1e-7)
        return -np.mean(term_y1+term_y0, axis=0)

    def Relu_Derivative(self, x):
        """Return 1 where x > 0 and 0 elsewhere, as integers."""
        return np.greater(x, 0).astype(int)

    def Backpropagation(self, X, Y, y_pred,lr =0.01):
        """Compute the gradients for one batch and update all parameters in place.

        Relies on self.a1 and self.a2 left behind by the preceding forward() call.
        """
        # The output-layer error term is the chain rule of the BCE derivative
        # and the sigmoid derivative, which reduces to (y_pred - Y).
        #print("Y shape: ", np.shape(Y))
        #print("Predicted output shape: ", np.shape(y_pred))
        # m is the second dimension of Y; Train reshapes each batch to (32, 1),
        # so m is 1 when called from there.
        m = Y.shape[1]
        self.error_term_output = (1/m)*(y_pred-Y)
        #print(np.shape(self.error_term_output))
        self.W3_gradient = self.a2.T.dot(self.error_term_output)
        self.errorh2 = self.error_term_output.dot(self.w3.T)
        # The ReLU mask is taken from the activation a2; since Relu is
        # max(x, 0), a2 > 0 exactly where z2 > 0. The (1/m) factor is applied
        # again here and in the first hidden layer below.
        self.error_term_hidden2 = (1/m)*(self.errorh2 * self.Relu_Derivative(self.a2))
        self.W2_gradient = self.a1.T.dot(self.error_term_hidden2)
        self.errorh1 = self.error_term_hidden2.dot(self.w2.T)
        self.error_term_hidden1 = (1/m)*(self.errorh1 * self.Relu_Derivative(self.a1))
        self.W1_gradient = X.T.dot(self.error_term_hidden1)
        # All gradients are computed above before any parameter is modified.
        # NOTE(review): the bias updates subtract the per-sample error terms
        # without reducing over the batch axis, so after the first update each
        # bias appears to take one row per sample -- confirm the intended shape.
        self.w1 = self.w1 - lr*self.W1_gradient
        self.b1 = self.b1 - lr*self.error_term_hidden1
        self.w2 = self.w2 - lr*self.W2_gradient
        self.b2 = self.b2 - lr*self.error_term_hidden2
        self.w3 = self.w3 - lr*self.W3_gradient
        self.b3 = self.b3 - lr*self.error_term_output

    def Train(self,X, Y, epochs = 10000):
        """Train for `epochs` passes and return the list of per-epoch mean losses.

        The batch size is fixed at 32 inside this method.
        """
        E=[]
        # m is assigned but not read again in this method.
        m = len(X)
        for i in range(epochs):
            e=[]
            minibatches = self.mini_batches(X, Y, 32)
            # j counts batches; it is only referenced by the commented-out print.
            j=0
            for batch in minibatches:
                j += 1
                #print("Batch: ", j)
                x = np.array(batch[0])
                # NOTE(review): this reshape requires exactly 32 targets per
                # batch, so a tail batch of another size would presumably fail.
                y = np.reshape(np.array(batch[1]),(32, 1))
                y_pred = self.forward(x)
                #print("Actual Output: /n",y, "/n Predicted Output: /n", y_pred)
                loss = np.mean(self.Cross_Entropy(y_pred, y))
                #print("Loss: /n",loss,"/n")
                e.append(loss)
                self.Backpropagation(x,y,y_pred)
            # One entry per epoch: the mean of that epoch's batch losses.
            E.append(np.mean(e))
        return E
# Show the training-set shape, build the network and train it with the
# defaults of Train; Errors receives the list that Train returns.
print(np.shape(X_train))
NN = network(X_train, Y_train)
Errors = NN.Train(X_train, Y_train)
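First, you seem to have forgotten to use the scaled data: `X_train = X[:1024, :]` should be `X_train = X_scale[:1024, :]` (and likewise `X_val_and_test` should be sliced from `X_scale`). Second, the shape of the bias gradient terms does not look right. For example, the last-layer bias `b3` should hold a single value; however, your update rule subtracts `lr*self.error_term_output`, i.e. `lr*(y_pred-Y)`, which is a vector with one entry per sample in the batch. The bias gradient has to be summed over the batch dimension before it is applied. I have corrected both points in the modified version below.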
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Load the CSV file into a data frame and convert it to a plain 2-D array.
df = pd.read_csv('housepricedata.csv')
dataset = df.values
print(dataset)

# Columns 0-9 are the inputs and column 10 is the output.
X = np.array(dataset[:, 0:10])
Y = np.array(dataset[:, 10])
print(np.shape(X))

# Number of leading rows used for training. 1024 is a multiple of the batch
# size of 32 used by the training call further down.
N_TRAIN = 1024

# Fit the min-max scaler on the training rows only, so that no statistics of
# the validation/test rows leak into the training features, then apply that
# same transform to every row. Held-out values can therefore fall slightly
# outside [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X[:N_TRAIN, :])
X_scale = min_max_scaler.transform(X)

# Split the scaled data: the first N_TRAIN rows are for training and the
# remaining rows are divided evenly between validation and test.
X_train = X_scale[:N_TRAIN, :]
Y_train = Y[:N_TRAIN]
X_val_and_test = X_scale[N_TRAIN:, :]
Y_val_and_test = Y[N_TRAIN:]
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)
class network:
    """Fully connected binary classifier (inputs -> 32 -> 32 -> 1) written in NumPy.

    The two hidden layers use ReLU, the output layer uses a sigmoid, and the
    model is trained with mini-batch gradient descent on binary cross-entropy.
    """

    def __init__(self, X, Y):
        """Initialise the three layers.

        Args:
            X: Training inputs. When it is 2-D, its column count sets the
                width of the input layer; otherwise the width defaults to 10.
            Y: Training targets. Not used; kept so existing callers still work.
        """
        n_features = np.shape(X)[1] if np.ndim(X) == 2 else 10
        self.w1, self.b1 = self.generate_w_b(n_features, 32)
        self.w2, self.b2 = self.generate_w_b(32, 32)
        self.w3, self.b3 = self.generate_w_b(32, 1)

    def generate_w_b(self, n_inputs, n_outup_neurons):
        """Return (weights, biases) for one layer.

        Weights are standard-normal draws scaled by 0.01 with shape
        (n_inputs, n_outup_neurons). Biases are zeros with shape
        (1, n_outup_neurons) so they broadcast over the batch axis.
        """
        weights = 0.01 * np.random.randn(n_inputs, n_outup_neurons)
        biases = np.zeros((1, n_outup_neurons))
        return weights, biases

    def forward(self, inputs):
        """Run a forward pass and return predictions of shape (batch, 1).

        The intermediate values z1, a1, z2, a2 and z3 are cached on the
        instance because Backpropagation reads a1 and a2.
        """
        self.z1 = np.dot(inputs, self.w1) + self.b1
        self.a1 = self.Relu(self.z1)
        self.z2 = np.dot(self.a1, self.w2) + self.b2
        self.a2 = self.Relu(self.z2)
        self.z3 = np.dot(self.a2, self.w3) + self.b3
        return self.Sigmoid(self.z3)

    def mini_batches(self, X, Y, minibatch_size):
        """Shuffle the data and split it into mini-batches.

        Args:
            X: 2-D array of inputs, one row per sample.
            Y: 1-D array of targets aligned with the rows of X.
            minibatch_size: Maximum number of samples per batch.

        Returns:
            A list of (X_batch, Y_batch) tuples that together contain every
            sample exactly once; the last batch is shorter when the sample
            count is not a multiple of minibatch_size.

        Raises:
            ValueError: If minibatch_size is smaller than 1.
        """
        if minibatch_size < 1:
            raise ValueError("minibatch_size must be at least 1")
        m = Y.shape[0]
        order = np.random.permutation(m)
        X_shuffled = X[order, :]
        Y_shuffled = Y[order]
        # Stepping by minibatch_size yields the full batches and, because
        # slicing stops at the end of the array, the shorter tail batch too.
        return [
            (X_shuffled[start:start + minibatch_size, :],
             Y_shuffled[start:start + minibatch_size])
            for start in range(0, m, minibatch_size)
        ]

    def Relu(self, x):
        """Element-wise max(x, 0)."""
        return np.maximum(x, 0)

    def Sigmoid(self, Z):
        """Element-wise logistic function, evaluated without overflow.

        sigmoid(Z) = exp(-log(1 + exp(-Z))). np.logaddexp computes the inner
        logarithm stably, so large negative Z no longer overflows in exp.
        """
        return np.exp(-np.logaddexp(0, -Z))

    def Cross_Entropy(self, y_pred, Y):
        """Return the binary cross-entropy averaged along axis 0.

        For column-vector inputs of shape (batch, 1) the result has shape (1,).
        """
        # Clipping keeps both logarithms finite; no further epsilon is needed.
        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
        per_sample = Y * np.log(y_pred) + (1 - Y) * np.log(1 - y_pred)
        return -np.mean(per_sample, axis=0)

    def Relu_Derivative(self, x):
        """Return 1 where x > 0 and 0 elsewhere, as integers."""
        return np.greater(x, 0).astype(int)

    def Backpropagation(self, X, Y, y_pred, lr=0.01):
        """Compute the gradients for one batch and update all parameters.

        Must be called after forward() on the same batch, because it reads
        the cached activations a1 and a2.

        Args:
            X: Batch inputs of shape (batch, n_features).
            Y: Batch targets; reshaped to the shape of y_pred.
            y_pred: Output of forward() for X.
            lr: Learning rate.
        """
        # Align Y with y_pred so that the subtraction below can never
        # broadcast a (batch,) vector against a (batch, 1) column.
        Y = np.reshape(Y, np.shape(y_pred))
        # m is the number of output columns (1 for this network), so the error
        # terms are not divided by the batch size: each step follows the
        # gradient of the batch-summed loss, which is what existing learning
        # rates were chosen for.
        m = Y.shape[1]
        # BCE derivative times sigmoid derivative reduces to (y_pred - Y).
        self.error_term_output = (1 / m) * (y_pred - Y)
        self.W3_gradient = self.a2.T.dot(self.error_term_output)
        self.errorh2 = self.error_term_output.dot(self.w3.T)
        # The ReLU mask is taken from the activation; a > 0 exactly where z > 0.
        self.error_term_hidden2 = self.errorh2 * self.Relu_Derivative(self.a2)
        self.W2_gradient = self.a1.T.dot(self.error_term_hidden2)
        self.errorh1 = self.error_term_hidden2.dot(self.w2.T)
        self.error_term_hidden1 = self.errorh1 * self.Relu_Derivative(self.a1)
        self.W1_gradient = X.T.dot(self.error_term_hidden1)
        # All gradients are computed before any parameter changes. Bias
        # gradients are the error terms summed over the batch axis, which
        # keeps every bias at shape (1, n_units).
        self.w1 = self.w1 - lr * self.W1_gradient
        self.b1 = self.b1 - lr * self.error_term_hidden1.sum(axis=0, keepdims=True)
        self.w2 = self.w2 - lr * self.W2_gradient
        self.b2 = self.b2 - lr * self.error_term_hidden2.sum(axis=0, keepdims=True)
        self.w3 = self.w3 - lr * self.W3_gradient
        self.b3 = self.b3 - lr * self.error_term_output.sum(axis=0, keepdims=True)

    def Train(self, X, Y, lr=0.01, batch_size=32, epochs = 10000):
        """Train the network and return the mean batch loss of every epoch.

        Args:
            X: 2-D array of inputs, one row per sample.
            Y: 1-D array of binary targets.
            lr: Learning rate passed to Backpropagation.
            batch_size: Maximum number of samples per mini-batch.
            epochs: Number of passes over the data.

        Returns:
            A list with one entry per epoch: the mean of that epoch's
            per-batch losses.
        """
        E = []
        for _ in range(epochs):
            batch_losses = []
            for x_batch, y_batch in self.mini_batches(X, Y, batch_size):
                x = np.asarray(x_batch, dtype=float)
                # -1 lets a shorter tail batch through instead of failing on
                # a fixed (batch_size, 1) shape.
                y = np.reshape(np.asarray(y_batch, dtype=float), (-1, 1))
                y_pred = self.forward(x)
                batch_losses.append(self.Cross_Entropy(y_pred, y).item())
                self.Backpropagation(x, y, y_pred, lr=lr)
            E.append(np.mean(batch_losses))
        return E
# Report the training-set dimensions before fitting.
print(X_train.shape)

# Build the classifier and run 100 epochs of mini-batch gradient descent with
# 32 samples per batch; Errors holds the mean loss of each epoch.
NN = network(X_train, Y_train)
Errors = NN.Train(
    X_train,
    Y_train,
    lr=0.01,
    batch_size=32,
    epochs=100,
)
print(Errors)