The output layer is stuck at the vector [0.5, 0.5]. Can anyone help me understand whether there is a problem with the code?
The neural net I'm trying to train is an XOR gate, so the output vector should be close to the one-hot vector representing the correct class (0 or 1 in this case), but after all epochs the output vector still stays at [0.5, 0.5].
class Backpropogation:
    """Fully connected feed-forward network trained by per-sample backpropagation.

    Every layer, including the output layer, uses the logistic sigmoid.
    Samples are stored as columns: layer ``i`` has a weight matrix of shape
    ``(layer_spec[i], layer_spec[i - 1])`` and a column-vector bias of shape
    ``(layer_spec[i], 1)``. Layer 0 is the input layer and layer
    ``num_layers - 1`` is the output layer.
    """

    def setupWeightsBiases(self):
        """Create the initial weights and biases of every non-input layer.

        Values are drawn with ``rnd.rand`` (presumably ``numpy.random`` --
        confirm against the file's imports).
        """
        for i in range(1, self.num_layers):
            self.weights_dict[i] = rnd.rand(self.layer_spec[i], self.layer_spec[i - 1])
            self.bias_dict[i] = rnd.rand(self.layer_spec[i], 1)

    def __init__(self, hidden_layer_neurons_tuple, train_data, num_output_classes, output_layer_func='sigmoid'):
        """Store the training data and build the layer structure.

        Args:
            hidden_layer_neurons_tuple: Number of neurons in each hidden layer.
            train_data: Pair ``(inputs, labels)``. ``inputs`` is reshaped to
                ``(n_samples, input_layer_size)`` and transposed so that each
                column is one sample; ``labels[i]`` is used as the index of
                the correct class of sample ``i``.
            num_output_classes: Number of output neurons.
            output_layer_func: Accepted for interface compatibility; it is not
                read anywhere in this class (the output is always sigmoid).
        """
        self.train_input = train_data[0]
        self.input_layer_size = self.train_input[0].size
        # Flatten every sample, then transpose so samples become columns.
        self.train_input = self.train_input.reshape(self.train_input.shape[0], self.input_layer_size).T
        self.output_layer_size = num_output_classes
        self.train_output = train_data[1]
        print(self.train_output.shape)
        num_hidden_layer = len(hidden_layer_neurons_tuple)
        self.hidden_layer_neurons_tuple = hidden_layer_neurons_tuple
        self.layer_spec = tuple(
            [self.input_layer_size] + list(hidden_layer_neurons_tuple) + [num_output_classes]
        )
        self.num_layers = num_hidden_layer + 2
        self.train_data = train_data
        # Gradients produced by findGradients, keyed by layer index.
        self.activation_layer_gradient_dict = {}
        self.preactivation_layer_gradient_dict = {}
        self.weights_gradient_dict = {}
        self.bias_gradient_dict = {}
        self.curr_input = None
        self.curr_output = None
        # Parameters and forward-pass caches, keyed by layer index.
        self.weights_dict = {}
        self.preactivation_layer_dict = {}
        self.activation_layer_dict = {}
        self.bias_dict = {}
        self.setupWeightsBiases()
        self.output = None
        self.output_diff = None
        self.num_output_classes = num_output_classes

    def predictClass(self):
        """Return the index of the largest output activation of the last forward pass."""
        return np.argmax(self.activation_layer_dict[self.num_layers - 1])

    def forwardPropogation(self, input):
        """Run one forward pass and cache every layer's values.

        ``h[0]`` is the input column vector; for each layer ``k`` the
        pre-activation is ``a[k] = W[k] @ h[k-1] + b[k]`` and the activation
        is ``h[k] = activationFunction(a[k])``. The last layer's activation is
        finally replaced by ``outputFunction()``.
        """
        self.activation_layer_dict[0] = input
        for i in range(1, self.num_layers):
            self.preactivation_layer_dict[i] = (
                np.matmul(self.weights_dict[i], self.activation_layer_dict[i - 1])
                + self.bias_dict[i]
            )
            self.activation_layer_dict[i] = self.activationFunction(self.preactivation_layer_dict[i])
        # This turns h[L] into the network prediction y'.
        self.activation_layer_dict[self.num_layers - 1] = self.outputFunction()

    def findGradients(self, index):
        """Backpropagate the error of training sample ``index``.

        Must be called after ``forwardPropogation`` on the same sample. Fills
        the weight, bias, activation and pre-activation gradient dicts.
        """
        class_label = self.train_output[index]
        output_one_hot_vector = np.zeros((self.num_output_classes, 1))
        output_one_hot_vector[class_label] = 1
        output = self.activation_layer_dict[self.num_layers - 1]
        # Output delta is (prediction - target): the gradient of a sigmoid
        # output combined with cross-entropy w.r.t. the pre-activation.
        self.preactivation_layer_gradient_dict[self.num_layers - 1] = -1 * (output_one_hot_vector - output)
        for layer in reversed(range(1, self.num_layers)):
            delta = self.preactivation_layer_gradient_dict[layer]
            self.weights_gradient_dict[layer] = np.matmul(delta, self.activation_layer_dict[layer - 1].T)
            self.bias_gradient_dict[layer] = delta
            self.activation_layer_gradient_dict[layer - 1] = np.matmul(self.weights_dict[layer].T, delta)
            if layer != 1:
                # Chain rule through the activation of the previous layer.
                self.preactivation_layer_gradient_dict[layer - 1] = np.multiply(
                    self.activation_layer_gradient_dict[layer - 1],
                    self.outputFunctionDiff(layer - 1))

    def activationFunction(self, vec, type='sigmoid'):
        """Apply the activation function element-wise to ``vec``.

        Raises:
            ValueError: If ``type`` is not ``'sigmoid'``.
        """
        if type == 'sigmoid':
            # expit(x) already equals 1 / (1 + exp(-x)). Wrapping it in another
            # 1 / (1 + ...) confined activations to (0.5, 1) and kept the
            # network output pinned near 0.5.
            return expit(vec)
        raise ValueError('Please select correct output function')

    def outputFunction(self, type='sigmoid'):
        """Return the output-layer activation computed from its cached pre-activation.

        Raises:
            ValueError: If ``type`` is not ``'sigmoid'``.
        """
        if type == 'sigmoid':
            return expit(self.preactivation_layer_dict[self.num_layers - 1])
        raise ValueError('Please select correct output function')

    def outputFunctionDiff(self, layer, type='sigmoid'):
        """Return the activation derivative at the cached pre-activation of ``layer``.

        Raises:
            ValueError: If ``type`` is not ``'sigmoid'``.
        """
        if type == 'sigmoid':
            activated = self.activationFunction(self.preactivation_layer_dict[layer])
            return np.multiply(activated, 1 - activated)
        raise ValueError('Please select correct output function')

    def updateWeightsAndBiases(self, learning_rate):
        """Take one gradient-descent step on the weights and biases.

        Only the parameters are updated. Cached activations and
        pre-activations are recomputed by the next forward pass, so they are
        left untouched here.
        """
        for layer in range(1, self.num_layers):
            self.weights_dict[layer] = self.weights_dict[layer] - learning_rate * self.weights_gradient_dict[layer]
            self.bias_dict[layer] = self.bias_dict[layer] - learning_rate * self.bias_gradient_dict[layer]

    def getLoss(self, index):
        """Return ``-log2`` of the output assigned to the true class of sample ``index``.

        Uses the activations cached by the last forward pass. The value is
        non-negative and shrinks as the prediction improves.
        """
        true_class_output = self.activation_layer_dict[self.num_layers - 1][self.train_output[index], 0]
        return -np.log2(true_class_output)

    def train(self, learning_rate, num_epochs):
        """Train with per-sample gradient descent, visiting samples in a fresh random order each epoch."""
        for curr_epoch in range(num_epochs):
            print('Evaluating at ' + str(curr_epoch))
            index_array = list(np.arange(0, self.train_input.shape[1]))
            np.random.shuffle(index_array)
            for train_data_index in index_array:
                # Keep the sample as a column vector of shape (input_size, 1).
                test_input = self.train_input[:, [train_data_index]]
                self.forwardPropogation(test_input)
                self.findGradients(train_data_index)
                self.updateWeightsAndBiases(learning_rate)
                # NOTE(review): the original indentation was lost; the loss is
                # reported after every sample here -- confirm against the
                # original file.
                print('Loss ' + str(self.getLoss(train_data_index)))

    def test(self, test_data):
        """Return the predicted class of every column of ``test_data``.

        ``test_data`` is a 2-D array with one sample per column.
        """
        test_class_list = []
        for index in range(test_data.shape[1]):
            self.forwardPropogation(test_data[:, [index]])
            test_class_list.append(self.predictClass())
        return test_class_list
# Set up the XOR truth table (inputs, class labels) and the backpropagation network
train_data = (
    np.array([[0, 0], [0, 1], [1, 0], [1, 1]]),
    np.array([0, 1, 1, 0]),
)
b = Backpropogation(
    hidden_layer_neurons_tuple=(2, 2),
    train_data=train_data,
    num_output_classes=2,
)
The following code (check this for implementation and this for the theory) implements a neural net with backpropagation from scratch, using a single output unit with sigmoid activation (otherwise it looks similar to your implementation).
With it, the XOR function can be learnt given an appropriate learning rate and number of epochs (although training can sometimes get stuck in a local minimum; you could consider adding regularizers such as dropout). You can also convert it to a 2-output (softmax?) version like yours. Can you figure out the issue in your implementation? For example, you can look at the following pointers:
from sklearn.metrics import accuracy_score, mean_squared_error
class FFSNNetwork:
    """Feed-forward sigmoid network with one sigmoid output unit.

    Samples are row vectors. ``W[k]`` has shape ``(sizes[k-1], sizes[k])`` and
    ``B[k]`` has shape ``(1, sizes[k])`` for ``k = 1 .. nh + 1``.
    """

    def __init__(self, n_inputs, hidden_sizes=None):
        """Create the network with randomly initialised parameters.

        Args:
            n_inputs: Number of input features.
            hidden_sizes: Sequence with the size of each hidden layer.
                Defaults to a single hidden layer of 2 neurons.
        """
        # A None default avoids sharing one mutable list between instances.
        if hidden_sizes is None:
            hidden_sizes = [2]
        self.nx = n_inputs
        self.ny = 1  # number of neurons in the output layer
        self.nh = len(hidden_sizes)
        self.sizes = [self.nx] + list(hidden_sizes) + [self.ny]
        self.W = {}
        self.B = {}
        for i in range(self.nh + 1):
            self.W[i + 1] = np.random.rand(self.sizes[i], self.sizes[i + 1])
            self.B[i + 1] = np.random.rand(1, self.sizes[i + 1])

    def sigmoid(self, x):
        """Return the logistic sigmoid of ``x`` element-wise."""
        return 1.0 / (1.0 + np.exp(-x))

    def forward_pass(self, x):
        """Propagate one sample through the network and return the output activation.

        Pre-activations are cached in ``self.A`` and activations in ``self.H``
        (``H[0]`` is the input reshaped to a row vector).
        """
        self.A = {}
        self.H = {}
        self.H[0] = x.reshape(1, -1)
        for i in range(self.nh + 1):
            self.A[i + 1] = np.matmul(self.H[i], self.W[i + 1]) + self.B[i + 1]
            self.H[i + 1] = self.sigmoid(self.A[i + 1])
        return self.H[self.nh + 1]

    def grad_sigmoid(self, x):
        """Return the sigmoid derivative given the sigmoid *output* ``x``."""
        return x * (1 - x)

    def grad(self, x, y):
        """Compute the gradients for one sample ``(x, y)``.

        Runs a forward pass, then fills ``self.dW`` and ``self.dB`` (keyed by
        layer) together with the intermediate ``self.dH`` and ``self.dA``.
        """
        self.forward_pass(x)
        self.dW = {}
        self.dB = {}
        self.dH = {}
        self.dA = {}
        L = self.nh + 1
        # Output delta is (prediction - target).
        self.dA[L] = (self.H[L] - y)
        for k in range(L, 0, -1):
            self.dW[k] = np.matmul(self.H[k - 1].T, self.dA[k])
            self.dB[k] = self.dA[k]
            self.dH[k - 1] = np.matmul(self.dA[k], self.W[k].T)
            if k > 1:
                # H[0] is the raw input, not a sigmoid output, so there is no
                # pre-activation gradient to compute for layer 0.
                self.dA[k - 1] = np.multiply(self.dH[k - 1], self.grad_sigmoid(self.H[k - 1]))

    def fit(self, X, Y, epochs=1, learning_rate=1, initialize=True, verbose=True):
        """Train with full-batch gradient descent.

        Args:
            X: 2-D array with one sample per row.
            Y: Targets, one per row of ``X``.
            epochs: Number of passes over the data.
            learning_rate: Step size applied to the averaged gradient.
            initialize: When true, re-draw the weights from a standard normal
                distribution and zero the biases before training.
            verbose: When true, print the loss on ``X`` after every epoch.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        # Average over the number of samples (rows), not the number of
        # features (columns).
        m = X.shape[0]
        if m == 0:
            raise ValueError('X must contain at least one sample')

        if initialize:
            for i in range(self.nh + 1):
                self.W[i + 1] = np.random.randn(self.sizes[i], self.sizes[i + 1])
                self.B[i + 1] = np.zeros((1, self.sizes[i + 1]))

        for e in range(epochs):
            dW = {}
            dB = {}
            for i in range(self.nh + 1):
                dW[i + 1] = np.zeros((self.sizes[i], self.sizes[i + 1]))
                dB[i + 1] = np.zeros((1, self.sizes[i + 1]))
            # Accumulate the per-sample gradients over the whole batch.
            for x, y in zip(X, Y):
                self.grad(x, y)
                for i in range(self.nh + 1):
                    dW[i + 1] += self.dW[i + 1]
                    dB[i + 1] += self.dB[i + 1]
            for i in range(self.nh + 1):
                self.W[i + 1] -= learning_rate * dW[i + 1] / m
                self.B[i + 1] -= learning_rate * dB[i + 1] / m
            if verbose:
                Y_pred = self.predict(X)
                print('loss at epoch {} = {}'.format(e, mean_squared_error(Y_pred, Y)))

    def predict(self, X):
        """Return the network output for every row of ``X`` as a squeezed array."""
        return np.array([self.forward_pass(x) for x in X]).squeeze()
Now, train the network:
# XOR truth table: the four input pairs and their binary targets
X_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_train = np.array([0, 1, 1, 0])
# Network with two hidden layers of 2 neurons each, trained on the XOR data
ffsnn = FFSNNetwork(n_inputs=2, hidden_sizes=[2, 2])
ffsnn.fit(X_train, y_train, epochs=5000, learning_rate=0.15)
Next, predict with the network:
# Predicted probability P(y = 1) for every training point
y_pred_prob = ffsnn.predict(X_train)
# Hard class labels: threshold the probabilities at 0.5
y_pred = (y_pred_prob >= 0.5).ravel().astype(int)
X_train
# array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_train
# array([0, 1, 1, 0])
y_pred_prob
# array([0.00803102, 0.99439243, 0.99097831, 0.00664639])
y_pred
# array([0, 1, 1, 0])
accuracy_score(y_train, y_pred)
# 1.0
Note that the MSE between the true and predicted y values is used here to plot the loss function; you can plot the BCE (binary cross-entropy) loss function too.
Finally, the following animations show how the loss function is minimized and also how the decision boundary is learnt:
Note that in the above animation the green and red points represent the positive (label 1) and negative (label 0) training data points, respectively. Notice how they are separated by the decision boundaries during the final training epochs (darker region for the negative and lighter region for the positive data points of XOR).
You could implement the same with a high-level deep learning library such as `keras` in a few lines of code:
import tensorflow as tf
from tensorflow import keras
# XOR classifier with two hidden ReLU layers and one sigmoid output unit.
# `layers` was never imported; the Dense layer is reached through the
# already-imported `keras` module instead.
inputs = keras.Input(shape=(2,), name="in")
x = keras.layers.Dense(4, activation="relu", name="dense_1")(inputs)
x = keras.layers.Dense(4, activation="relu", name="dense_2")(x)
outputs = keras.layers.Dense(1, activation="sigmoid", name="out")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
# XOR truth table: inputs and binary targets
X_train, y_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), np.array([0, 1, 1, 0])
model.compile(
    optimizer=keras.optimizers.Adam(),  # Optimizer
    # Loss function to minimize
    loss=tf.keras.losses.BinaryCrossentropy(),
    # List of metrics to monitor
    metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
)
print("Fit model on training data")
# All four samples form a single batch, so every epoch is one gradient step.
history = model.fit(
    X_train,
    y_train,
    batch_size=4,
    epochs=1000)
# ...
# Epoch 371/1000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.5178 - accuracy: 0.7500
# Epoch 372/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5169 - accuracy: 0.7500
# Epoch 373/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5160 - accuracy: 1.0000
# Epoch 374/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5150 - accuracy: 1.0000
# ...
print("Evaluate")
results = model.evaluate(X_train, y_train, batch_size=4)
print("loss, acc:", results)
# loss, acc: [0.1260240525007248, 1.0]
The following figure shows the loss / accuracy during training epochs.
Finally, with `keras` and `softmax` (instead of `sigmoid`):
from keras.utils import to_categorical
# XOR classifier with a 2-unit softmax output trained on one-hot targets.
X_train, y_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), np.array([0, 1, 1, 0])
# Turn the integer class labels into one-hot rows for categorical cross-entropy.
y_train = to_categorical(y_train, num_classes=2)
# `layers` was never imported; the Dense layer is reached through the
# already-imported `keras` module instead.
inputs = keras.Input(shape=(2,), name="in")
x = keras.layers.Dense(4, activation="relu", name="dense_1")(inputs)
x = keras.layers.Dense(4, activation="relu", name="dense_2")(x)
outputs = keras.layers.Dense(2, activation="softmax", name="out")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc']
)
print("Fit model on training data")
history = model.fit(
    X_train,
    y_train,
    batch_size=4,
    epochs=2000)
# Epoch 663/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3893 - acc: 0.7500
# Epoch 664/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3888 - acc: 1.0000
# Epoch 665/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3878 - acc: 1.0000
print("Evaluate")
results = model.evaluate(X_train, y_train, batch_size=4)
print("loss, acc:", results)
# loss, acc: [0.014970880933105946, 1.0]
with the following loss / accuracy convergence: