I'm new to ML and have tried to use this GitHub repository to build an MNIST Machine Learning Model.
Since I have to import the dataset from my computer, I had to change things a bit. My imported dataset also doesn't include all 10 digits, but only 5.
The calculated accuracy is 96%, however when I cross check the .png files on my computer with the outcome txt, the labels make zero sense. It labels some 4's as 7's, some 2's as 5's and so on.
This is what the folder structure looks like on my computer:
2
-->001.png
-->002.png
-->003.png
-->...
3
-->001.png
-->002.png
-->003.png
-->...
4
-->001.png
-->002.png
-->003.png
-->...
5
-->001.png
-->002.png
-->003.png
-->...
7
-->001.png
-->002.png
-->003.png
-->...
Question 1: I previously had the error that it expected 8 different categories since 7 is the highest digit label. I didn't know how to fix this, so I rename the folders from 0 to 4. Any idea how to fix this, without having to rename all folders?
Question 2: Do you know why the outcome doesn't make any sense? It doesn't seem to be an overfitting issue, I've tried adjusting the training-test split, which didn't have any impact.
from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import time
#x, y = fetch_openml('mnist_784', version=1, return_X_y=True)
import os
from os import listdir
from os.path import isfile, join
import cv2
label_folder_training = []
label_files_training = []
total_size_training = 0
total_size_testing = 0
data_path_training = r"Training_data"
data_path_testing = r"Testing_data"
for root, dirs, files in os.walk(data_path_training):
for dir in dirs:
label_folder_training.append(dir)
total_size_training += len(files)
for file in files:
label_files_training.append(file)
for root, dirs, files in os.walk(data_path_testing):
total_size_testing += len(files)
#to ignore .DS_Store file
total_size_training = total_size_training - 1
total_size_testing = total_size_testing
print("found", total_size_training, "training files and", total_size_testing, "testing files.")
print("folder Training:",label_folder_training)
# Print returns the following:
#found 20000 training files and 5000 testing files.
#folder Training: ['0', '1', '4', '3', '2']
x = []
y = []
for i in range(len(label_folder_training)):
labelPath_training = os.path.join(data_path_training,label_folder_training[i])
FileName = [f for f in listdir(labelPath_training) if isfile(join(labelPath_training, f))]
for j in range(len(FileName)):
path = os.path.join(labelPath_training,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
x.append(img)
y.append(label_folder_training[i])
x = np.array(x)
x = np.reshape(x, (20000, 784))
x = (x/255).astype('float32')
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
import pandas as pd
class Module:
def __init__(self):
self._train = True
def forward(self, input):
raise NotImplementedError
def backward(self, input, grad_output):
raise NotImplementedError
def parameters(self):
"""
Returns list of its parameters
"""
return []
def grad_parameters(self):
"""
Returns list of tensors gradients of its parameters
"""
return []
def train(self):
self._train = True
def eval(self):
self._train = False
class Criterion:
def forward(self, input, target):
raise NotImplementedError
def backward(self, input, target):
raise NotImplementedError
class Linear(Module):
def __init__(self, dim_in, dim_out):
super().__init__()
self.W = np.random.randn(dim_in, dim_out)
self.b = np.random.randn(1, dim_out)
def forward(self, input):
self.output = (np.dot(input, self.W) + self.b)
return self.output
def backward(self, input, grad_output):
self.grad_b = np.mean(grad_output, axis=0)
self.grad_W = np.dot(input.T, grad_output)
self.grad_W /= input.shape[0]
grad_input = np.dot(grad_output, self.W.T)
return grad_input
def parameters(self):
return [self.W, self.b]
def grad_parameters(self):
return [self.grad_W, self.grad_b]
def softmax(xs):
xs = np.subtract(xs, xs.max(axis=1, keepdims=True))
xs = np.exp(xs) / np.sum(np.exp(xs), axis=1, keepdims=True)
return xs
class CrossEntropy(Criterion):
def __init__(self):
super().__init__()
def forward(self, input, target):
eps = 1e-9
predictions = np.clip(input, eps, 1. - eps)
N = predictions.shape[0]
ce = -np.sum(target * np.log(predictions))
return ce / N
def backward(self, input, target):
eps = 1e-9
input_clamp = np.clip(input, eps, 1 - eps)
return softmax(input_clamp) - target
class Sequential(Module):
def __init__(self, *layers):
super().__init__()
self.layers = layers
def forward(self, input):
for layer in self.layers:
input = layer.forward(input)
self.output = input
return self.output
def backward(self, input, grad_output):
for i in range(len(self.layers) - 1, 0, -1):
grad_output = self.layers[i].backward(self.layers[i-1].output, grad_output)
grad_output = self.layers[0].backward(input, grad_output)
return grad_output
def parameters(self):
res = []
for l in self.layers:
res += l.parameters()
return res
def grad_parameters(self):
res = []
for l in self.layers:
res += l.grad_parameters()
return res
def train(self):
for layer in self.layers:
layer.train()
def eval(self):
for layer in self.layers:
layer.eval()
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class Sigmoid(Module):
def __init__(self):
super().__init__()
def forward(self, input):
self.output = sigmoid(input)
return self.output
def backward(self, input, grad_output):
grad_input = sigmoid(input) * (1 - sigmoid(input)) * grad_output
return grad_input
class SoftMax(Module):
def __init__(self):
super().__init__()
def forward(self, input):
self.output = np.subtract(input, input.max(axis=1, keepdims=True))
self.output = np.exp(self.output) / np.sum(np.exp(self.output), axis=1, keepdims=True)
return self.output
def backward(self, input, grad_output):
return grad_output
def DataLoader(X, Y, batch_size=32):
n = X.shape[0]
indices = np.arange(n)
np.random.shuffle(indices)
for start in range(0, n, batch_size):
end = min(start + batch_size, n)
batch_idx = indices[start:end]
yield X[batch_idx], Y[batch_idx]
def accuracy_score(y_true, y_pred):
a = np.argmax(y_true, axis=1)
b = np.argmax(y_pred, axis=1)
return np.count_nonzero(a == b) / y_true.shape[0]
class Adam:
def __init__(self, model):
self.prev_m = None
self.prev_v = None
self.model = model
self.t = 1
def step(self, lr, beta1, beta2):
prev_m_tmp = []
prev_v_tmp = []
eps = 1e-7
for i, (weights, gradient) in enumerate(zip(self.model.parameters(), self.model.grad_parameters())):
if self.prev_m and self.prev_v:
m = beta1 * self.prev_m[i] + (1 - beta1) * gradient
v = beta2 * self.prev_v[i] + (1 - beta2) * gradient ** 2
m_hat = m / (1 - beta1 ** self.t)
v_hat = v / (1 - beta2 ** self.t)
else:
m = beta1 * 0 + (1 - beta1) * gradient
v = beta2 * 0 + (1 - beta2) * gradient ** 2
m_hat = m / (1 - beta1 ** self.t)
v_hat = v / (1 - beta2 ** self.t)
weights -= lr * m_hat / (np.sqrt(v_hat) + eps)
prev_m_tmp.append(m)
prev_v_tmp.append(v)
self.prev_m = prev_m_tmp
self.prev_v = prev_v_tmp
self.t += 1
model = Sequential(
Linear(784, 512),
Sigmoid(),
Linear(512, 256),
Sigmoid(),
Linear(256, 128),
Sigmoid(),
Linear(128, 64),
Sigmoid(),
Linear(64, 5),
SoftMax(),
)
epochs = 20
eval_every = 1
batch_size = 1024
criterion = CrossEntropy()
optimizer = Adam(model)
for epoch in range(epochs):
for x, y in DataLoader(X_train, y_train, batch_size=batch_size):
model.train()
y_pred = model.forward(x)
grad = criterion.backward(y_pred, y)
model.backward(x, grad)
optimizer.step(lr=0.003, beta1=0.9, beta2=0.999)
if (epoch + 1) % eval_every == 0:
model.eval()
y_train_pred = model.forward(X_train)
y_test_pred = model.forward(X_test)
loss_train = criterion.forward(y_train_pred, y_train)
loss_test = criterion.forward(y_test_pred, y_test)
print(f'Epoch: {epoch + 1}/{epochs}')
print(f'Train Loss: {loss_train} Train Accuracy: {accuracy_score(y_train, y_train_pred)}')
print(f'Test Loss: {loss_test} Test Accuracy: {accuracy_score(y_test, y_test_pred)} \n')
# Returns the following in epoch 20/20:
# Epoch: 20/20
# Train Loss: 0.151567557756849 Train Accuracy: 0.9905
# Test Loss: 0.706321046620394 Test Accuracy: 0.9563333333333334
test_x=[]
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)
x_val = np.array(test_x)
x_val = np.reshape(x_val, (5000, 784))
x_val = (x_val/255).astype('float32')
df_test = pd.DataFrame(x_val,columns=range(784)).add_prefix('pixels_')
output = model.forward(df_test)
output_arg = np.argmax(output, axis=1)
ImageId = df_test.index +1
submission = pd.DataFrame({'ImageId': ImageId, 'Label': output})
submission['ImageId'] = submission['ImageId'].apply('{:0>4}'.format)
submission.to_csv('export.txt', sep=' ', index=False, header=False)
Found the answer to my problem. The output didn't make any sense since python was importing the testing files in a random order. All I had to do was to sort FileName
before letting the model run.
I changed this
test_x=[]
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)
to this:
test_x=[]
FileName = sorted( filter( lambda x: os.path.isfile(os.path.join(data_path_testing, x)),
os.listdir(data_path_testing) ) )
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)