Tags: python, tensorflow, machine-learning, computer-vision, mnist

MNIST ML output doesn't make sense


I'm new to ML and have tried to use this GitHub repository to build an MNIST machine learning model.

Since I have to import the dataset from my computer, I had to change things a bit. My imported dataset also doesn't include all 10 digits, only 5 of them.

The calculated accuracy is 96%; however, when I cross-check the .png files on my computer against the output .txt, the labels make no sense. It labels some 4s as 7s, some 2s as 5s, and so on.

This is what the folder structure looks like on my computer:

2
-->001.png
-->002.png
-->003.png
-->...
3
-->001.png
-->002.png
-->003.png
-->...
4
-->001.png
-->002.png
-->003.png
-->...
5
-->001.png
-->002.png
-->003.png
-->...
7
-->001.png
-->002.png
-->003.png
-->...

Question 1: I previously got an error that 8 different categories were expected, since 7 is the highest digit label. I didn't know how to fix this, so I renamed the folders from 0 to 4. Any idea how to fix this without having to rename all the folders?
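
One way to avoid the renaming, sketched here under the assumption that the folder names are the digit labels, is to map the folder names to contiguous class indices before one-hot encoding, so to_categorical always sees labels 0 to n_classes-1:

    import os

    data_path_training = r"Training_data"
    # folder names are the digit labels, e.g. ['2', '3', '4', '5', '7']
    folders = sorted(d for d in os.listdir(data_path_training)
                     if os.path.isdir(os.path.join(data_path_training, d)))
    class_index = {name: i for i, name in enumerate(folders)}

    # while collecting labels, append the index instead of the folder name:
    #     y.append(class_index[folder_name])
    # to_categorical(y) then produces exactly len(folders) columns,
    # and folders[i] recovers the original digit from a predicted index i.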

Question 2: Do you know why the output doesn't make any sense? It doesn't seem to be an overfitting issue; I've tried adjusting the train-test split, which didn't have any impact.

from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import time


#x, y = fetch_openml('mnist_784', version=1, return_X_y=True)
import os
from os import listdir
from os.path import isfile, join

import cv2

label_folder_training = []
label_files_training = []

total_size_training = 0
total_size_testing = 0
data_path_training = r"Training_data"
data_path_testing = r"Testing_data"


for root, dirs, files in os.walk(data_path_training):
    for dir in dirs:
        label_folder_training.append(dir)
    total_size_training += len(files)
    for file in files:
        label_files_training.append(file)

for root, dirs, files in os.walk(data_path_testing):
    total_size_testing += len(files)

# subtract 1 to ignore the .DS_Store file in the training folder
total_size_training = total_size_training - 1

print("found", total_size_training, "training files and", total_size_testing, "testing files.")
print("folder Training:",label_folder_training)

# Print returns the following:
#found 20000 training files and 5000 testing files.
#folder Training: ['0', '1', '4', '3', '2']
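
As an aside, a more robust way to ignore .DS_Store (a sketch of an alternative, not part of the original code) is to filter hidden files out during the walk instead of correcting the count afterwards:

    import os

    data_path_training = r"Training_data"
    total_size_training = 0

    for root, dirs, files in os.walk(data_path_training):
        # drop hidden files such as .DS_Store before counting
        files = [f for f in files if not f.startswith('.')]
        total_size_training += len(files)

    print("found", total_size_training, "training files.")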

x = []
y = []

for i in range(len(label_folder_training)):
    labelPath_training = os.path.join(data_path_training,label_folder_training[i])
    FileName = [f for f in listdir(labelPath_training) if isfile(join(labelPath_training, f))]

    for j in range(len(FileName)):
        path = os.path.join(labelPath_training,FileName[j])

        img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)

        x.append(img)
        y.append(label_folder_training[i])

x = np.array(x)
x = np.reshape(x, (-1, 784))    # flatten each 28x28 image into a 784-vector

x = (x / 255).astype('float32')
y = to_categorical(y)           # folder names double as integer class labels

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
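
Unrelated to the mislabeled output, but since adjusting the split came up: scikit-learn can keep the class proportions identical in the train and test parts via the stratify parameter. A sketch, assuming the integer labels are recovered from the one-hot matrix:

    import numpy as np
    from sklearn.model_selection import train_test_split

    y_int = np.argmax(y, axis=1)    # integer class labels from the one-hot matrix
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=42, stratify=y_int)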

import pandas as pd

class Module:
    def __init__(self):
        self._train = True

    def forward(self, input):
        raise NotImplementedError

    def backward(self, input, grad_output):
        raise NotImplementedError

    def parameters(self):
        """
        Returns list of its parameters
        """
        return []

    def grad_parameters(self):
        """
        Returns list of tensors gradients of its parameters
        """
        return []

    def train(self):
        self._train = True

    def eval(self):
        self._train = False

class Criterion:
    def forward(self, input, target):
        raise NotImplementedError

    def backward(self, input, target):
        raise NotImplementedError

class Linear(Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.W = np.random.randn(dim_in, dim_out)
        self.b = np.random.randn(1, dim_out)

    def forward(self, input):
        self.output = (np.dot(input, self.W) + self.b)
        return self.output

    def backward(self, input, grad_output):
        self.grad_b = np.mean(grad_output, axis=0)
        self.grad_W = np.dot(input.T, grad_output)

        self.grad_W /= input.shape[0]

        grad_input = np.dot(grad_output, self.W.T)

        return grad_input

    def parameters(self):
        return [self.W, self.b]

    def grad_parameters(self):
        return [self.grad_W, self.grad_b]

def softmax(xs):
    # subtract the row-wise max for numerical stability before exponentiating
    xs = np.subtract(xs, xs.max(axis=1, keepdims=True))
    xs = np.exp(xs) / np.sum(np.exp(xs), axis=1, keepdims=True)
    return xs


class CrossEntropy(Criterion):
    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        eps = 1e-9
        predictions = np.clip(input, eps, 1. - eps)
        N = predictions.shape[0]
        ce = -np.sum(target * np.log(predictions))
        return ce / N


    def backward(self, input, target):
        # joint softmax + cross-entropy gradient: softmax(z) - target.
        # Note that `input` is the model output, which has already passed
        # through the SoftMax layer, so softmax is effectively applied twice here.
        eps = 1e-9
        input_clamp = np.clip(input, eps, 1 - eps)
        return softmax(input_clamp) - target

class Sequential(Module):
    def __init__(self, *layers):
        super().__init__()
        self.layers = layers

    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)

        self.output = input
        return self.output

    def backward(self, input, grad_output):
        for i in range(len(self.layers) - 1, 0, -1):
            grad_output = self.layers[i].backward(self.layers[i-1].output, grad_output)

        grad_output = self.layers[0].backward(input, grad_output)

        return grad_output

    def parameters(self):
        res = []
        for l in self.layers:
            res += l.parameters()
        return res

    def grad_parameters(self):
        res = []
        for l in self.layers:
            res += l.grad_parameters()
        return res

    def train(self):
        for layer in self.layers:
            layer.train()

    def eval(self):
        for layer in self.layers:
            layer.eval()

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


class Sigmoid(Module):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        self.output = sigmoid(input)
        return self.output

    def backward(self, input, grad_output):
        grad_input = sigmoid(input) * (1 - sigmoid(input)) * grad_output
        return grad_input

class SoftMax(Module):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        self.output = np.subtract(input, input.max(axis=1, keepdims=True))
        self.output = np.exp(self.output) / np.sum(np.exp(self.output), axis=1, keepdims=True)
        return self.output

    def backward(self, input, grad_output):
        # pass-through: the softmax Jacobian is handled jointly in CrossEntropy.backward
        return grad_output

def DataLoader(X, Y, batch_size=32):
    n = X.shape[0]
    indices = np.arange(n)
    np.random.shuffle(indices)
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        batch_idx = indices[start:end]
        yield X[batch_idx], Y[batch_idx]

def accuracy_score(y_true, y_pred):
    a = np.argmax(y_true, axis=1)
    b = np.argmax(y_pred, axis=1)
    return np.count_nonzero(a == b) / y_true.shape[0]

class Adam:
    def __init__(self, model):
        self.prev_m = None
        self.prev_v = None
        self.model = model
        self.t = 1

    def step(self, lr, beta1, beta2):
        prev_m_tmp = []
        prev_v_tmp = []
        eps = 1e-7

        for i, (weights, gradient) in enumerate(zip(self.model.parameters(), self.model.grad_parameters())):
            if self.prev_m and self.prev_v:
                m = beta1 * self.prev_m[i] + (1 - beta1) * gradient
                v = beta2 * self.prev_v[i] + (1 - beta2) * gradient ** 2
            else:
                # first step: the moment estimates start from zero
                m = (1 - beta1) * gradient
                v = (1 - beta2) * gradient ** 2

            # bias-corrected first and second moment estimates
            m_hat = m / (1 - beta1 ** self.t)
            v_hat = v / (1 - beta2 ** self.t)

            weights -= lr * m_hat / (np.sqrt(v_hat) + eps)

            prev_m_tmp.append(m)
            prev_v_tmp.append(v)

        self.prev_m = prev_m_tmp
        self.prev_v = prev_v_tmp

        self.t += 1

model = Sequential(
    Linear(784, 512),
    Sigmoid(),

    Linear(512, 256),
    Sigmoid(),

    Linear(256, 128),
    Sigmoid(),

    Linear(128, 64),
    Sigmoid(),

    Linear(64, 5),

    SoftMax(),
)


epochs = 20
eval_every = 1
batch_size = 1024
criterion = CrossEntropy()
optimizer = Adam(model)

for epoch in range(epochs):
    for x, y in DataLoader(X_train, y_train, batch_size=batch_size):
        model.train()

        y_pred = model.forward(x)
        grad = criterion.backward(y_pred, y)
        model.backward(x, grad)

        optimizer.step(lr=0.003, beta1=0.9, beta2=0.999)

    if (epoch + 1) % eval_every == 0:
        model.eval()
        y_train_pred = model.forward(X_train)
        y_test_pred = model.forward(X_test)
        loss_train = criterion.forward(y_train_pred, y_train)
        loss_test = criterion.forward(y_test_pred, y_test)
        print(f'Epoch: {epoch + 1}/{epochs}')
        print(f'Train Loss: {loss_train} Train Accuracy: {accuracy_score(y_train, y_train_pred)}')
        print(f'Test Loss: {loss_test} Test Accuracy: {accuracy_score(y_test, y_test_pred)} \n')

# Returns the following in epoch 20/20:
# Epoch: 20/20
# Train Loss: 0.151567557756849 Train Accuracy: 0.9905
# Test Loss: 0.706321046620394 Test Accuracy: 0.9563333333333334 

test_x=[]
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]

for j in range(len(FileName)):
    path = os.path.join(data_path_testing,FileName[j])

    img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)

    test_x.append(img)
    
x_val = np.array(test_x)
x_val = np.reshape(x_val, (-1, 784))    # flatten each 28x28 image

x_val = (x_val / 255).astype('float32')

df_test = pd.DataFrame(x_val,columns=range(784)).add_prefix('pixels_')

output = model.forward(df_test)
output_arg = np.argmax(output, axis=1)
ImageId = df_test.index +1

submission = pd.DataFrame({'ImageId': ImageId, 'Label': output_arg})   # argmax class indices, not the raw probability matrix

submission['ImageId'] = submission['ImageId'].apply('{:0>4}'.format)
submission.to_csv('export.txt', sep=' ', index=False, header=False)
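
One more observation about the export (not from the original post): because the folders were renamed to 0-4, the Label column contains class indices rather than the original digits 2, 3, 4, 5 and 7; mapping back to the real digits requires the same renaming table, or a folder-to-index mapping like the class_index sketch above.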

Solution

  • Found the answer to my problem. The output didn't make any sense because Python was importing the testing files in an arbitrary order, so the predictions didn't line up with the file names. All I had to do was sort FileName before running the model.

    I changed this

    test_x=[]
    FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
    
    for j in range(len(FileName)):
        path = os.path.join(data_path_testing,FileName[j])
        img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
        test_x.append(img)
           
    

    to this:

    test_x=[]
    FileName = sorted(filter(lambda f: os.path.isfile(os.path.join(data_path_testing, f)),
                             os.listdir(data_path_testing)))
    
    for j in range(len(FileName)):
        path = os.path.join(data_path_testing,FileName[j])
        img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
        test_x.append(img)
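
    This works because the file names are zero-padded (001.png, 002.png, ...), so lexicographic order matches numeric order. If the names were not padded (1.png, 2.png, ..., 10.png), a natural sort would be needed; a small sketch:

    import re

    def natural_key(name):
        # split the name into digit and non-digit runs so '10.png' sorts after '2.png'
        return [int(t) if t.isdigit() else t for t in re.split(r'(\d+)', name)]

    FileName = sorted(FileName, key=natural_key)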