Tags: python, numpy, neural-network, mnist

MNIST Neural Network not learning - Michael Nielsen Example


I have been trying to code up the neural network for recognizing MNIST digits that Michael Nielsen gives here: http://neuralnetworksanddeeplearning.com/chap1.html

The original was written for Python 2.7, I believe; I'm using Python 3. The network does run through the training examples and update the weights and biases, but it doesn't learn, and it gets only around 10% of the test examples right (no better than random guessing).

I have also tried simply copying the code from the site and running it in Python 2.7, and it works as it should (getting up to 95% accuracy). The only significant differences between the two nets are the data set (I'm using the one downloaded directly from the MNIST site two days ago) and the two places where I switched np.dot to np.outer, just to make it easier to keep track of the array shapes (I stuck to (N,) instead of (N,1)). That part seems to be fine, though, since the layer sizes differ and the multiplications go through without shape errors. I am also using the same learning rate and layer sizes as given in the example.
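
To illustrate the shape convention: with 1-D arrays, np.outer(delta, a_prev) produces the same (outputs, inputs) gradient matrix that np.dot produces when the vectors are kept as (N,1) columns. A small standalone sketch (made-up layer sizes, not code from the network below):

    import numpy as np

    delta = np.random.randn(30)     # error term of a layer with 30 neurons, shape (30,)
    a_prev = np.random.randn(784)   # activations of the previous layer, shape (784,)

    g1 = np.outer(delta, a_prev)                               # shape (30, 784)
    g2 = np.dot(delta.reshape(30, 1), a_prev.reshape(1, 784))  # same values via (N,1)/(1,N) vectors

    assert g1.shape == (30, 784)
    assert np.allclose(g1, g2)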

I cannot see what could be throwing the net off. If anyone has tried doing the same, or has some insight into this, I would greatly appreciate it.

Thanks!

The code:

import matplotlib.pyplot as plt
import numpy as np
import idx2numpy

import random



### LOAD DATASET ###


train = idx2numpy.convert_from_file("mnist/train-images.idx3-ubyte")

train_labels = idx2numpy.convert_from_file("mnist/train-labels.idx1-ubyte")

test = idx2numpy.convert_from_file("mnist/t10k-images.idx3-ubyte")

test_labels = idx2numpy.convert_from_file("mnist/t10k-labels.idx1-ubyte")


def vectorize(x):
    e = np.zeros(10)
    e[x] = 1.0
    return e
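# e.g. vectorize(3) -> array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])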

training_images = [np.reshape(i, 784)/255 for i in train]
training_labels = [vectorize(i) for i in train_labels]
training_set = list(zip(training_images, training_labels))

test_images = [np.reshape(i, 784)/255 for i in test]
test_set = list(zip(training_images, test_labels))


### NETWORK CLASS ###


class myNet():

    def __init__(self, sizes):

        self.sizes = sizes

        self.N = len(sizes)

        # one weight matrix of shape (outputs, inputs) and one 1-D bias vector per layer
        self.w = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

        self.b = [np.random.randn(i) for i in sizes[1:]]

    def sigmoid(self, z):

        return 1.0/(1.0 + np.exp(-z))

    def sigmoid_prime(self, z):

        return self.sigmoid(z)*(1 - self.sigmoid(z))

    def cost_derivative(self, output_activations, y):

        return (output_activations - y)

    def feedforward(self, a):

        for bb, ww in zip(self.b, self.w):
            a = self.sigmoid(np.dot(ww, a) + bb)

        return a


    def backprop(self, x, y):

        nabla_b = [np.zeros(bb.shape) for bb in self.b]
        nabla_w = [np.zeros(ww.shape) for ww in self.w]

        activation = x

        activations = [x]  # list to store all the activations, layer by layer
        zs = []  # list to store all the z vectors, layer by layer

        for bb, ww in zip(self.b, self.w):
            z = np.dot(ww, activation) + bb

            zs.append(z)
            activation = self.sigmoid(z)
            activations.append(activation)

        # backward pass
        delta = self.cost_derivative(activations[-1], y) * self.sigmoid_prime(zs[-1])

        nabla_b[-1] = delta
        nabla_w[-1] = np.outer(delta, activations[-2])

        # Note that the variable l in the loop below counts layers from the end:
        # l = 2 is the second-to-last layer of neurons, l = 3 the one before it, and so on.
        for l in range(2, self.N):

            z = zs[-l]
            sp = self.sigmoid_prime(z)
            delta = np.dot(self.w[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.outer(delta, activations[-l-1])

        return (nabla_b, nabla_w)

    def update(self, mini_batch, eta):

        nabla_b = [np.zeros(bb.shape) for bb in self.b]
        nabla_w = [np.zeros(ww.shape) for ww in self.w]

        for x, y in mini_batch:

            delta_nabla_b, delta_nabla_w = self.backprop(x, y)

            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

        self.w = [ww - (eta/len(mini_batch))*nw
                  for ww, nw in zip(self.w, nabla_w)]
        self.b = [bb - (eta/len(mini_batch))*nb
                  for bb, nb in zip(self.b, nabla_b)]

        return

    
    def gradient_descent(self, training_data, epochs, mini_batch_size, eta, test_data):

        n = len(training_data)

        for j in range(epochs):

            random.shuffle(training_data)

            mini_batches = [
                training_data[k:k + mini_batch_size]
                for k in range(0, n, mini_batch_size)]

            for mini_batch in mini_batches:
                self.update(mini_batch, eta)

            # report the number of correctly classified test examples after each epoch
            print("Epoch {0}: {1}".format(j, self.evaluate(test_data)))

        return

    def evaluate(self, test_data):

        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]

        return sum(int(x == y) for (x, y) in test_results)
    

sizes = [28*28, 30, 10]

net = myNet(sizes)

net.gradient_descent(training_set, 30, 10, 3.0, test_set)

Solution

  • I found the mistake: I had zipped up the training images with the test labels to form the test set, which is clearly not what it should be. Now that the test set is formed properly, everything works and gets up to around 95% accuracy. Here, for completeness, is the fix and the full corrected code (runs on Python 3).
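
    The one-line difference, using the variable names from the question's listing:

    # wrong: pairs the training images with the test labels
    test_set = list(zip(training_images, test_labels))
    # right: pairs the test images with their own labels
    test_set = list(zip(test_images, test_labels))

    The full corrected listing: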

    import matplotlib.pyplot as plt
    import numpy as np
    import idx2numpy
    
    import random
    
    
    def vectorize(x):
        e = np.zeros(10)
        e[x] = 1.0
        return e
    
    
    ### LOAD DATASET ###
    
    
    train_images = idx2numpy.convert_from_file("mnist/train-images.idx3-ubyte")/255
    train_labels = idx2numpy.convert_from_file("mnist/train-labels.idx1-ubyte")

    train_images = [np.reshape(x, 784).astype('float32') for x in train_images]
    train_labels = [vectorize(i) for i in train_labels]

    test_images = idx2numpy.convert_from_file("mnist/t10k-images.idx3-ubyte")/255
    test_labels = idx2numpy.convert_from_file("mnist/t10k-labels.idx1-ubyte")

    test_images = [np.reshape(x, 784).astype('float32') for x in test_images]


    training_set = list(zip(train_images, train_labels))

    test_set = list(zip(test_images, test_labels))  ## THIS IS WHERE I MESSED UP
    
    
    ### NETWORK CLASS ###
    
    
    class myNet():

        def __init__(self, sizes):

            self.sizes = sizes

            self.N = len(sizes)

            # one weight matrix of shape (outputs, inputs) and one 1-D bias vector per layer
            self.w = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

            self.b = [np.random.randn(i) for i in sizes[1:]]

        def sigmoid(self, z):

            return 1.0/(1.0 + np.exp(-z))

        def sigmoid_prime(self, z):

            return self.sigmoid(z)*(1 - self.sigmoid(z))

        def cost_derivative(self, output_activations, y):

            return (output_activations - y)

        def feedforward(self, a):

            for bb, ww in zip(self.b, self.w):
                a = self.sigmoid(np.dot(ww, a) + bb)

            return a
    
    
        def backprop(self, x, y):

            nabla_b = [np.zeros(bb.shape) for bb in self.b]
            nabla_w = [np.zeros(ww.shape) for ww in self.w]

            activation = x

            activations = [x]  # list to store all the activations, layer by layer
            zs = []  # list to store all the z vectors, layer by layer

            for bb, ww in zip(self.b, self.w):
                z = np.dot(ww, activation) + bb

                zs.append(z)
                activation = self.sigmoid(z)
                activations.append(activation)

            # backward pass
            delta = self.cost_derivative(activations[-1], y) * self.sigmoid_prime(zs[-1])

            nabla_b[-1] = delta
            nabla_w[-1] = np.outer(delta, activations[-2])

            # Note that the variable l in the loop below counts layers from the end:
            # l = 2 is the second-to-last layer of neurons, l = 3 the one before it, and so on.
            for l in range(2, self.N):

                z = zs[-l]
                sp = self.sigmoid_prime(z)
                delta = np.dot(self.w[-l+1].transpose(), delta) * sp
                nabla_b[-l] = delta
                nabla_w[-l] = np.outer(delta, activations[-l-1])

            return (nabla_b, nabla_w)
    
        def update(self, mini_batch, eta):

            nabla_b = [np.zeros(bb.shape) for bb in self.b]
            nabla_w = [np.zeros(ww.shape) for ww in self.w]

            for x, y in mini_batch:

                delta_nabla_b, delta_nabla_w = self.backprop(x, y)

                nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
                nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

            self.w = [ww - (eta/len(mini_batch))*nw
                      for ww, nw in zip(self.w, nabla_w)]
            self.b = [bb - (eta/len(mini_batch))*nb
                      for bb, nb in zip(self.b, nabla_b)]

            return
    
        
        def gradient_descent(self, training_data, epochs, mini_batch_size, eta, test_data):

            n = len(training_data)

            for j in range(epochs):

                random.shuffle(training_data)

                mini_batches = [
                    training_data[k:k + mini_batch_size]
                    for k in range(0, n, mini_batch_size)]

                for mini_batch in mini_batches:
                    self.update(mini_batch, eta)

                # report the number of correctly classified test examples after each epoch
                print("Epoch {0}: {1}".format(j, self.evaluate(test_data)))

            return

        def evaluate(self, test_data):

            test_results = [(np.argmax(self.feedforward(x)), y)
                            for (x, y) in test_data]

            return sum(int(x == y) for (x, y) in test_results)
        
    
    sizes = [28*28, 30, 10]

    net = myNet(sizes)

    net.gradient_descent(training_set, 30, 10, 3.0, test_set)