Tags: python, machine-learning, neural-network, theano

MLP classifier in Theano settles at a local minimum


I wrote an MLP classifier using Theano. The training function, which uses the backpropagation algorithm, is as follows:

self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
self.layers=network.layers
self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
prediction=T.dmatrix()
output=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(prediction,output).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)

I tried to train the classifier on the XOR problem. The implementation is:

network=FeedForwardNetwork([2,2,2])
network.initialize()
network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
print network.predict(numpy.array([[1.,0.]]))
print network.predict(numpy.array([[0.,0.]]))

The initialize() method just compiles all the functions in the backend, i.e. the backpropagation function, a forward-pass function for calculating the predictions, and a few other Theano functions. Now, when I run this code, the training settles at a local minimum.

0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056

At the start of training, the loss was about 0.92. It steadily decreased to the value above and then stopped there. I tried changing the values of alpha and the momentum. What am I doing wrong?
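For reference, the plateau value is exactly ln 2 ≈ 0.6931, the mean categorical cross-entropy of a network that predicts [0.5, 0.5] for every example, which suggests the output has collapsed to a uniform softmax. A quick check:

import numpy

# cross-entropy of a constant [0.5, 0.5] prediction against a one-hot label
print(-numpy.log(0.5))   # 0.69314718056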

P.S. The whole code is here: networks.py

import theano
import theano.tensor as T
import numpy
from layers import *
from backend import NetworkBackend

class Network:

    def __init__(self,architecture):
        self.architecture=architecture
        self.layers=[]
        self.weights=[]
        self.bias=[]

    def __str__(self):
        banner=''
        for i in range(len(self.weights)):
            banner+=str(self.weights[i])+'\n'
            banner+=str(self.bias[i])+'\n'
        return banner

class FeedForwardNetwork(Network):

    def initialize(self):
        self.layers.append(InputLayer(units=self.architecture[0]))
        for i in range(1,len(self.architecture[:-1])):
            self.layers.append(SigmoidLayer(units=self.architecture[i]))
        self.layers.append(SoftmaxLayer(units=self.architecture[-1]))
        self.backend=NetworkBackend(self)

    def predict(self,inputs):
        return self.backend.activate(inputs)

    def train(self,X,y,alpha=100,reg_lambda=0.0001,epochs=10000,momentum=0.9):
        cost=1
        while cost>0.01 and epochs:
            prediction=self.predict(X)
            cost=self.backend.backprop(prediction,y,reg_lambda,alpha,momentum)
            print cost
            epochs-=1


if __name__=='__main__':
    network=FeedForwardNetwork([2,2,2])
    network.initialize()
    network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
    print network.predict(numpy.array([[1.,0.]]))
    print network.predict(numpy.array([[0.,0.]]))

layers.py

import theano
import theano.tensor as T
import scipy
from backend import ComputationBackend

class Layer:

    def __init__(self,units):
        self.units=units
        self.backend=ComputationBackend()

    def __str__(self):
        banner=self.__class__.__name__
        banner+=" Units:%d"%self.units
        return banner

class SigmoidLayer(Layer):

    def forwardPass(self,inputs):
        return self.backend.sigmoid(inputs)


class InputLayer(Layer):

    def forwardPass(self,inputs):
        return inputs

class SoftmaxLayer(Layer):

    def forwardPass(self,inputs):
        return self.backend.softmax(inputs)

backend.py

import theano
import theano.tensor as T
import numpy

class NetworkBackend:

    def __init__(self,network):

        # initialize shared variables
        self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
        self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
        self.layers=network.layers
        self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]

        # activation for network layers
        inputs=T.dmatrix()
        temp=self.layers[0].forwardPass(inputs)
        for i in range(1,len(self.layers[:-1])):
            temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
        output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
        self.activate=theano.function([inputs],output)

        prediction=T.dmatrix()
        output=T.dmatrix()
        reg_lambda=T.dscalar()
        alpha=T.dscalar()
        momentum=T.dscalar()
        cost=T.nnet.categorical_crossentropy(prediction,output).mean()
        for i,j in zip(self.weights,self.bias):
            cost+=T.sum(i**2)*reg_lambda
            cost+=T.sum(j**2)*reg_lambda
        parameters=self.weights+self.bias
        rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
        updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
        self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)


class ComputationBackend:

    def __init__(self):

        # sigmoid activation
        self.sigmoid=T.nnet.sigmoid

        # softmax activation
        self.softmax=T.nnet.softmax

Solution

  • Finally figured it out! In the NetworkBackend, while calculating the cost, I was computing the cross-entropy between the expected outputs and the predictions passed as arguments to the Theano function, instead of using the predictions computed by the activate graph. As a result, the Theano graph does not contain the forward pass, so theano.tensor.grad only finds the gradient of the regularization term and not of the actual cost function. So the proper implementation should be:

    # activation for network layers (same as before)
    inputs=T.dmatrix()
    temp=self.layers[0].forwardPass(inputs)
    for i in range(1,len(self.layers[:-1])):
        temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
    output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
    self.activate=theano.function([inputs],output)

    # cost is now built from the symbolic output of the forward pass,
    # not from a separate prediction matrix passed in as data
    label=T.dmatrix()
    reg_lambda=T.dscalar()
    alpha=T.dscalar()
    momentum=T.dscalar()
    cost=T.nnet.categorical_crossentropy(output,label).mean()
    for i,j in zip(self.weights,self.bias):
        cost+=T.sum(i**2)*reg_lambda
        cost+=T.sum(j**2)*reg_lambda
    parameters=self.weights+self.bias
    rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
    updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
    self.backprop=theano.function([inputs,label,reg_lambda,alpha,momentum],cost,updates=updates)


    So instead of declaring a new matrix for the predictions, I take the inputs and compute the predictions inside the training function, using the same expressions as the activation function. This completes the Theano graph, and theano.tensor.grad() now calculates the gradient of the cost function as well as the regularization.
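
    With this change, the train() method in networks.py no longer needs to run a forward pass itself before each update; a minimal sketch of the adapted loop, assuming the corrected backprop signature above:

    def train(self,X,y,alpha=100,reg_lambda=0.0001,epochs=10000,momentum=0.9):
        cost=1
        while cost>0.01 and epochs:
            # backprop now takes the raw inputs and labels; the forward pass
            # is part of the compiled graph, so predict() is not needed here
            cost=self.backend.backprop(X,y,reg_lambda,alpha,momentum)
            print cost
            epochs-=1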