I wrote an MLP classifier using Theano. The training function, which uses the back propagation algorithm, is as follows:
self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
self.layers=network.layers
self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
prediction=T.dmatrix()
output=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(prediction,output).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)
I tried to train the classifier on the XOR problem. The implementation is:
network=FeedForwardNetwork([2,2,2])
network.initialize()
network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),
              numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),
              alpha=0.01,epochs=1000000000000000,momentum=0.9)
print network.predict(numpy.array([[1.,0.]]))
print network.predict(numpy.array([[0.,0.]]))
The initialize() method just compiles all the functions at the backend, i.e. the back propagation function, a forward pass function for calculating the predictions, and a few other Theano functions. Now, when I run this code, the training settles at a local minimum.
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
At the start of training, the loss was about 0.92. It steadily decreased to the above value and stopped there. I tried changing the values of alpha and the momentum. What am I doing wrong?
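Note that 0.69314718056 is exactly ln 2, the cross entropy you get when the network predicts a uniform 0.5/0.5 distribution for every sample, i.e. it ignores the inputs entirely. A quick sanity check with plain numpy (a minimal sketch, independent of the network code):

import numpy
# cross entropy of a constant 0.5/0.5 prediction against any one-hot target
print -numpy.log(0.5)   # 0.69314718056, exactly the plateau above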
P.S. The whole code is here:
networks.py
import theano
import theano.tensor as T
import numpy
from layers import *
from backend import NetworkBackend
class Network:
    def __init__(self,architecture):
        self.architecture=architecture
        self.layers=[]
        self.weights=[]
        self.bias=[]

    def __str__(self):
        banner=''
        for i in range(len(self.weights)):
            banner+=str(self.weights[i])+'\n'
            banner+=str(self.bias[i])+'\n'
        return banner

class FeedForwardNetwork(Network):
    def initialize(self):
        self.layers.append(InputLayer(units=self.architecture[0]))
        for i in range(1,len(self.architecture[:-1])):
            self.layers.append(SigmoidLayer(units=self.architecture[i]))
        self.layers.append(SoftmaxLayer(units=self.architecture[-1]))
        self.backend=NetworkBackend(self)

    def predict(self,inputs):
        return self.backend.activate(inputs)

    def train(self,X,y,alpha=100,reg_lambda=0.0001,epochs=10000,momentum=0.9):
        cost=1
        while cost>0.01 and epochs:
            prediction=self.predict(X)
            cost=self.backend.backprop(prediction,y,reg_lambda,alpha,momentum)
            print cost
            epochs-=1

if __name__=='__main__':
    network=FeedForwardNetwork([2,2,2])
    network.initialize()
    network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),
                  numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),
                  alpha=0.01,epochs=1000000000000000,momentum=0.9)
    print network.predict(numpy.array([[1.,0.]]))
    print network.predict(numpy.array([[0.,0.]]))
layers.py
import theano
import theano.tensor as T
import scipy
from backend import ComputationBackend
class Layer:
    def __init__(self,units):
        self.units=units
        self.backend=ComputationBackend()

    def __str__(self):
        banner=self.__class__.__name__
        banner+=" Units:%d"%self.units
        return banner

class SigmoidLayer(Layer):
    def forwardPass(self,inputs):
        return self.backend.sigmoid(inputs)

class InputLayer(Layer):
    def forwardPass(self,inputs):
        return inputs

class SoftmaxLayer(Layer):
    def forwardPass(self,inputs):
        return self.backend.softmax(inputs)
backend.py
import theano
import theano.tensor as T
import numpy
class NetworkBackend:
    def __init__(self,network):
        # initialize shared variables
        self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
        self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
        self.layers=network.layers
        self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]

        # activation for network layers
        inputs=T.dmatrix()
        temp=self.layers[0].forwardPass(inputs)
        for i in range(1,len(self.layers[:-1])):
            temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
        output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
        self.activate=theano.function([inputs],output)

        prediction=T.dmatrix()
        output=T.dmatrix()
        reg_lambda=T.dscalar()
        alpha=T.dscalar()
        momentum=T.dscalar()
        cost=T.nnet.categorical_crossentropy(prediction,output).mean()
        for i,j in zip(self.weights,self.bias):
            cost+=T.sum(i**2)*reg_lambda
            cost+=T.sum(j**2)*reg_lambda
        parameters=self.weights+self.bias
        rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
        updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
        self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)

class ComputationBackend:
    def __init__(self):
        # sigmoid activation
        self.sigmoid=T.nnet.sigmoid
        # softmax activation
        self.softmax=T.nnet.softmax
Finally figured it out! In the NetworkBackend, while calculating the cost, I was computing the cross entropy between the expected outputs and the predictions passed as arguments to the Theano function, instead of using the predictions computed by the activation graph. Thus the Theano graph does not contain the forward pass. Consequently, theano.tensor.grad only finds the gradient of the regularization term and not of the actual cost function! So the proper implementation should be:
inputs=T.dmatrix()
temp=self.layers[0].forwardPass(inputs)
for i in range(1,len(self.layers[:-1])):
    temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
self.activate=theano.function([inputs],output)

label=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(output,label).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([inputs,label,reg_lambda,alpha,momentum],cost,updates=updates)
So instead of declaring a new matrix for the predictions, I take the inputs and compute the predictions in the training function, using the same expressions as in the activation function. This completes the Theano graph, and theano.tensor.grad() now calculates the gradient of the cost function as well as of the regularization.
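Since the compiled backprop function now takes the raw inputs as its first argument, the train() method in networks.py has to feed X instead of the precomputed predictions. A minimal sketch of the adjusted loop, assuming the rest of FeedForwardNetwork stays as above:

def train(self,X,y,alpha=100,reg_lambda=0.0001,epochs=10000,momentum=0.9):
    cost=1
    while cost>0.01 and epochs:
        # the forward pass is now part of the cost graph, so pass the
        # inputs directly instead of self.predict(X)
        cost=self.backend.backprop(X,y,reg_lambda,alpha,momentum)
        print cost
        epochs-=1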