I'm trying to understand back-propagation, so I wrote some Python code, but it's not working properly. When I train it on the XOR input-output pairs the error does not converge, but if I change the value of the last XOR output it does converge.
Also, if I set some target output values > 1, the error converges toward target - 1, which does not seem right.
import numpy as np
import random

class neural_network():
    activation = []  # List with the activation values of each layer
    weightsIn = []
    weightsOut = []

    def __init__(self, sizeOfLayers):
        '''
        sizeOfLayers: Tuple with the number of neurons in each layer
        (in, hidden, out)
        '''
        if len(sizeOfLayers) > 3:
            raise ValueError('Wrong number of layers')
        self.sizeOfLayers = sizeOfLayers
        for i in range(len(sizeOfLayers)):
            if i == 0:
                # input layer + bias
                self.activation.append(sizeOfLayers[i]*[0.0] + [0.0])
            else:
                self.activation.append(sizeOfLayers[i]*[0.0])
        # Wi = len(Hid) x len(IN)+1(bias)
        self.weightsIn = np.random.random((sizeOfLayers[1], sizeOfLayers[0] + 1))
        # Wo = len(OUT) x len(Hid)
        self.weightsOut = np.random.random((sizeOfLayers[2], sizeOfLayers[1]))

    def forward(self, X):
        '''
        X: input vector
        '''
        # input + bias appended to the activation vector
        self.activation[0] = np.vstack((np.array([X]).T, np.array([1])))
        # sum of (weights x in)
        self.sumHidden = self.weightsIn.dot(self.activation[0])
        # activation of hidden layer
        self.activation[1] = (self.sigmoid(self.sumHidden))
        # sum of (out weights x activation of last layer)
        self.sumOut = self.weightsOut.dot(self.activation[1])
        # activation of output
        self.activation[2] = (self.sigmoid(self.sumOut))
        return self.activation[2].T

    def backPropagate(self, Y, trainRate = 0.1):
        '''
        Y: target output
        trainRate: learning rate
        '''
        if len(Y) != self.sizeOfLayers[2]:
            raise ValueError('Wrong number of inputs')
        # output delta
        error_o = Y.T - self.activation[2].T
        out_delta = self.sigmoidPrime(self.activation[2]) * error_o.T
        # hidden delta
        error_h = out_delta.T.dot(self.weightsOut)
        hiden_delta = self.sigmoidPrime(self.activation[1]) * error_h.T
        # update output weights
        change_o = self.activation[1] * out_delta.T
        for i in range(self.sizeOfLayers[2]):
            for j in range(self.sizeOfLayers[1]):
                self.weightsOut[i][j] = self.weightsOut[i][j] + trainRate*change_o[j][i]
        # update input weights
        change_h = self.activation[0] * hiden_delta.T
        for i in range(self.sizeOfLayers[1]):
            for j in range(self.sizeOfLayers[0]):
                self.weightsIn[i][j] = self.weightsIn[i][j] + trainRate*change_h[j][i]
        # error
        return np.sum((Y.T - self.activation[2].T)**2)/0.5

    def sigmoid(self, z, derv = False):
        if derv == False:
            return 1/(1+np.exp(-z))

    def sigmoidPrime(self, z):
        return self.sigmoid(z)*(1-self.sigmoid(z))

    def train(self, target, trainRate = 0.001, it = 50000):
        for i in range(it):
            error = 0.0
            for t in target:
                inputs = np.array(t[0])
                targets = np.array([t[1]])
                self.forward(inputs)
                error = error + self.backPropagate(targets, trainRate)

nn = neural_network((2,6,1))
xor = [
    [[0,0], [0]],
    [[0,1], [1]],
    [[1,0], [1]],
    [[1,1], [0]]  # If I change this to 1 it converges
]
nn.train(xor)
Edit: I made the modifications Diego Stéfano suggested (thank you, Diego), but the error still does not converge.
import numpy as np
import math
import random
from scipy.special import expit
from sklearn.preprocessing import normalize

class neural_network(object):
    activation = []
    weightsIn = []
    weightsOut = []

    def __init__(self, sizeOfLayers):
        '''
        sizeOfLayers: Tuple with the number of neurons in each layer
        (in, hidden, out)
        '''
        self.sizeOfLayers = sizeOfLayers
        for i in range(len(sizeOfLayers)):
            self.activation.append(sizeOfLayers[i]*[0.0] + [0.0])
        self.weightsIn = np.random.normal(scale=0.1, size=(sizeOfLayers[1], sizeOfLayers[0] + 1))
        self.weightsOut = np.random.normal(scale=0.1, size=(sizeOfLayers[2], sizeOfLayers[1] + 1))

    def forward(self, X):
        '''
        X: input vector
        '''
        # input + bias appended to the activation vector
        self.activation[0] = np.vstack((np.array([X]).T, np.array([1])))
        # sum of (weights x in)
        self.sumHidden = self.weightsIn.dot(self.activation[0])
        # hidden activation + bias appended to the activation vector
        self.activation[1] = np.vstack((expit(self.sumHidden), np.array([1])))
        # sum of (out weights x activation of last layer)
        self.sumOut = self.weightsOut.dot(self.activation[1])
        # activation of output
        self.activation[2] = (expit(self.sumOut))
        return self.activation[2].T

    def backPropagate(self, X, Y, trainRate = 0.1):
        self.forward(X)
        # output delta
        error_o = Y - self.activation[2].T
        out_delta = self.sigmoidPrime(self.activation[2]) * error_o.T
        # hidden delta
        error_h = out_delta.T.dot(self.weightsOut)
        hiden_delta = self.sigmoidPrime(self.activation[1]) * error_h.T
        # update output weights
        change_o = self.activation[1] * np.transpose(out_delta)
        self.weightsOut = self.weightsOut + trainRate*change_o.T
        # update hidden weights
        change_h = self.activation[0].dot(hiden_delta[:-1].T)
        self.weightsIn = self.weightsIn + trainRate*change_h.T
        # error
        return np.sum((Y - self.activation[2].T)**2)*0.5

    def train(self, input_list, epochs):
        for epoch in range(epochs):
            ErrAcc = 0.0
            for inputs, targets in input_list:
                Err = self.backPropagate(np.array(inputs), np.array(targets), 0.2)
                ErrAcc = ErrAcc + Err
            if epoch % 1000 == 0:
                print('Epoch = %d, ErrAcc = %f' % (epoch, ErrAcc))

    def sigmoidPrime(self, x):
        return expit(x)*(1-expit(x))

nn = neural_network((2,10,1))
xor = [
    [[0,0], [0]],
    [[0,1], [1]],
    [[1,0], [1]],
    [[1,1], [0]]  # If I change this to 1 it converges
]
nn.train(xor, 300000)
Here are the modifications I made to your code that got it working:
Add biases to the output neurons too. Every neuron in the network should have one, since the bias detaches the activation threshold from the origin and consequently shifts your activation function left or right, greatly improving the chances of successful learning.
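For intuition, here is a minimal standalone sketch (my own illustration, not part of your network) of how a bias term shifts a single sigmoid neuron:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

x = 0.0
w = 1.0
print(sigmoid(w*x))        # 0.5: without a bias the curve is centered at the origin
print(sigmoid(w*x + 2.0))  # ~0.88: a bias of 2 shifts the curve, moving the decision threshold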
Instead of using np.random.random, which generates numbers in the interval [0.0, 1.0), initialize the weights with np.random.uniform so they are drawn uniformly from [-1.0, 1.0).
Center your input space around the origin (i.e., subtract the mean) and normalize it, as in the sketch below.
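As a minimal sketch of that preprocessing step (my own illustration, applied to the XOR inputs), centering and scaling turns the 0/1 inputs into the -1/1 inputs used in the working code further down:

import numpy as np

X = np.array([[0.0, 0.0],
              [0.0, 1.0],
              [1.0, 0.0],
              [1.0, 1.0]])
X_centered = X - X.mean(axis=0)                         # remove the mean: values become -0.5 / 0.5
X_scaled = X_centered / np.abs(X_centered).max(axis=0)  # normalize: values become -1.0 / 1.0
print(X_scaled)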
Here's how your initialization should look:
for i in range(len(sizeOfLayers)):
    self.activation.append(sizeOfLayers[i]*[0.0] + [0.0])

self.weightsIn = np.random.uniform(-1, 1, (sizeOfLayers[1], sizeOfLayers[0] + 1))
self.weightsOut = np.random.uniform(-1, 1, (sizeOfLayers[2], sizeOfLayers[1] + 1))
And then you will also have to append a 1 to activation in the forward function:
self.activation[1] = np.vstack((self.sigmoid(self.sumHidden), np.array([1])))
You may need to adjust the learning rate to make it work (about 0.5 worked for me). Also, your mean-squared error calculation is wrong: you should multiply by 0.5, not divide by it.
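For reference, a tiny standalone sketch of that error term (target and output are just made-up values here):

import numpy as np

target = np.array([1.0])
output = np.array([0.25])
error = 0.5 * np.sum((target - output)**2)  # multiply by 0.5; dividing by 0.5 doubles the error instead of halving it
print(error)  # 0.28125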
Here is your modified code:
import numpy as np
import random

class neural_network():
    activation = []  # List with the activation values of each layer
    weightsIn = []
    weightsOut = []

    def __init__(self, sizeOfLayers):
        '''
        sizeOfLayers: Tuple with the number of neurons in each layer
        (in, hidden, out)
        '''
        if len(sizeOfLayers) > 3:
            raise ValueError('Wrong number of layers')
        self.sizeOfLayers = sizeOfLayers
        for i in range(len(sizeOfLayers)):
            # each layer + bias
            self.activation.append(sizeOfLayers[i]*[0.0] + [0.0])
        # Wi = len(Hid) x len(IN)+1(bias)
        self.weightsIn = np.random.uniform(-1, 1, (sizeOfLayers[1], sizeOfLayers[0] + 1))
        # Wo = len(OUT) x len(Hid)+1(bias)
        self.weightsOut = np.random.uniform(-1, 1, (sizeOfLayers[2], sizeOfLayers[1] + 1))

    def forward(self, X):
        '''
        X: input vector
        '''
        # input + bias appended to the activation vector
        self.activation[0] = np.vstack((np.array([X]).T, np.array([1])))
        # sum of (weights x in)
        self.sumHidden = self.weightsIn.dot(self.activation[0])
        # activation of hidden layer + bias
        self.activation[1] = np.vstack((self.sigmoid(self.sumHidden), np.array([1])))
        # sum of (out weights x activation of last layer)
        self.sumOut = self.weightsOut.dot(self.activation[1])
        # activation of output
        self.activation[2] = (self.sigmoid(self.sumOut))
        return self.activation[2].T

    def backPropagate(self, Y, trainRate = 0.1):
        '''
        Y: target output
        trainRate: learning rate
        '''
        if len(Y) != self.sizeOfLayers[2]:
            raise ValueError('Wrong number of inputs')
        # output delta
        error_o = Y.T - self.activation[2].T
        out_delta = self.sigmoidPrime(self.activation[2]) * error_o.T
        # hidden delta
        error_h = out_delta.T.dot(self.weightsOut)
        hiden_delta = self.sigmoidPrime(self.activation[1]) * error_h.T
        # update output weights
        change_o = self.activation[1] * out_delta.T
        for i in range(self.sizeOfLayers[2]):
            for j in range(self.sizeOfLayers[1]):
                self.weightsOut[i][j] = self.weightsOut[i][j] + trainRate*change_o[j][i]
        # update input weights
        change_h = self.activation[0] * hiden_delta.T
        for i in range(self.sizeOfLayers[1]):
            for j in range(self.sizeOfLayers[0]):
                self.weightsIn[i][j] = self.weightsIn[i][j] + trainRate*change_h[j][i]
        # error
        return np.sum((Y.T - self.activation[2].T)**2)*0.5

    def sigmoid(self, z, derv = False):
        if derv == False:
            return 1/(1+np.exp(-z))

    def sigmoidPrime(self, z):
        return self.sigmoid(z)*(1-self.sigmoid(z))

    def train(self, target, trainRate = 0.5, it = 50000):
        for i in range(it):
            error = 0.0
            for t in target:
                inputs = np.array(t[0])
                targets = np.array([t[1]])
                self.forward(inputs)
                error = error + self.backPropagate(targets, trainRate)

nn = neural_network((2,5,1))
xor = [
    [[-1.0, -1.0], [0]],
    [[-1.0,  1.0], [1]],
    [[ 1.0, -1.0], [1]],
    [[ 1.0,  1.0], [0]]
]
nn.train(xor)

for e in xor:
    nn.forward(e[0])
    print(nn.activation[2])
Good luck!