Tags: java, machine-learning, neural-network, xor

XOR Neural Net converges to 0.5


I can't seem to find what's wrong with my neural net, despite verifying it against this example, which suggests my backprop and forward prop are working fine. However, after training on XOR my net returns around 0.5 for the output regardless of the input. In other words, the net seems to be minimizing the error as best it can without seeing any correlation between the input and the output. Since a single iteration of backpropagation seems to be working fine, my instinct suggests the problem lies somewhere in the iterations that follow. However, there isn't any obvious problem that would cause this, which leaves me quite stumped.
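
As a quick sanity check of that intuition: with targets {0, 1, 1, 0}, the constant output that minimizes the summed squared error is their mean, 0.5, so the net really is settling on the best answer it can give without using the inputs at all. A standalone check (not part of my net, just the same error formula):

public class ConstantOutputCheck
{
    public static void main(String[] args)
    {
        double[] targets = {0, 1, 1, 0};   // the four XOR labels

        // Try a few constant outputs y and compute the same 0.5*(t - y)^2
        // error the net uses; the minimum lands at y = 0.5, the label mean.
        for(double y = 0.0; y <= 1.0; y += 0.25)
        {
            double err = 0;
            for(double t : targets)
            {
                err += 0.5 * (t - y) * (t - y);
            }
            System.out.println("constant output " + y + " -> total error " + err);
        }
    }
}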

I've looked at other threads where similar problems have arisen, but most of the time the error is either extremely specific to the way they set up their net, or their parameters, such as the learning rate or the number of epochs, are really off. Is anyone familiar with a case like this?

import java.util.Random;

public class Net
{
int[] sizes;
double LEARNING_RATE;

double[][][] weights;
double[][] bias;

Random rand = new Random();  //53489085

public Net(int[] sizes_, double LEARNING_RATE_)
{
    LEARNING_RATE = LEARNING_RATE_;
    sizes = sizes_;

    int numInputs = sizes[0];
    double range = 1.0 / Math.sqrt(numInputs);

    bias = new double[sizes.length - 1][];
    weights = new double[sizes.length - 1][][];

    for(int w_layer = 0; w_layer < weights.length; w_layer++)
    {
        bias[w_layer] = new double[sizes[w_layer+1]];
        weights[w_layer] = new double[sizes[w_layer+1]][sizes[w_layer]];
        for(int j = 0; j < weights[w_layer].length; j++)
        {
            bias[w_layer][j] = 2*range*rand.nextDouble() - range;
            for(int i = 0; i < weights[w_layer][0].length; i++)
            {
                weights[w_layer][j][i] = 2*range*rand.nextDouble() - range;
            }
        }
    }
}

public double[] evaluate(double[] image_vector)
{
    return forwardPass(image_vector)[sizes.length-1];
}

public double totalError(double[][] expec, double[][] actual)
{
    double sum = 0;
    for(int i = 0; i < expec.length; i++)
    {
        sum += error(expec[i], evaluate(actual[i]));
    }
    return sum / expec.length;
}

private double error(double[] expec, double[] actual)
{
    double sum = 0;
    for(int i = 0; i < expec.length; i++)
    {
        double del = expec[i] - actual[i];
        sum += 0.5 * del * del;
    }
    return sum;
}

public void backpropagate(double[][] image_vector, double[][] outputs)
{
    double[][][] deltaWeights = new double[weights.length][][];
    double[][] deltaBias = new double[weights.length][];

    for(int w = 0; w < weights.length; w++)
    {
        deltaBias[w] = new double[bias[w].length];
        deltaWeights[w] = new double[weights[w].length][];
        for(int j = 0; j < weights[w].length; j++)
        {
            deltaWeights[w][j] = new double[weights[w][j].length];
        }
    }

    for(int batch = 0; batch < image_vector.length; batch++)
    {
        double[][] neuronVals = forwardPass(image_vector[batch]);

        /* OUTPUT DELTAS */
        int w_layer = weights.length-1;

        double[] deltas = new double[weights[w_layer].length];

        for(int j = 0; j < weights[w_layer].length; j++)
        {
            double actual = neuronVals[w_layer + 1][j]; 
            double expec = outputs[batch][j];

            double deltaErr = actual - expec;
            double deltaSig = actual * (1 - actual);

            double delta = deltaErr * deltaSig;
            deltas[j] = delta;

            deltaBias[w_layer][j] += delta;
            for(int i = 0; i < weights[w_layer][0].length; i++)
            {
                deltaWeights[w_layer][j][i] += delta * neuronVals[w_layer][i];
            }
        }

        w_layer--;
        /* REST OF THE DELTAS */
        while(w_layer >= 0)
        {   

            double[] nextDeltas = new double[weights[w_layer].length];
            for(int j = 0; j < weights[w_layer].length; j++)
            {
                double outNeur = neuronVals[w_layer+1][j];
                double deltaSig = outNeur * (1 - outNeur);

                double sum = 0;
                for(int i = 0; i < weights[w_layer+1].length; i++)
                {
                    sum += weights[w_layer+1][i][j] * deltas[i];
                }

                double delta = sum * deltaSig;
                nextDeltas[j] = delta;

                deltaBias[w_layer][j] += delta;
                for(int i = 0; i < weights[w_layer][0].length; i++)
                {
                    deltaWeights[w_layer][j][i] += delta * neuronVals[w_layer][i];
                }
            }
            deltas = nextDeltas;

            w_layer--;
        }
    }

    for(int w_layer = 0; w_layer < weights.length; w_layer++)
    {
        for(int j = 0; j < weights[w_layer].length; j++)
        {

            deltaBias[w_layer][j] /= (double) image_vector.length;

            bias[w_layer][j] -= LEARNING_RATE * deltaBias[w_layer][j];

            for(int i = 0; i < weights[w_layer][j].length; i++)
            {   
                deltaWeights[w_layer][j][i] /= (double) image_vector.length; // average of batches
                weights[w_layer][j][i] -= LEARNING_RATE * deltaWeights[w_layer][j][i];
            }
        }
    }
}

public double[][] forwardPass(double[] image_vector)
{
    double[][] outputs = new double[sizes.length][];

    double[] inputs = image_vector;

    for(int w = 0; w < weights.length; w++)
    {
        outputs[w] = inputs;

        double[] output = new double[weights[w].length];
        for(int j = 0; j < weights[w].length; j++)
        {
            output[j] = bias[w][j];
            for(int i = 0; i < weights[w][j].length; i++)
            {
                output[j] += weights[w][j][i] * inputs[i];
            }
            output[j] = sigmoid(output[j]);
        }
        inputs = output;
    }

    outputs[outputs.length-1] = inputs.clone();

    return outputs;
}

static public double sigmoid(double val)
{
    return 1.0 / (1.0 + Math.exp(-val));
}
}

And my XOR class looks like this. It's very unlikely that the error lies in this part given its simplicity, but I figured it couldn't hurt to post it in case I have some fundamental misunderstanding of how XOR works. My net is set up to take examples in batches, but as you can see below, for this particular example I send it batches of one, so effectively I'm not using batches.

public class SingleLayer {

static int numEpochs = 10000;
static double LEARNING_RATE = 0.001;
static int[] sizes = new int[] {2, 2, 1};

public static void main(String[] args)
{

    System.out.println("Initializing randomly generate neural net...");
    Net n = new Net(sizes, LEARNING_RATE);
    System.out.println("Complete!");

    System.out.println("Loading dataset...");

    double[][] inputs = new double[4][2];
    double[][] outputs = new double[4][1];

    inputs[0] = new double[] {1, 1};
    outputs[0] = new double[] {0};

    inputs[1] = new double[] {1, 0};
    outputs[1] = new double[] {1};

    inputs[2] = new double[] {0, 1};
    outputs[2] = new double[] {1};

    inputs[3] = new double[] {0, 0};
    outputs[3] = new double[] {0};

    System.out.println("Complete!");

    System.out.println("STARTING ERROR: " + n.totalError(outputs, inputs));
    for(int epoch = 0; epoch < numEpochs; epoch++)
    {
        double[][] in = new double[1][2];
        double[][] out = new double[1][1];
        int num = (int)(Math.random()*inputs.length);

        in[0] = inputs[num];
        out[0] = outputs[num];

        n.backpropagate(inputs, outputs);
        System.out.println("ERROR: " + n.totalError(out, in));
    }

    System.out.println("Prediction After Training: " + n.evaluate(inputs[0])[0] + "  Expected: " + outputs[0][0]);
    System.out.println("Prediction After Training: " + n.evaluate(inputs[1])[0] + "  Expected: " + outputs[1][0]);
    System.out.println("Prediction After Training: " + n.evaluate(inputs[2])[0] + "  Expected: " + outputs[2][0]);
    System.out.println("Prediction After Training: " + n.evaluate(inputs[3])[0] + "  Expected: " + outputs[3][0]);
}
}
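
To be explicit about what I mean by "batches of one": only the backpropagate call changes relative to the loop above. A sketch of that version (everything else as in main, with the error printed over the full XOR set so it is comparable between epochs):

for(int epoch = 0; epoch < numEpochs; epoch++)
{
    double[][] in = new double[1][2];
    double[][] out = new double[1][1];
    int num = (int)(Math.random()*inputs.length);

    in[0] = inputs[num];
    out[0] = outputs[num];

    n.backpropagate(in, out);                                      // single random example per epoch
    System.out.println("ERROR: " + n.totalError(outputs, inputs)); // error over all four examples
}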

Can anyone provide some insight into what may be wrong? My parameters are pretty well defined, and I've followed all the suggestions for how the weights should be initialized, what the learning rate should be, and so on. Thanks!


Solution

  • I figured it out. I wasn't running enough epochs. That seems a little silly to me, but this visualization revealed that the net lingers on answers around 0.5 for a long time before reducing the error to less than 0.00001.
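
    In case it helps anyone else, the only change needed is the epoch budget; a larger learning rate would presumably shorten the plateau too, but more epochs alone was the fix here. A sketch of the adjusted loop (same Net and data as in main above; the numbers are illustrative, not tuned), stopping once the error over the full set drops below 0.00001:

    static int numEpochs = 1000000;   // was 10000; the net needs far more iterations

    for(int epoch = 0; epoch < numEpochs; epoch++)
    {
        n.backpropagate(inputs, outputs);

        // Check the error over all four XOR examples and stop early once it is tiny.
        double err = n.totalError(outputs, inputs);
        if(err < 0.00001)
        {
            System.out.println("Converged after " + (epoch + 1) + " epochs, error " + err);
            break;
        }
    }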