Tags: c++, backpropagation, back-propagation-through-time

What is wrong with my BPTT Implementation?


I tried to implement backpropagation through time manually, but in the end the network isn't converging. I looked around the net for descriptions and courses on BPTT, and the code does everything accordingly:

  • Forward propagation
  • Error propagation backwards
  • Gradient calculation based on the expected values
  • Updating the weights based on the gradient and a learning rate (the update rule is written out just below this list)
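
In formulas, the last step is meant to be plain gradient descent with learning rate $\eta$ on the squared error summed over the three unrolled steps, where $\text{out}_t$ is the neuron output and $\text{expected}_t$ the target at step $t$:

$$ w \leftarrow w - \eta\,\frac{\partial E}{\partial w}, \qquad E = \sum_{t=0}^{2} \tfrac{1}{2}\big(\text{out}_t - \text{expected}_t\big)^2 $$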

The way I understand recurrent derivatives is that in a recurrent neural network the input coming from a previous step cannot be treated as a constant. So, for example, the derivative with respect to w1 in the 3rd step depends not only on the input of the current step, but on the previous steps as well. That's why dw1[1] = net_inputs_train[first_sample_index + 1][0]; is incorrect; it needs to be dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;.
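
Written out for the single neuron in the code, where the output of step $t$ is $\text{out}_t = w_1 x^{(1)}_t + w_2 x^{(2)}_t + w_3\,\text{out}_{t-1} + b$, the recursion I mean is

$$ \frac{\partial\,\text{out}_t}{\partial w_1} = x^{(1)}_t + w_3\,\frac{\partial\,\text{out}_{t-1}}{\partial w_1} $$

which is exactly what dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3; computes, and analogously for the other weights and the bias.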

Everything else is supposed to be "only" backpropagation in an unfolded network. Unfortunately this program just doesn't work; the error just jumps around without the net converging.

I don't know what else I could do to make this work; maybe I misunderstood the concept completely...

#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib> /* rand, srand */
#include <ctime>   /* time */

using namespace std;

int main(int argc, char *argv[]){

  srand(time(nullptr));

  /* Manual BPTT with one custom implemented Neuron */
  double number_of_samples = 3;  /* Binary addition dataset */
  vector<vector<double>> net_inputs_train = {  /* 2 inputs in each step */
      {1,1},    {0,0},  {0,0}, /* 100 + 100 = 110 */
      {1,0},    {0,1},  {1,0}, /* 101 + 010 = 111*/
      {1,0},    {1,1},  {0,0}, /* 110 + 010 = 111 */
  };

  vector<vector<double>> expected_output = { /* 1 output in each step */
      {1},      {1},    {0}, /* 110 */
      {1},      {1},    {1}, /* 111 */
      {1},      {1},    {1}, /* 111 */
  };

  double w1 = 0.5;
  double w2 = 0.5;
  double w3 = 0.5;
  double b = 0.0;

  vector<double> neuron_data(3,0);
  vector<double> neuron_deriv(3,0); /* Neuron error value ( partial based on the output )*/

  vector<double> dw1(3,0); /* derivatives for weights for each sequence */
  vector<double> dw2(3,0);
  vector<double> dw3(3,0);
  vector<double> derb(3,0);

  int first_sample_index;
  double manual_error = 1.0;
  double learning_rate = 1e-2;
  while(manual_error > learning_rate){
    for(int mbIter = 0; mbIter < 4; ++mbIter){
      first_sample_index = (rand()%(static_cast<int>(number_of_samples)));

      /* Fill in the data and derivatives */
      neuron_data[0] = (
        net_inputs_train[first_sample_index][0] * w1
        + net_inputs_train[first_sample_index][1] * w2
        + b
      );
      dw1[0] = net_inputs_train[first_sample_index][0];
      dw2[0] = net_inputs_train[first_sample_index][1];
      dw3[0] = 0;
      derb[0] = 1;

      neuron_data[1] = (
        net_inputs_train[first_sample_index + 1][0] * w1
        + net_inputs_train[first_sample_index + 1][1] * w2
        + neuron_data[0] * w3
        + b
      );
      dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;
      dw2[1] = net_inputs_train[first_sample_index + 1][1] + dw2[0] * w3;
      dw3[1] = neuron_data[0] + w3 * dw3[0];
      derb[1] = 1 + derb[0] * w3;

      neuron_data[2] = (
        net_inputs_train[first_sample_index + 2][0] * w1
        + net_inputs_train[first_sample_index + 2][1] * w2
        + neuron_data[1] * w3
        + b
      );
      dw1[2] = net_inputs_train[first_sample_index + 2][0] + dw1[1] * w3;
      dw2[2] = net_inputs_train[first_sample_index + 2][1] + dw2[1] * w3;
      dw3[2] = neuron_data[1] + w3 * dw3[1];
      derb[2] = 1 + derb[1] * w3;

      /* Calculate the error and the gradients */
      manual_error = (
        pow((neuron_data[2] - expected_output[first_sample_index + 2][0]),2)/2.0
        +pow((neuron_data[1] - expected_output[first_sample_index + 1][0]),2)/2.0
        +pow((neuron_data[0] - expected_output[first_sample_index + 0][0]),2)/2.0
      );

      neuron_deriv[2] = (
        (-(neuron_data[2] - expected_output[first_sample_index + 2][0])/2.0)
      );
      neuron_deriv[1] = (
        (-(neuron_data[1] - expected_output[first_sample_index + 1][0])/2.0)
        + (w3 * neuron_deriv[2])
      );
      neuron_deriv[0] = (
        (-(neuron_data[0] - expected_output[first_sample_index + 0][0])/2.0)
        + (w3 * neuron_deriv[1])
      );

      w1 += (learning_rate * (
        neuron_deriv[2] * dw1[2]
        + neuron_deriv[1] * dw1[1]
        + neuron_deriv[0] * dw1[0]
      ) / number_of_samples);

      w2 += (learning_rate * (
        neuron_deriv[2] * dw2[2]
        + neuron_deriv[1] * dw2[1]
        + neuron_deriv[0] * dw2[0]
      ) / number_of_samples);

      w3 += (learning_rate * (
        neuron_deriv[2] * dw3[2]
        + neuron_deriv[1] * dw3[1]
        + neuron_deriv[0] * dw3[0]
      ) / number_of_samples);

      b += (learning_rate * (
        neuron_deriv[2] * derb[2]
        + neuron_deriv[1] * derb[1]
        + neuron_deriv[0] * derb[0]
      ) / number_of_samples);
      std::cout << "\r Error: " << manual_error << "                    \n";
    }
  }

  return 0;
}

Edit: One interesting thing is that the training converges if w1 += (learning_rate * (...)/number_of_samples); is switched to w1 += ((...)/number_of_samples); (i.e. if the learning rate factor is dropped, which effectively amounts to a learning rate of 1).


Solution

  • So where do I start? Apart from some logic errors (e.g. the setting of first_sample_index, which could start a sequence in the middle of another sample), the main problem was that backpropagation was mixed in between the sequences.

    Meaning: each sequence was mixing in error values from other sequences. So even if a step's input comes from a hidden state, it is not supposed to affect the gradients of other sequences.

    I realized this while crying on top of my paper pile, which forced me to inspect the BPTT technique (as well as my life choices) to the bone and cross-check it with the plain back-propagation algorithm, because back-propagation through time is basically a special kind of back-propagation in which some of the coefficients of the formulas repeat.

    With this in mind, I re-worked the code to separate the gradient calculations in sequences.
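
    Stripped down, the per-sample pattern looks roughly like this (a rough sketch only: the names mirror the full program below, and it leaves out the extra w3-scaled terms the final code adds to the later steps):

    /* start every sample from clean accumulators, so no error value
       leaks over from another sequence */
    gradw1 = 0; gradw2 = 0; gradw3 = 0; gradb = 0;
    for(int step = 2; 0 <= step; --step){
      /* each step contributes only its own error... */
      double delta = -(neuron_data[step] - expected_output[first_sample_index + step][0]);
      gradw1 += dw1[step] * delta; /* ...weighted by the forward-accumulated derivatives */
      gradw2 += dw2[step] * delta;
      gradw3 += dw3[step] * delta;
      gradb  += derb[step] * delta;
    }
    /* one update per sample, averaged over the whole minibatch */
    w1 += learning_rate * gradw1 / (sequence_size * minibatch_size);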

    Then there is the vanishing/exploding gradients problem. After the above rework the network still hadn't converged because of that. After the third breakdown and some experimentation, I discovered that simply halving the gradient coming from the bias in sequence two solves the vanishing problem. The gradient of the bias is targeted because numerically it is the biggest of them all.
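
    For what it's worth, a more conventional way to keep the gradient magnitudes in check than hand-tuning a single term is gradient clipping. A minimal sketch (the scale_gradients helper and the 1.0 threshold are my own additions, not part of the program below):

    #include <cmath>

    /* rescale the four gradients in place so their combined L2 norm
       never exceeds the given threshold */
    void scale_gradients(double &gradw1, double &gradw2, double &gradw3, double &gradb,
                         double clip_threshold = 1.0){
      double norm = std::sqrt(gradw1*gradw1 + gradw2*gradw2 + gradw3*gradw3 + gradb*gradb);
      if(norm > clip_threshold){
        double scale = clip_threshold / norm;
        gradw1 *= scale; gradw2 *= scale; gradw3 *= scale; gradb *= scale;
      }
    }
    /* would be called right before the w1/w2/w3/b updates */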

    The below program now works, with the network converging successfully.

    #include <iostream>
    #include <vector>
    #include <cmath>
    #include <cstdlib> /* rand, srand */
    #include <ctime>   /* time */
    
    using namespace std;
    
    int main(int argc, char *argv[]){
    
      srand(time(nullptr));
    
      /* Manual BPTT with one custom implemented Neuron */
      double sequence_size = 3;
      double number_of_samples = 3;  /* Binary addition dataset */
      double minibatch_size = 4;
      vector<vector<double>> net_inputs_train = {  /* 2 inputs in each step */
          {1,1},    {0,0},  {0,0}, /* 100 + 100 = 110 */
          {1,0},    {0,1},  {1,0}, /* 101 + 010 = 111*/
          {1,0},    {1,1},  {0,0}, /* 110 + 010 = 111 */
      };
    
      vector<vector<double>> expected_output = { /* 1 output in each step */
          {1},      {1},    {0}, /* 110 */
          {1},      {1},    {1}, /* 111 */
          {1},      {1},    {1}, /* 111 */
      };
    
      double w1 = 0.5;
      double w2 = 0.5;
      double w3 = 0.5;
      double b = 0.0;
    
      double gradw1; /* gradients for the weights */
      double gradw2;
      double gradw3;
      double gradb;
    
      vector<double> neuron_data(3,0);
      double neuron_deriv = 0; /* Neuron error value ( partial based on the expected output and the error function )*/
    
      vector<double> dw1(3,0); /* derivatives for weights for each sequence */
      vector<double> dw2(3,0);
      vector<double> dw3(3,0);
      vector<double> derb(3,0);
    
      int first_sample_index;
      double manual_error = 1.0;
      double learning_rate = 1e-2;
      while(manual_error > learning_rate){
        for(int mbIter = 0; mbIter < minibatch_size; ++mbIter){ /* minibatches */
          first_sample_index = sequence_size * (rand()%(static_cast<int>(number_of_samples)));
          gradw1 = 0;
          gradw2 = 0;
          gradw3 = 0;
          gradb = 0;
    
          /* Fill in the data and derivatives */
          neuron_data[0] = (
            net_inputs_train[first_sample_index][0] * w1
            + net_inputs_train[first_sample_index][1] * w2
            + b
          );
          dw1[0] = net_inputs_train[first_sample_index][0];
          dw2[0] = net_inputs_train[first_sample_index][1];
          dw3[0] = 0;
          derb[0] = 1;
    
          neuron_data[1] = (
            net_inputs_train[first_sample_index + 1][0] * w1
            + net_inputs_train[first_sample_index + 1][1] * w2
            + neuron_data[0] * w3
            + b
          );
          dw1[1] = net_inputs_train[first_sample_index + 1][0] + w3 * dw1[0];
          dw2[1] = net_inputs_train[first_sample_index + 1][1] + w3 * dw2[0];
          dw3[1] = neuron_data[0] + w3 * dw3[0];
          derb[1] = 1 + derb[0] * w3;
    
          neuron_data[2] = (
            net_inputs_train[first_sample_index + 2][0] * w1
            + net_inputs_train[first_sample_index + 2][1] * w2
            + neuron_data[1] * w3
            + b
          );
          dw1[2] = net_inputs_train[first_sample_index + 2][0] + w3 * dw1[1];
          dw2[2] = net_inputs_train[first_sample_index + 2][1] + w3 * dw2[1];
          dw3[2] = neuron_data[1] + w3 * dw3[1];
          derb[2] = 1 + derb[1] * w3;
    
          /* Calculate the error and the gradients */
          manual_error = (
             pow((neuron_data[2] - expected_output[first_sample_index + 2][0]),2)/2.0
            +pow((neuron_data[1] - expected_output[first_sample_index + 1][0]),2)/2.0
            +pow((neuron_data[0] - expected_output[first_sample_index + 0][0]),2)/2.0
          );    
    
          /* Calculate gradients for sequence 2 */
          neuron_deriv = (
           -(neuron_data[2] - expected_output[first_sample_index + 2][0])
           -w3*(neuron_data[2] - expected_output[first_sample_index + 2][0])
           -w3*(neuron_data[2] - expected_output[first_sample_index + 2][0])
          );
          gradw1 += dw1[2] * neuron_deriv;
          gradw2 += dw2[2] * neuron_deriv;
          gradw3 += dw3[2] * neuron_deriv;
          gradb += derb[2] * neuron_deriv / 2.0;
    
          /* Calculate gradients for sequence 1 */
          neuron_deriv = (
            -(neuron_data[1] - expected_output[first_sample_index + 1][0])
            -w3*(neuron_data[1] - expected_output[first_sample_index + 1][0])
          );
          gradw1 += dw1[1] * neuron_deriv;
          gradw2 += dw2[1] * neuron_deriv;
          gradw3 += dw3[1] * neuron_deriv;
          gradb += derb[1] * neuron_deriv;
    
          /* Calculate gradients for sequence 0 */
          neuron_deriv = -(neuron_data[0] - expected_output[first_sample_index + 0][0]);
          gradw1 += dw1[0] * neuron_deriv;
          gradw2 += dw2[0] * neuron_deriv;
          gradw3 += dw3[0] * neuron_deriv;
          gradb += derb[0] * neuron_deriv;
    
          w1 += (learning_rate * (gradw1) / (sequence_size * minibatch_size));
          w2 += (learning_rate * (gradw2) / (sequence_size * minibatch_size));
          w3 += (learning_rate * (gradw3) / (sequence_size * minibatch_size));
          b += (learning_rate * (gradb) / (sequence_size * minibatch_size));
          std::cout << "\r Error: " << manual_error << "                     ";
        }
      }
      std::cout << std::endl;
    
      return 0;
    }
    

    I'm honestly having difficulty believing that this actually works. I really hope I can help out anyone who tries this in the future.