c++, machine-learning, neural-network, mnist

Neural Network not learning properly


I am writing a neural network from scratch in C++ for learning purposes and trying to get it to learn the MNIST dataset of handwritten digits. However, the MSE the network reports stays at essentially the same value every time I fit it. I don't know whether this has something to do with how I wrote the NeuralNetwork class or with how I am fitting the data. This is the relevant code to reproduce the problem:

// PredictedData.hpp
#pragma once

#include <stdint.h>

struct PredictedData
{
  uint64_t m_NeuronIndex;
  double m_NeuronValue;

  PredictedData(
      uint64_t neuronIndex,
      double neuronValue);

  ~PredictedData();
};
// PredictedData.cpp

#include "PredictData.hpp"

PredictedData::PredictedData(
    uint64_t neuronIndex,
    double neuronValue) : m_NeuronIndex(neuronIndex),
                          m_NeuronValue(neuronValue) {}

PredictedData::~PredictedData() {}
// NeuralNetwork.hpp

#pragma once

#include "PredictData.hpp"
#include <iostream>
#include <stdint.h>
#include <vector>

class NeuralNetwork
{
private:
  void CalculateOutputs(const std::vector<double> &inputData);

  void CalculateDeltas(const std::vector<double> &expectedData);

  void ApplyGradientDescent(
      const std::vector<double> &inputData,
      const double &learningRate);

public:
  std::vector<uint64_t> m_Topology;
  std::vector<std::vector<double>> m_CachedDeltas;
  std::vector<std::vector<double>> m_CachedSums;
  std::vector<std::vector<double>> m_CachedOutputs;
  std::vector<std::vector<double>> m_Biases;
  std::vector<std::vector<std::vector<double>>> m_Weights;

  NeuralNetwork(const std::vector<uint64_t> &topology);
  ~NeuralNetwork();

  void Fit(
      const std::vector<double> &trainingData,
      const std::vector<double> &expectedData,
      const double &learningRate = 0.001);

  double GetMSE(const std::vector<double> &expectedData);

  PredictedData Predict(const std::vector<double> &inputData);
};
// NeuralNetwork.cpp
#include "NeuralNetwork.hpp"
#include <cmath>
#include <random>

double SigmoidActivation(const double &x)
{
  return 1 / (1 + std::exp(-x));
}

double SigmoidActivationDerivative(const double &x)
{
  return SigmoidActivation(x) * (1 - SigmoidActivation(x));
}

double SigmoidActivationDerivativeForOutput(const double &y)
{
  return y * (1 - y);
}

double TanhActivation(const double &x)
{
  return std::tanh(x);
  // return 2 / (1 + std::exp(-2 * x)) - 1;
}

double TanhActivationDerivative(const double &x)
{
  const double factor = TanhActivation(x);
  return 1 - factor * factor;
}

double TanhActivationDerivativeForOutput(const double &y)
{
  return 1 - y * y;
}

double ReLUActivation(const double &x)
{
  return x < 0 ? 0 : x;
}

double ReLUActivationDerivative(const double &x)
{
  return x < 0 ? 0 : 1;
}

double ParametricReLUActivation(const double &x, const double &a)
{
  return x < 0 ? 0 : a * x;
}

double ParametricReLUActivationDerivative(const double &x, const double &a)
{
  return x < 0 ? a : 1;
}

double ELUActivation(const double &x, const double &a)
{
  return x < 0 ? 0 : a * (std::exp(x) - 1);
}

double ELUActivationDerivative(const double &x, const double &a)
{
  return x < 0 ? ELUActivation(x, a) + a : 1;
}

double ELUActivationDerivativeForOutput(const double &x, const double &y, const double &a)
{
  return x < 0 ? y + a : 1;
}

double SoftPlusActivation(const double &x)
{
  return std::log(1 + std::exp(x));
}

double SoftPlusActivationDerivative(const double &x)
{
  return 1 / (1 + std::exp(-x));
}

double Cost(
    const double &predicted,
    const double &actual)
{
  double error = actual - predicted;
  return error * error;
}

double CostDerivative(
    const double &predicted,
    const double &actual)
{
  return 2 * (actual - predicted);
}

NeuralNetwork::NeuralNetwork(const std::vector<uint64_t> &topology) : m_Topology(topology)
{
  // reserve for layers
  m_Biases.reserve(topology.size() - 1);
  m_Weights.reserve(topology.size() - 1);
  m_CachedDeltas.reserve(topology.size() - 1);
  m_CachedSums.reserve(topology.size() - 1);
  m_CachedOutputs.reserve(topology.size() - 1);

  for (uint64_t i = 1; i < topology.size(); i++)
  {
    // IMPORTANT! this convention is used throughout:
    // the topology is given as {input layer, hidden layer 1, hidden layer 2, ..., hidden layer n, output layer}
    // we don't need to allocate memory for the input layer because its values are passed in by the user,
    // so the real index relative to the topology index is topology index - 1
    const uint64_t l = i - 1;

    // reserve for neuron in layer i
    m_CachedSums.push_back(std::vector<double>());
    m_CachedSums[l].reserve(topology[i]);

    // reserve for neuron in layer i
    m_CachedOutputs.push_back(std::vector<double>());
    m_CachedOutputs[l].reserve(topology[i]);

    // reserve for neuron in layer i
    m_CachedDeltas.push_back(std::vector<double>());
    m_CachedDeltas[l].reserve(topology[i]);

    // reserve for neuron in layer i
    m_Biases.push_back(std::vector<double>());
    m_Biases[l].reserve(topology[i]);

    // reserve for neuron in layer i
    m_Weights.push_back(std::vector<std::vector<double>>());
    m_Weights[l].reserve(topology[i]);

    for (uint64_t j = 0; j < topology[i]; j++)
    {
      // initialize for neuron j in layer i
      m_CachedSums[l].push_back(0);
      m_CachedOutputs[l].push_back(0);
      m_CachedDeltas[l].push_back(0);
      m_Biases[l].push_back(0);

      // reserve for layer l that affects neuron j
      m_Weights[l].push_back(std::vector<double>());
      m_Weights[l][j].reserve(topology[l]);

      for (uint64_t z = 0; z < topology[l]; z++)
      {
        // initialize weights
        m_Weights[l][j].push_back((double)std::rand() / RAND_MAX * sqrt(1.0 / topology[l]));
      }
    }
  }
}

NeuralNetwork::~NeuralNetwork()
{
}

void NeuralNetwork::CalculateOutputs(const std::vector<double> &inputData)
{
  // layer after input layer
  for (uint64_t i = 0; i < m_Topology[1]; i++)
  {
    m_CachedSums.front()[i] = m_Biases.front()[i];

    // product = output * weight (for each connection into this neuron)
    // calculate the products of all input-layer neurons connected to neuron i and sum them
    for (uint64_t j = 0; j < m_Topology.front(); j++)
    {
      m_CachedSums.front()[i] += inputData[j] * m_Weights.front()[i][j];
    }

    // attempt to work with different activation functions
    m_CachedOutputs.front()[i] = ReLUActivation(m_CachedSums.front()[i]);
    // m_CachedOutputs.front()[i] = SigmoidActivation(m_CachedSums.front()[i]);
    // m_CachedOutputs.front()[i] = TanhActivation(m_CachedSums.front()[i]);
  }

  // more hidden layer
  for (uint64_t i = 2; i < m_Topology.size() - 1; i++)
  {
    const uint64_t l = i - 1;

    for (uint64_t j = 0; j < m_Topology[i]; j++)
    {
      m_CachedSums[l][j] = m_Biases[l][j];

      // product = output * weight (for each connection into this neuron)
      // calculate the products of all neurons in the previous layer connected to neuron j and sum them
      for (uint64_t z = 0; z < m_Topology[l]; z++)
      {
        m_CachedSums[l][j] += m_CachedOutputs[l - 1][z] * m_Weights[l][j][z];
      }

      m_CachedOutputs[l][i] = ReLUActivation(m_CachedSums[l][i]);
      // m_CachedOutputs[l][i] = SigmoidActivation(m_CachedSums[l][i]);
      // m_CachedOutputs[l][j] = TanhActivation(m_CachedSums[l][j]);
    }
  }

  // output layer
  for (uint64_t i = 0; i < m_Topology.back(); i++)
  {
    const uint64_t l = m_Topology.size() - 2;

    m_CachedSums.back()[i] = m_Biases.back()[i];

    // product = output * weight (for each connection into this neuron)
    // calculate the products of all neurons of the previous hidden layer connected to neuron i and sum them
    for (uint64_t j = 0; j < m_Topology[l]; j++)
    {
      m_CachedSums.back()[i] += m_CachedOutputs[l - 1][j] * m_Weights.back()[i][j];
    }

    // attempt to work with different activation functions for the output layer
    // m_CachedOutputs.back()[i] = ReLUActivation(m_CachedSums.back()[i]);
    // m_CachedOutputs.back()[i] = SigmoidActivation(m_CachedSums.back()[i]);
    m_CachedOutputs.back()[i] = SigmoidActivation(m_CachedSums.back()[i]);
  }
}

void NeuralNetwork::CalculateDeltas(const std::vector<double> &expectedData)
{
  // output layer
  for (uint64_t i = 0; i < m_Topology.back(); i++)
  {
    // attempt to work with different activation function derivatives for the output layer
    m_CachedDeltas.back()[i] = SigmoidActivationDerivative(m_CachedSums.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
    // m_CachedDeltas.back()[i] = SoftPlusActivationDerivative(m_CachedSums.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
    // m_CachedDeltas.back()[i] = SoftPlusActivationDerivativeForOutput(m_CachedOutputs.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
  }

  // hidden layers
  for (uint64_t i = m_Topology.size() - 2; i > 0; i--)
  {
    const uint64_t l = i - 1;
    for (uint64_t j = 0; j < m_Topology[i]; j++)
    {
      // initialize delta to 0
      m_CachedDeltas[l][j] = 0;

      // sum the deltas of the next layer, each multiplied by the weight connecting that next-layer neuron to neuron j
      for (uint64_t z = 0; z < m_Topology[i + 1]; z++)
      {
        m_CachedDeltas[l][j] += m_CachedDeltas[i][z] * m_Weights[i][z][j];
      }

      // attempt to work with different activation function derivatives for the hidden layers
      // m_CachedDeltas[l][j] *= TanhActivationDerivativeForOutput(m_CachedOutputs[l][j]);
      // m_CachedDeltas[l][j] *= SigmoidActivationDerivativeForOutput(m_CachedOutputs[l][j]);
      m_CachedDeltas[l][j] *= ReLUActivationDerivative(m_CachedSums[l][j]);
      // m_CachedDeltas[l][j] *= SoftPlusActivationDerivative(m_CachedSums[l][j]);
    }
  }
}

void NeuralNetwork::ApplyGradientDescent(const std::vector<double> &inputData, const double &learningRate)
{
  for (uint64_t i = 0; i < m_Topology[1]; i++)
  {
    // common operation to minimize redundancy
    double net = learningRate * m_CachedDeltas.front()[i];

    // change bias by net * 1
    m_Biases.front()[i] -= net;

    for (uint64_t j = 0; j < m_Topology.front(); j++)
    {
      // change weight by net * input
      m_Weights.front()[i][j] -= net * inputData[j];
    }
  }

  for (uint64_t i = 2; i < m_Topology.size(); i++)
  {
    const uint64_t l = i - 1;
    for (uint64_t j = 0; j < m_Topology[i]; j++)
    {
      // common operation to minimize redundancy
      double net = learningRate * m_CachedDeltas[l][j];

      // change bias by net * 1
      m_Biases[l][j] -= net;

      for (uint64_t z = 0; z < m_Topology[l]; z++)
      {
        // change weight by net * prev layer output
        m_Weights[l][j][z] -= net * m_CachedOutputs[l - 1][z];
      }
    }
  }
}

double NeuralNetwork::GetMSE(const std::vector<double> &expectedData)
{
  double SumSE = 0;
  for (uint64_t i = 0; i < m_Topology.back(); i++)
  {
    SumSE += Cost(m_CachedOutputs.back()[i], expectedData[i]);
  }

  return SumSE / (double)m_Topology.back();
}

void NeuralNetwork::Fit(const std::vector<double> &trainingData, const std::vector<double> &expectedData, const double &learningRate)
{
  CalculateOutputs(trainingData);
  CalculateDeltas(expectedData);
  ApplyGradientDescent(trainingData, learningRate);
}

PredictedData NeuralNetwork::Predict(const std::vector<double> &inputData)
{
  CalculateOutputs(inputData);
  PredictedData predictedData(-1, -1000000);

  for (uint64_t i = 0; i < m_Topology.back(); i++)
  {
    if (m_CachedOutputs.back()[i] <= predictedData.m_NeuronValue)
    {
      continue;
    }

    predictedData.m_NeuronIndex = i;
    predictedData.m_NeuronValue = m_CachedOutputs.back()[i];
  }

  return predictedData;
}
// Main.cpp

#include "NeuralNetwork.hpp"
#include <cmath>
#include <fstream>
#include <string>

std::ifstream &Read(std::ifstream &in, void *data, std::streamsize bytes);

template <typename T>
std::ifstream &Read(std::ifstream &in, T &data)
{
  return Read(in, &data, sizeof(T));
}

uint32_t SwapEndian(uint32_t val);

bool ReadMNISTImages(std::ifstream &file, std::vector<std::vector<double>> &data, uint64_t &rows, uint64_t &columns);
bool ReadMNISTLabels(std::ifstream &file, std::vector<uint8_t> &data);

#define LEARNING_RATE 0.1

int main()
{
  std::ifstream imageFile("F:\\DATA\\mnist\\train-images.idx3-ubyte", std::ios::binary);
  if (!imageFile.is_open())
  {
    std::cout << "couldn't open image file\n";
    return 1;
  }

  std::ifstream labelFile("F:\\DATA\\mnist\\train-labels.idx1-ubyte", std::ios::binary);
  if (!labelFile.is_open())
  {
    std::cout << "couldn't open label file\n";
    return 1;
  }

  uint64_t rows;
  uint64_t columns;

  std::vector<std::vector<double>> imageData;
  if (!ReadMNISTImages(imageFile, imageData, rows, columns))
  {
    // failed
    return 1;
  }

  imageFile.close();

  std::vector<uint8_t> labelData;
  if (!ReadMNISTLabels(labelFile, labelData))
  {
    // failed
    return 1;
  }

  labelFile.close();

  uint64_t correct = 0;
  uint64_t wrong = 0;

  // init network
  NeuralNetwork network({rows * columns, 16, 16, 10});

  for (uint64_t i = 0; i < labelData.size() / 6; i++)
  {
    const uint8_t label = labelData[i];
    std::vector<double> expectedData(10, 0);

    // set the element at index label to 1 (one-hot encoding)
    expectedData[label] = 1;

    // fit
    network.Fit(imageData[i], expectedData, LEARNING_RATE);

    // log MSE every 1000 fits
    if (i % 1000 == 0)
    {
      std::cout << "MSE: " << network.GetMSE(expectedData) << '\n';
    }

    // predict
    PredictedData predictedData = network.Predict(imageData[i]);

    // check if the predicted neuron index is the same as the label
    if (predictedData.m_NeuronIndex == label)
    {
      correct++;
    }
    else
    {
      wrong++;
    }
  }

  // log the number of correct and wrong predictions
  std::cout << "correct: " << correct << ", wrong: " << wrong << ", total: " << wrong + correct << std::endl;

  return 0;
}

std::ifstream &Read(std::ifstream &in, void *data, std::streamsize bytes)
{
  char *buffer = reinterpret_cast<char *>(data);
  return static_cast<std::ifstream &>(in.read(buffer, bytes));
}

uint32_t SwapEndian(uint32_t val)
{
  val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
  return (val << 16) | (val >> 16);
}

bool ReadMNISTImages(std::ifstream &file, std::vector<std::vector<double>> &data, uint64_t &rows, uint64_t &columns)
{
  uint32_t magicNumber;
  uint32_t numberOfImages;
  uint32_t numberOfRows;
  uint32_t numberOfColumns;

  Read(file, &magicNumber, 4);
  magicNumber = SwapEndian(magicNumber);
  if (magicNumber != 2051)
  {
    std::cout << "Incorrect image file magicNumber: " << magicNumber << std::endl;
    return false;
  }

  Read(file, &numberOfImages, 4);
  numberOfImages = SwapEndian(numberOfImages);

  Read(file, &numberOfRows, 4);
  numberOfRows = SwapEndian(numberOfRows);
  rows = numberOfRows;

  Read(file, &numberOfColumns, 4);
  numberOfColumns = SwapEndian(numberOfColumns);
  columns = numberOfColumns;

  data.reserve(numberOfImages);
  for (uint64_t i = 0; i < numberOfImages; i++)
  {
    data.push_back(std::vector<double>());
    data[i].reserve(numberOfRows * numberOfColumns);

    for (uint64_t j = 0; j < numberOfRows * numberOfColumns; j++)
    {
      uint8_t pixel;
      Read(file, &pixel, 1);
      data[i].push_back((double)pixel / 255);
    }
  }

  return true;
}

bool ReadMNISTLabels(std::ifstream &file, std::vector<uint8_t> &data)
{
  uint32_t magicNumber;
  uint32_t numberOfLabels;

  Read(file, &magicNumber, 4);
  magicNumber = SwapEndian(magicNumber);
  if (magicNumber != 2049)
  {
    std::cout << "Incorrect label file magicNumber: " << magicNumber << std::endl;
    return false;
  }

  Read(file, &numberOfLabels, 4);
  numberOfLabels = SwapEndian(numberOfLabels);

  data.reserve(numberOfLabels);

  for (uint64_t i = 0; i < numberOfLabels; i++)
  {
    uint8_t pixel;
    Read(file, pixel);
    data.push_back(pixel);
  }

  return true;
}

These are the MSE outputs:

MSE: 0.898595
MSE: 0.899916
MSE: 0.899992
MSE: 0.899984
MSE: 0.899999
MSE: 0.9
MSE: 0.899999
MSE: 0.899999
MSE: 0.9
MSE: 0.9
correct: 979, wrong: 9021, total: 10000

At first I thought I was using the activation functions incorrectly, but even after switching to different activation functions the output stayed exactly the same. The same thing happened when I changed the hidden-layer topology. Changing the learning rate had some effect on the MSE, but it still hovered around the same value. I don't understand why it keeps producing the same output over and over; I expected the MSE to decrease after every fit. Moreover, when I run the same network multiple times with different initializations, the number of correct and wrong predictions stays the same, as does the MSE at each fit. Is something wrong with the ApplyGradientDescent method, or am I going crazy? Please lend a hand and explain what went wrong.


Solution

  • The problem was a missing minus sign in the cost derivative. After adding that single sign, the problem was virtually gone. Thanks to @Alexey S. Larionov for pinpointing my error. A sketch of the corrected function follows.
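
For reference, here is a minimal sketch of what that fix looks like in the posted code, assuming the derivative is taken with respect to the predicted value so that it matches the weight -= learningRate * delta * input updates in ApplyGradientDescent:

// CostDerivative with the corrected sign
// Cost(predicted, actual) = (actual - predicted)^2, so
// d/d(predicted) Cost = -2 * (actual - predicted) = 2 * (predicted - actual)
double CostDerivative(
    const double &predicted,
    const double &actual)
{
  return 2 * (predicted - actual);
}

With the original sign, the deltas computed in CalculateDeltas pointed in the direction of increasing cost, so the subtraction in ApplyGradientDescent was effectively performing gradient ascent, which is why the MSE never decreased.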