Tags: c++, oop, templates, inheritance, strategy-pattern

Issues with the Strategy Pattern in C++


I'm implementing a neural network for learning purposes.

I have the following templated classes:

  • an Optimizer base class with a virtual update method taking a Parameters class reference as an argument
  • a Parameters class that contains the weights and biases, and that needs to call the update method of the optimizer
  • a Linear class that stores its information in the Parameters class
  • an SGD class that inherits from the Optimizer base class (Strategy Pattern), where I override the update method.

The problem is that the Parameters class is templated: when I pass it as a reference to the update method of the optimizer, the compiler complains that a virtual method cannot be templated, and if I pass it non-templated, it complains that Parameters is missing its template arguments.
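
In other words, neither of these two forms of Optimizer::update compiles (a minimal sketch of the conflict, with the Fastor types stripped out):

class Optimizer{
  public:
    // A virtual member function cannot be a function template:
    template<size_type input_features, size_type output_features>
    virtual void update(Parameters<input_features, output_features>& parameters) = 0; // error

    // And Parameters is a class template, so it cannot be used without its template arguments:
    virtual void update(Parameters& parameters) = 0; // error
};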

I also tried the CRTP idiom to make the Strategy Pattern static, so that I could pass templated arguments to the update method, but then the compiler complains that the base class used for the pointer where I store the optimizer in Parameters would itself have to be templated.


#include <iostream>
#include <vector>
#include <random>
#include <memory>
#include <string>
#include <cmath>
#include <cstddef>

using size_type = std::size_t;

// #include "tensor_algebra.h"  here I define aliases for Tensor operations
// to avoid using global namespaces,
// but for testing you can use the following include

#include <Fastor/Fastor.h>
using namespace Fastor;

template<size_type input_features, size_type output_features>
struct Parameters;

class Optimizer{
  public:
    virtual void update(Parameters& parameters) = 0;
    virtual ~Optimizer() = default;
};

template<size_type input_features, size_type output_features>
struct Parameters{
    Tensor<float, input_features, output_features> weight;
    Tensor<float, input_features, output_features> weight_gradient;
    Tensor<float, output_features> bias;
    Tensor<float, output_features> bias_gradient;


    Parameters(const std::string& initializer = "he")
    : bias(0) {

      std::random_device rd;
      std::mt19937 generator(rd());
      std::normal_distribution<float> distribution;

      // std::string cannot be used in a switch, so select the initializer with if/else
      if(initializer == "he"){
        distribution = std::normal_distribution<float>(0, std::sqrt(2.0 / input_features));
      }
      else if(initializer == "xavier"){
        distribution = std::normal_distribution<float>(0, std::sqrt(2.0 / (input_features + output_features)));
      }
      else{
        std::cout << "Invalid initializer" << std::endl;
      }

      for(size_type i = 0; i < input_features; ++i){
        for(size_type j = 0; j < output_features; ++j){
          weight(i, j) = distribution(generator);
        }
      }
    }

    std::shared_ptr<Optimizer> optimizer; // set by the Linear layer via set_optimizer()

    void update(){
      optimizer->update(*this);
    }
};


class SGD : public Optimizer{
  public:
    SGD(float learning_rate) : learning_rate_(learning_rate) {}

    void update(Parameters& parameters){
      parameters.weight -= learning_rate_ * parameters.weight_gradient;
      parameters.bias -= learning_rate_ * parameters.bias_gradient;
    }

  private:
    float learning_rate_;
};

//Linear layer class

template<size_type input_features, size_type output_features>
class Linear{
  public:

    Linear(float learning_rate, const std::string& initializer = "he")
      : parameters(initializer){} // the learning rate is used by the optimizer, not by Parameters
        
    void set_optimizer(std::shared_ptr<Optimizer> optimizer){
      parameters.optimizer = optimizer;
    } // Setting the optimizer in the linear layer.

    //forward method
    template<size_type batch_size>
    Tensor<float, batch_size, output_features> forward(const Tensor<float, batch_size, input_features> &input){
      Tensor<float,batch_size> ones(1);
      return matmul(input, parameters.weight) + outer(ones, parameters.bias); 
    }
  
    //backward method
    template<size_type batch_size>
    Tensor<float, batch_size, input_features> backward(
      const Tensor<float, batch_size, output_features> &gradient,
      const Tensor<float, batch_size, input_features>& input
    ){
      parameters.weight_gradient = matmul(transpose(input), gradient);
      parameters.bias_gradient = 0.0; // Initialize bias_gradient with zeros
      
      for (size_type i = 0; i < batch_size; i++) {
        for (size_type j = 0; j < output_features; j++){
          parameters.bias_gradient(j) += gradient(i, j);
        }
      }

      Tensor<float, batch_size, input_features> input_gradient = matmul(gradient, transpose(parameters.weight));
      return input_gradient;
    }
  
  private:
    Parameters<input_features, output_features> parameters;
};


int main(){
  Linear<2,3> linear(0.01);
  linear.set_optimizer(std::make_shared<SGD>(0.01));
  Tensor<float, 2, 2> input = {{1, 2}, {3, 4}};
  Tensor<float, 2, 3> output = linear.forward(input);
  std::cout << output << std::endl;
  return 0;
}

I tried this:

template<class Derived>
class Optimizer{
  public:
    Derived& self(){return static_cast<Derived&>(*this);}
    const Derived& self() const {return static_cast<const Derived&>(*this);}

    template<size_type input_size, size_type output_size>
    void update(Parameters<input_size,output_size>& parameters){
      self().update(parameters);
    }
};

class SGD : public Optimizer<SGD>{
  public:
    SGD(float learning_rate) : learning_rate_(learning_rate) {}

    template<size_type input_size, size_type output_size>
    void update(Parameters<input_size,output_size>& parameters){
      parameters.weight -= learning_rate_ * parameters.weight_gradient;
      parameters.bias -= learning_rate_ * parameters.bias_gradient;
    }

  private:
    float learning_rate_;
};

but it doesn't seem to work.
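
To be concrete, the error points at the member where I store the optimizer inside Parameters; once Optimizer takes a Derived parameter there is no single base type left to put there (a sketch of the conflicting declaration, not my exact code):

template<size_type input_features, size_type output_features>
struct Parameters{
  // With CRTP each concrete optimizer has its own base, Optimizer<SGD>, Optimizer<...>,
  // so there is no non-templated Optimizer type to name here:
  std::shared_ptr<Optimizer</* which Derived? */>> optimizer; // error
};

Any advice?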


Solution

  • I cannot fully understand what you really want to do, because your code does not compile as posted and there is no call to Parameters::update().

    However, I tried something to make it compile and to call Parameters::update() from Linear.
    What I did is simply make all of the classes templates in the same way.
    I hope it will be of some help.

    Note: I don't have the unknown "Fastor.h", so the following code became skeleton-like (all unknown type data and the processing for them were simply removed).

    #include <iostream>
    #include <memory>
    #include <string>
    #include <cstddef>
    
    using size_type = std::size_t;
    
    template<size_type input_features, size_type output_features>
    struct Parameters;
    
    template<size_type input_features, size_type output_features>
    class Optimizer{
    public:
        virtual void update( Parameters<input_features,output_features>& parameters ) = 0;
        virtual ~Optimizer() = default;
    };
    
    template<size_type input_features, size_type output_features>
    struct Parameters
    {
        std::shared_ptr< Optimizer<input_features,output_features> > optimizer;
    
        Parameters(const std::string& initializer = "he"){  /*NOP*/ }
        void update(){  optimizer->update(*this);   }
    };
    
    template<size_type input_features, size_type output_features>
    class SGD : public Optimizer<input_features, output_features>
    {
    public:
        SGD(float learning_rate) : learning_rate_(learning_rate) {}
    
        void update( Parameters<input_features,output_features> &parameters ) override
        {   std::cout << "SGD::update() called" << std::endl;   }
    
    private:
        float learning_rate_;
    };
    
    //Linear layer class
    template<size_type input_features, size_type output_features>
    class Linear{
    public:
        Linear( float learning_rate, const std::string& initializer = "he")
            : parameters( /*learning_rate,*/ initializer){}
    
        void set_optimizer( std::shared_ptr< Optimizer<input_features,output_features> > optimizer )
        {   parameters.optimizer = optimizer;   }
    
        //Test method I added
        void DoSomething(){ parameters.update();    }
    
    private:
        Parameters<input_features, output_features> parameters;
    };
    
    
    int main(){
        Linear<2,3> linear(0.01f);
        linear.set_optimizer( std::make_shared< SGD<2,3> >(0.01f) );
        linear.DoSomething();
        return 0;
    }
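
    Running this prints SGD::update() called once: Parameters::update() reaches SGD::update() through the templated Optimizer<2,3> base, so the strategy still dispatches at run time. The trade-off is that an optimizer object is now tied to one layer shape, which is why main creates an SGD<2,3> to match Linear<2,3>.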