c++, cuda, thrust

CUDA: how to do a matrix multiplication using thrust?


I'm new to CUDA and Thrust, and I'm trying to implement matrix multiplication. I want to achieve this using only Thrust algorithms, because I want to avoid calling a kernel manually.

Is there a way I can achieve this efficiently? (At least without using 2 nested for loops.)

Or do I have to resign myself to calling a CUDA kernel?

//My data
thrust::device_vector<float> data(n*m);
thrust::device_vector<float> other(m*r);
thrust::device_vector<float> result(n*r);

// To make indexing faster, not really needed
transpose(other);

// My current approach
for (int i = 0; i < n; ++i)
{
   for (int j = 0; j < r;++j)
   {
       result[i*r + j] = thrust::inner_product(data.begin()+(i*m), data.begin()+((i+1)*m), other.begin()+(j*m), 0.0f);
   }
}

Solution

  • If you are interested in performance (which is usually why people use GPUs for computing tasks), you should not use thrust and you should not call or write your own CUDA kernel: you should use the CUBLAS library. As a learning exercise, if you do want to write your own CUDA kernel, you can refer to the first-level-optimized CUDA version in the shared-memory section of the CUDA programming guide (a minimal sketch of that idea follows). And if you really want to do it with a single thrust call, it is possible.
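
    For reference, here is a minimal sketch of that shared-memory (tiled) kernel idea, in the spirit of the programming-guide example. The kernel name and the simplifying assumption that every dimension is a multiple of the tile size are mine, not part of the guide:

    // Sketch: C (n x r) = A (n x m) * B (m x r), all row-major.
    // Assumes n, m and r are multiples of TILE, purely to keep the sketch short.
    constexpr int TILE = 16;

    __global__ void tiled_matmul(const float *A, const float *B, float *C,
                                 int n, int m, int r)
    {
      __shared__ float As[TILE][TILE];
      __shared__ float Bs[TILE][TILE];
      int row = blockIdx.y * TILE + threadIdx.y;   // row of C this thread computes
      int col = blockIdx.x * TILE + threadIdx.x;   // column of C this thread computes
      float sum = 0.0f;
      for (int t = 0; t < m / TILE; ++t) {
        // stage one tile of A and one tile of B in shared memory
        As[threadIdx.y][threadIdx.x] = A[row * m + t * TILE + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(t * TILE + threadIdx.y) * r + col];
        __syncthreads();
        for (int k = 0; k < TILE; ++k)
          sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();
      }
      C[row * r + col] = sum;
    }
    // launch:  dim3 block(TILE, TILE), grid(r / TILE, n / TILE);
    //          tiled_matmul<<<grid, block>>>(dA, dB, dC, n, m, r);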

    The basic idea is to use an element-wise operation like thrust::transform as described here. The per-output-array-element dot-product is computed with a functor consisting of a loop.

    Here's a worked example comparing 3 methods: your original nested-loop method (relatively slow), a single-thrust-call method (faster), and the cublas method (fastest, certainly for larger matrix sizes). The code below only runs method 1 for matrix side dimensions of 200 or less, because it is so slow. Here is an example run on a Tesla P100:

    $ cat t463.cu
    #include <thrust/device_vector.h>
    #include <thrust/transform.h>
    #include <thrust/inner_product.h>
    #include <thrust/execution_policy.h>
    #include <thrust/equal.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/fill.h>
    #include <cublas_v2.h>
    #include <iostream>
    #include <time.h>
    #include <sys/time.h>
    #include <cstdlib>
    #define USECPSEC 1000000ULL
    
    long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    struct dp
    {
      float *A, *B;
      int m,n,r;
      dp(float *_A, float *_B, int _m, int _n, int _r): A(_A), B(_B), m(_m), n(_n), r(_r) {};
      __host__ __device__
      float operator()(size_t idx){
        float sum = 0.0f;
        int row = idx/r;
        int col = idx - (row*r); // cheaper modulo
        // A is the first matrix stored transposed (m x n, row-major) and B is the
        // second matrix (m x r, row-major), so each output element accumulates a
        // "column" of A against a "column" of B
        for (int i = 0; i < m; i++)
          sum += A[row + i*n] * B[col + i*r];
        return sum;}
    };
    
    const int dsd = 200;
    int main(int argc, char *argv[]){
      int ds = dsd;
      if (argc > 1) ds = atoi(argv[1]);
      const int n = ds;
      const int m = ds;
      const int r = ds;
      // data setup
      thrust::device_vector<float> data(n*m,1);
      thrust::device_vector<float> other(m*r,1);
      thrust::device_vector<float> result(n*r,0);
      // method 1
      //let's pretend that other is (already) transposed for efficient memory access by thrust
      // therefore each dot-product is formed using a row of data and a row of other
      long long dt = dtime_usec(0);
        if (ds < 201){
        for (int i = 0; i < n; ++i)
        {
          for (int j = 0; j < r;++j)
          {
             result[i*r+ j] = thrust::inner_product(data.begin()+(i*m), data.begin()+((i+1)*m),other.begin()+(j*m), 0.0f);
          }
        }
        cudaDeviceSynchronize();
        dt = dtime_usec(dt);
        if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
          std::cout << "method 1 time: " << dt/(float)USECPSEC << "s" << std::endl;
        else
          std::cout << "method 1 failure" << std::endl;
        }
      thrust::fill(result.begin(), result.end(), 0);
      cudaDeviceSynchronize();
    // method 2
      //let's pretend that data is (already) transposed for efficient memory access by thrust
      // therefore each dot-product is formed using a column of data and a column of other
      dt = dtime_usec(0);
      thrust::transform(thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(n*r), result.begin(), dp(thrust::raw_pointer_cast(data.data()), thrust::raw_pointer_cast(other.data()), m, n, r));
      cudaDeviceSynchronize();
      dt = dtime_usec(dt);
      if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
        std::cout << "method 2 time: " << dt/(float)USECPSEC << "s" << std::endl;
      else
        std::cout << "method 2 failure" << std::endl;
    // method 3
      // once again, let's pretend the data is ready to go for CUBLAS
      cublasHandle_t h;
      cublasCreate(&h);
      thrust::fill(result.begin(), result.end(), 0);
      float alpha = 1.0f;
      float beta = 0.0f;
      cudaDeviceSynchronize();
      dt = dtime_usec(0);
      cublasSgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, n, r, m, &alpha, thrust::raw_pointer_cast(data.data()), n, thrust::raw_pointer_cast(other.data()), m, &beta, thrust::raw_pointer_cast(result.data()), n);
      cudaDeviceSynchronize();
      dt = dtime_usec(dt);
      if (thrust::equal(result.begin(), result.end(), thrust::constant_iterator<float>(m)))
        std::cout << "method 3 time: " << dt/(float)USECPSEC << "s" << std::endl;
      else
        std::cout << "method 3 failure" << std::endl;
    }
    $ nvcc -o t463 t463.cu -lcublas
    $ ./t463
    method 1 time: 20.1648s
    method 2 time: 6.3e-05s
    method 3 time: 5.7e-05s
    $ ./t463 1024
    method 2 time: 0.008063s
    method 3 time: 0.000458s
    $
    

    For the default side dimension of 200, the single-thrust-call and cublas methods are fairly close in speed, but both are much faster than the loop method. For a side dimension of 1024, the cublas method is almost 20x faster than the single-thrust-call method.

    Note that I have chosen the "optimal" transpose configuration for each of the 3 methods. For method 1, the best-case timing is when the inner_product uses a "row" from each input matrix (effectively the transpose of the 2nd input matrix). For method 2, the best-case timing is when the functor traverses a "column" from each input matrix (effectively the transpose of the first input matrix). For method 3, choosing CUBLAS_OP_T for both input matrices seems to be fastest. In reality, only the cublas method has the flexibility to be useful for a variety of input cases with good performance.
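
    As an illustration of that flexibility, here is a sketch (not part of the timed example above; the helper function name is mine) of how the plain row-major, non-transposed inputs from the question could be multiplied with a single cublasSgemm call. The usual trick is to swap the operands, so that cuBLAS's column-major view of the row-major buffers works out to C = A*B:

    #include <thrust/device_vector.h>
    #include <cublas_v2.h>

    // Sketch: C (n x r) = A (n x m) * B (m x r), all row-major, no explicit transpose.
    // cuBLAS sees the row-major buffers as column-major B^T (r x m) and A^T (m x n),
    // so computing B^T * A^T in column-major order yields (A*B)^T column-major,
    // which is exactly A*B in row-major storage.
    void row_major_sgemm(cublasHandle_t h,
                         const thrust::device_vector<float> &A,
                         const thrust::device_vector<float> &B,
                         thrust::device_vector<float> &C,
                         int n, int m, int r)
    {
      float alpha = 1.0f, beta = 0.0f;
      cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N,
                  r, n, m,                               // dims of the column-major product
                  &alpha,
                  thrust::raw_pointer_cast(B.data()), r, // B^T as cuBLAS sees it
                  thrust::raw_pointer_cast(A.data()), m, // A^T as cuBLAS sees it
                  &beta,
                  thrust::raw_pointer_cast(C.data()), r);
    }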