c++, cuda, thrust

Using thrust is slower than my own kernel?


EDIT

I changed the code as Robert suggested, but thrust is still much slower.

The data I used comes from two .dat files, so I have omitted it from the code.

Original problem

I have two complex vectors that have been placed on a GPU (Tesla M6). I want to compute the element-wise product of the two vectors, namely [x1*y1,...,xN*yN]. The length of both vectors is N = 720,896.
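
For reference, the intended operation is just an element-wise complex multiplication. A minimal host-side sketch of the result I expect (names here are only illustrative, not part of the GPU code below) is:

#include <complex>
#include <vector>

// Host-side reference for the element-wise (Hadamard) product:
// result[i] = x[i] * y[i] for every i.
std::vector<std::complex<float>> hadamard_ref(const std::vector<std::complex<float>> &x,
                                              const std::vector<std::complex<float>> &y)
{
    std::vector<std::complex<float>> result(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        result[i] = x[i] * y[i];
    return result;
}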

Code snippet (modified)

I solved this problem in two ways. One uses thrust, with a type conversion and a dedicated functor:

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sys/time.h>

#include "cuda_runtime.h"
#include "cuComplex.h"

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/complex.h>
#include <thrust/transform.h>
#include <thrust/functional.h>


using namespace std;

typedef thrust::complex<float> comThr;

// ---- struct for thrust ----//
struct Complex_Mul_Complex :public thrust::binary_function<comThr, comThr, comThr>
{
    __host__ __device__
    comThr operator() (comThr a, comThr b) const{
        return a*b;
    }
};

// ---- my kernel function ---- //
__global__ void HardamarProductOnDeviceCC(cuComplex *Result, cuComplex *A, cuComplex *B, int N)
{
unsigned int tid = threadIdx.x;
unsigned int index = threadIdx.x + blockDim.x * blockIdx.x;

if(index >= N)
    return;
Result[index].x = cuCmul(A[index],B[index]).x;
Result[index].y = cuCmul(A[index],B[index]).y;

}

// ---- timing function ---- //
double cpuSecond()
{
    struct timeval tp;
    struct timezone tzp;
    int i = gettimeofday(&tp, &tzp);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
int main()
{
int N = 720896;
cuComplex *d_Data1, *d_Data2;
cudaMalloc(&d_Data1, N*sizeof(d_Data1[0]));
cudaMalloc(&d_Data2, N*sizeof(d_Data2[0]));
/************************************
 * Version 1: type conversion twice *
 ************************************/
// step 1: type convert (cuComplex->thrust)
comThr *thr_temp1 = reinterpret_cast<comThr*>(d_Data1);
thrust::device_ptr<comThr> thr_d_Data1 = thrust::device_pointer_cast(thr_temp1);

comThr *thr_temp2 = reinterpret_cast<comThr*>(d_Data2);
thrust::device_ptr<comThr> thr_d_Data2 = thrust::device_pointer_cast(thr_temp2);

// step 2: product and timing
Complex_Mul_Complex op_dot;
double iStart = cpuSecond();  // start timing
for(int i=0;i<1000;i++){    // loop 1000 times to get accurate time consumption
  thrust::transform(thrust::device,thr_d_Data1,thr_d_Data1+N,
    thr_d_Data2,thr_d_Data1,op_dot); 
}
cudaDeviceSynchronize();
double iElapse = cpuSecond() - iStart;
cout << "thrust time consume: " << iElapse <<endl;

/************************************************
 * Version 2: dot product using kernel function *
 ************************************************/
int blockSize;
int minGridSize;
int gridSize;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, HardamarProductOnDeviceCC, 0, 0);

gridSize = (N+blockSize-1)/blockSize;
dim3 grid(gridSize);
dim3 block(blockSize);
iStart = cpuSecond();
for(int i=0;i<1000;i++){
    HardamarProductOnDeviceCC<<<grid,block>>>(d_Data1,d_Data1,d_Data2,N);
}
cudaDeviceSynchronize();
iElapse = cpuSecond() - iStart;
cout << "kernel time consume: " << iElapse <<endl;
}

Result:
thrust time consume: 25.6063
kernel time consume: 2.87929

My question

After adding cudaDeviceSynchronize(), the thrust version still appears to be much slower than the kernel version. There is a "golden rule" to use libraries instead of writing your own kernel functions, but I want to know why the thrust version is slower in this situation.


Solution

  • CUDA kernel launches are asynchronous: control returns to the host thread immediately after the launch, so the host can proceed with the next line of code before the kernel has even started to execute.

    This is covered in numerous questions here on the cuda tag, and it is a common mistake when timing CUDA code. It can affect the way you time thrust code as well as the way you time ordinary CUDA code. The usual solution is to insert a cudaDeviceSynchronize() call before closing the timing region; this ensures that all CUDA activity is complete when you finish your timing measurement. (A minimal sketch isolating this timing pattern is given at the end of this answer.)

    When I turned what you have into a complete code with proper timing methods, the thrust code was actually faster. Your kernel design is also inefficient: it calls cuCmul twice per element and stores the real and imaginary parts separately. Here is my version of your code, running on CUDA 10 on a Tesla P100, showing that the timings for the two cases are nearly the same:

    $ cat t469.cu
    #include <thrust/transform.h>
    #include <thrust/complex.h>
    #include <thrust/device_ptr.h>
    #include <thrust/execution_policy.h>
    #include <cuComplex.h>
    
    #include <iostream>
    #include <time.h>
    #include <sys/time.h>
    #include <cstdlib>
    #define USECPSEC 1000000ULL
    
    long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    typedef thrust::complex<float> comThr;
    
    struct Complex_Mul_Complex :public thrust::binary_function<comThr, comThr, comThr>
    {
        __host__ __device__
        comThr operator() (comThr a, comThr b) const{
            return a*b;
        }
    };
    double cpuSecond(){
      long long dt = dtime_usec(0);
      return dt/(double)USECPSEC;
    }
    __global__ void HardamarProductOnDeviceCC(cuComplex *Result, cuComplex *A, cuComplex *B, int N)
    {
      unsigned int index = threadIdx.x + blockDim.x * blockIdx.x;
    
      if(index < N)
        Result[index] = cuCmulf(A[index],B[index]);
    }
    
    int main(){
    int N = 720896;
    cuComplex *d_Data1, *d_Data2;
    cudaMalloc(&d_Data1, N*sizeof(d_Data1[0]));
    cudaMalloc(&d_Data2, N*sizeof(d_Data2[0]));
    // step 1: type convert (cuComplex->thrust)
    comThr *thr_temp1 = reinterpret_cast<comThr*>(d_Data1);
    thrust::device_ptr<comThr> thr_d_Data1 = thrust::device_pointer_cast(thr_temp1);
    
    comThr *thr_temp2 = reinterpret_cast<comThr*>(d_Data2);
    thrust::device_ptr<comThr> thr_d_Data2 = thrust::device_pointer_cast(thr_temp2);
    
    // step 2: product and timing
    Complex_Mul_Complex op_dot;
    double iStart = cpuSecond();  // timing class
    for(int i=0;i<1000;i++){    // loop 1000 times to get accurate time consumption
      thrust::transform(thrust::device,thr_d_Data1,thr_d_Data1+N,
        thr_d_Data2,thr_d_Data1,op_dot);
    }
    cudaDeviceSynchronize();
    double iElapse = cpuSecond() - iStart;
    std::cout << "thrust time consume: " << iElapse <<std::endl;
    
    int blockSize;
    int minGridSize;
    int gridSize;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, HardamarProductOnDeviceCC, 0, 0);
    
    gridSize = (N+blockSize-1)/blockSize;
    std::cout << "gridsize: " << gridSize << " blocksize: " << blockSize << std::endl;
    dim3 grid(gridSize);
    dim3 block(blockSize);
    iStart = cpuSecond();
    for(int i=0;i<1000;i++){
        HardamarProductOnDeviceCC<<<grid,block>>>(d_Data1,d_Data1,d_Data2,N);
    }
    cudaDeviceSynchronize();
    iElapse = cpuSecond() - iStart;
    std::cout << "kernel time consume: " << iElapse <<std::endl;
    }
    $ nvcc -o t469 t469.cu
    $ ./t469
    thrust time consume: 0.033937
    gridsize: 704 blocksize: 1024
    kernel time consume: 0.0337021
    $
    

    Note: In order to demonstrate the correctness of my answer, it's important to me to provide a complete code. If you want help from others, my suggestion is to likewise provide a complete code, not bits and pieces that have to be assembled and then turned into a complete code by adding includes, etc. You're welcome to do as you wish, of course, but if you make it easier for others to help you, you may find that you get help more easily.
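
    As mentioned above, here is a minimal sketch that isolates the timing issue (the kernel and names are illustrative, not taken from the codes above): without the cudaDeviceSynchronize() before reading the timer, the host measures little more than the launch overhead.

    #include <cuda_runtime.h>
    #include <sys/time.h>
    #include <iostream>

    // Illustrative kernel: doubles each element of an array.
    __global__ void dummy_kernel(float *d, int n)
    {
      int i = threadIdx.x + blockDim.x * blockIdx.x;
      if (i < n) d[i] = 2.0f * d[i];
    }

    double host_seconds()
    {
      timeval tv;
      gettimeofday(&tv, 0);
      return (double)tv.tv_sec + (double)tv.tv_usec * 1.e-6;
    }

    int main(){
    int n = 1 << 20;
    float *d;
    cudaMalloc(&d, n*sizeof(float));

    double t0 = host_seconds();
    dummy_kernel<<<(n+255)/256, 256>>>(d, n);
    double t_no_sync = host_seconds() - t0;   // measures mostly launch overhead

    t0 = host_seconds();
    dummy_kernel<<<(n+255)/256, 256>>>(d, n);
    cudaDeviceSynchronize();                  // wait until the kernel has finished
    double t_sync = host_seconds() - t0;      // measures the actual kernel run time

    std::cout << "no sync: " << t_no_sync << " s, with sync: " << t_sync << " s" << std::endl;
    cudaFree(d);
    }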