Tags: c++, vector, cuda, reduce, dot-product

Which of the following approaches is more suitable for CUDA parallelism?


I found two approaches to calculating the dot product of two vectors in CUDA. Which of the two is more suitable for CUDA parallelism?

  1. The first approach is a simple parallelization that uses a single block of threads.

    The dotProductKernel() function computes the dot product of two input vectors a and b using parallel processing on the GPU. Each thread in the single block processes a segment of the input vectors and calculates a partial sum. The partial sums are then summed up using atomicAdd() to produce the final dot product.

    #include <vector>
    #include <iostream>
    
    __global__ void dotProductKernel(int *a, int *b, int *result, int n) {
        //Get the thread index
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        //Only calculate if thread index is valid
        if (tid < n) {
           int sum = 0;
           //Perform dot product calculation for this thread's segment of the arrays
           for (int i = tid; i < n; i+= blockDim.x * gridDim.x) {
               sum += a[i] * b[i];
           }
           //Atomically add this thread's result to the shared result
           atomicAdd(result, sum);
        }
    }
    
    int main() {
        int n = 10000;
    
        // Use vectors instead of raw pointers
        std::vector<int> a(n), b(n);
        int *c = (int*) malloc(sizeof(int));
    
        // Initialize the input vectors
        for (int i = 0; i < n; i++) {
            a[i] = i+1;
            b[i] = i+1;
        }
        *c = 0;
    
    
        // Allocate memory on the GPU
        int* d_a, *d_b, *d_c;
        cudaMalloc(&d_a, a.size() * sizeof(int));
        cudaMalloc(&d_b, b.size() * sizeof(int));
        cudaMalloc(&d_c, 1 * sizeof(int));
    
        // Copy vectors to GPU
        cudaMemcpy(d_a, a.data(), a.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b.data(), b.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_c, c, 1 * sizeof(int), cudaMemcpyHostToDevice);
    
        // Launch kernel
        dotProductKernel<<<1, 1024>>>(d_a, d_b, d_c, n);
    
        // Copy result back from GPU
        cudaMemcpy(c, d_c, 1 * sizeof(int),cudaMemcpyDeviceToHost);
    
        // Free GPU memory
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
    
        // Print the result
        std::cout << "Dot product = " << *c << '\n';
    
        free(c);
    }
    
    
  2. The second approach stores per-thread partial sums in a results array and reduces them with a second kernel.

    In the dotProductKernel() function, each thread computes a partial sum of the dot product of the two input vectors a and b and stores it in a results array in global memory. The partial sums are calculated in parallel, with each thread processing a segment of the input vectors.

    In the sumResultsKernel() function, a single thread (tid == 0) sums up the partial results stored in the results array and writes the final value to the location pointed to by result. Although the kernel is launched with many threads, this final reduction step is performed by one thread and is effectively serial.

    #include <vector>
    #include <iostream>
    
    __global__ void dotProductKernel(int *a, int *b, int *results, int n) {
        //Get the thread index
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        //Only calculate if thread index is valid
        if (tid < n) {
           int sum = 0;
           //Perform dot product calculation for this thread's segment of the arrays
           for (int i = tid; i < n; i+= blockDim.x * gridDim.x) {
               sum += a[i] * b[i];
           }
           //Store this thread's result in the shared results array
           results[blockIdx.x * blockDim.x + threadIdx.x] = sum;
        }
    }
    
    __global__ void sumResultsKernel(int *results, int *result, int n) {
        //Get the thread index
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        //Only calculate if thread index is valid
        if (tid == 0) {
           int sum = 0;
           //Sum up the partial results
           for (int i = 0; i < n; i++) {
               sum += results[i];
           }
           //Store the final result
           *result = sum;
        }
    }
    
    int main() {
        int n = 10000;
    
        // Use vectors instead of raw pointers
        std::vector<int> a(n), b(n);
        int *c = (int*) malloc(sizeof(int));
    
        // Initialize the input vectors
        for (int i = 0; i < n; i++) {
            a[i] = i+1;
            b[i] = i+1;
        }
        *c = 0;
    
        // Determine the grid size and block size
        int blockSize = 1024;
        int gridSize = (n + blockSize - 1) / blockSize;
    
        // Allocate memory on the GPU
        int* d_a, *d_b, *d_results, *d_result;
        cudaMalloc(&d_a, a.size() * sizeof(int));
        cudaMalloc(&d_b, b.size() * sizeof(int));
        cudaMalloc(&d_results, gridSize * blockSize * sizeof(int));
        cudaMalloc(&d_result, 1 * sizeof(int));
    
        // Copy vectors to GPU
        cudaMemcpy(d_a, a.data(), a.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b.data(), b.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemset(d_results, 0, gridSize * blockSize * sizeof(int));
        cudaMemset(d_result, 0, sizeof(int));
    
        // Launch kernel
        dotProductKernel<<<gridSize, blockSize>>>(d_a, d_b, d_results, n);
    
        // Sum up the partial results
        sumResultsKernel<<<1, 1024>>>(d_results, d_result, gridSize * blockSize);
    
        // Copy result back from GPU
        cudaMemcpy(c, d_result, 1 * sizeof(int),cudaMemcpyDeviceToHost);
    
        // Free GPU memory
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_results);
        cudaFree(d_result);
    
        // Print the result
        std::cout << "Dot product = " << *c << '\n';
    
        free(c);
    }
    

Solution

  • The only reason I know of that people use CUDA is to make their code run faster. I'm not sure what "suitable for CUDA parallelism" means, or why it would be necessary to consider something other than performance as a figure of merit.

    If we consider performance, then, the best approach will be a combination of the two methods you have shown. The first two priorities of any CUDA programmer are:

    1. expose enough parallelism; this roughly translates into "use lots of threads in your kernel launch"
    2. make efficient use of memory

    The first method makes efficient use of memory in that each thread does a running-sum reduction out of global memory using a coalesced grid-stride loop. It is inefficient, however, in that it puts unnecessary pressure on the atomic mechanism with potentially many atomic operations: even in the suboptimal single-block case shown, there are 1024 atomic calls when only one per block (or none at all) is actually needed.

    The first method is also not optimal in that it launches only 1 block. This is almost never optimal in CUDA (not enough threads); it leaves much of the GPU idle.

    To rectify these problems and come close to optimal speed, we will want to combine aspects of both methods:

    • use a running sum per thread to initially gather all values from global memory in a coalesced fashion
    • choose a grid size to "fill" the GPU being used
    • once the running product-sums per thread are complete, perform an intra-threadblock reduction. A canonical shared memory sweep style reduction is probably optimal here.
    • the preceding step will place each threadblock's partial sum in a single location. Atomically add this location to the global sum. This eliminates the need for multiple kernel launches (each of which costs on the order of 5 microseconds or more), as used in the second method and in other canonical reduction treatments.

    The following achieves these design goals:

    const int BLOCK_SIZE=512; // must be a power of 2 less than 2048, and preferably larger than 32
    template <typename T>
    __global__ void dotProductKernel(const T * __restrict__ a, const T * __restrict__ b, T * __restrict__ result, size_t n)
    {
        __shared__ T smem[BLOCK_SIZE];
        //Get the thread index
        size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        T sum = 0;
        //Perform dot product calculation for this thread's segment of the arrays
        for (size_t i = tid; i < n; i+= gridDim.x*blockDim.x) {
          sum += a[i] * b[i];
          }
        //Perform threadblock sweep reduction
        smem[threadIdx.x] = sum;
        for (int i = BLOCK_SIZE>>1; i > 0; i>>=1){
          __syncthreads();
          if (threadIdx.x < i) smem[threadIdx.x] += smem[threadIdx.x+i];}      
        //Atomically add this block's result to the global result
        if ((!threadIdx.x) && (tid < n)) atomicAdd(result, smem[0]);
    }
    

    Again, you would typically want to launch the above kernel with BLOCK_SIZE threads per block and a number of blocks that is a whole-number multiple of the number of SMs in your GPU. Some tuning of these numbers may be necessary. As a starting point, I would choose a BLOCK_SIZE of 512 and two, three, or four times the number of SMs as the number of blocks to launch.
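
    As a rough illustration (the cudaDeviceGetAttribute() query is a standard CUDA runtime call, but the 4x-SMs multiplier and the variable names are my own assumptions, not something prescribed above), a host-side launch along those lines might look like this, assuming d_a, d_b, d_result, and n are set up as in the earlier examples:

    int dev = 0;
    cudaSetDevice(dev);
    int numSMs = 0;
    // Query the number of SMs on the selected device
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
    int numBlocks = 4 * numSMs;   // 2x-4x the SM count is a reasonable starting point; tune as needed
    dotProductKernel<<<numBlocks, BLOCK_SIZE>>>(d_a, d_b, d_result, n);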

    This method will generally work well for larger dot-products. Special adaptations might be desirable for very small dot products, where the length of the vector is less than the instantaneous thread-carrying capacity of the GPU (number of SMs times maximum number of threads per SM). For example, you might choose BLOCK_SIZE of 32 for very small dot products.

    Some of these concepts are covered in units 3-4 of this online training (basic CUDA optimization goals), and some additional topics (reductions and atomics) are covered in unit 5 of that training.

    As a final note, this question seems oriented towards "learning". If the goal is to implement something for production, the usual advice is to use a library implementation when/where available: cuBLAS includes a dot product (albeit not for integer types), and you can also realize a dot product with, for example, Thrust's transform_reduce (a minimal sketch follows below).
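
    For completeness, here is a minimal Thrust sketch of a dot product. I have used thrust::inner_product for brevity; transform_reduce over a zip iterator with a multiply functor is the equivalent, slightly more verbose formulation. The vector length and fill values simply mirror the test case below and are not significant:

    #include <thrust/device_vector.h>
    #include <thrust/inner_product.h>
    #include <thrust/fill.h>
    #include <iostream>

    int main() {
        const int n = 1048576;
        thrust::device_vector<int> a(n), b(n);
        thrust::fill(a.begin(), a.end(), 1);
        thrust::fill(b.begin(), b.end(), 2);
        // inner_product performs an element-wise multiply followed by a sum
        // reduction on the device, i.e. the dot product
        int result = thrust::inner_product(a.begin(), a.end(), b.begin(), 0);
        std::cout << "Dot product = " << result << '\n';   // expect 2097152
    }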

    Responding to a question in the comments below, here is a test case comparing my method vs. the first method presented in the question:

    $ cat t11.cu
    #include <iostream>
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    unsigned long long dtime_usec(unsigned long long start=0){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    #ifdef USE_BAD
    __global__ void dotProductKernel(int *a, int *b, int *result, int n)
    {
        //Get the thread index
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        //Only calculate if thread index is valid
        if (tid < n) {
           int sum = 0;
           //Perform dot product calculation for this thread's segment of the arrays
           for (int i = tid; i < n; i+= blockDim.x * gridDim.x) {
               sum += a[i] * b[i];
           }
           //Atomically add this thread's result to the shared result
           atomicAdd(result, sum);
        }
    }
    #else
    const int BLOCK_SIZE=512; // must be a power of 2 less than 2048, and preferably larger than 32
    template <typename T>
    __global__ void dotProductKernel(const T * __restrict__ a, const T * __restrict__ b, T * __restrict__ result, size_t n)
    {
        __shared__ T smem[BLOCK_SIZE];
        //Get the thread index
        size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        T sum = 0;
        //Perform dot product calculation for this thread's segment of the arrays
        for (size_t i = tid; i < n; i+= gridDim.x*blockDim.x) {
          sum += a[i] * b[i];
          }
        //Perform threadblock sweep reduction
        smem[threadIdx.x] = sum;
        for (int i = BLOCK_SIZE>>1; i > 0; i>>=1){
          __syncthreads();
          if (threadIdx.x < i) smem[threadIdx.x] += smem[threadIdx.x+i];}
        //Atomically add this block's result to the global result
        if ((!threadIdx.x) && (tid < n)) atomicAdd(result, smem[0]);
    }
    #endif
    
    int main(){
    
      const int sz = 1048576;
      int *a, *b, *c;
      cudaMallocManaged(&a, sz*sizeof(a[0]));
      cudaMallocManaged(&b, sz*sizeof(b[0]));
      cudaMallocManaged(&c, sizeof(c[0]));
      for (int i = 0; i < sz; i++) {a[i] = 1; b[i] = 2;}
      c[0] = 0;
      cudaMemPrefetchAsync(a, sz*sizeof(a[0]), 0);
      cudaMemPrefetchAsync(b, sz*sizeof(b[0]), 0);
      cudaMemPrefetchAsync(c, sizeof(c[0]), 0);
      // warm-up
    #ifdef USE_BAD
      dotProductKernel<<<1, 1024>>>(a, b, c, sz);
    #else
      dotProductKernel<<<128, BLOCK_SIZE>>>(a, b, c, sz);
    #endif
      cudaDeviceSynchronize();
      c[0] = 0;
      cudaMemPrefetchAsync(c, sizeof(c[0]), 0);
      unsigned long long dt = dtime_usec(0);
    #ifdef USE_BAD
      dotProductKernel<<<1, 1024>>>(a, b, c, sz);
    #else
      dotProductKernel<<<128, BLOCK_SIZE>>>(a, b, c, sz);
    #endif
      cudaDeviceSynchronize();
      dt = dtime_usec(dt);
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess)
        std::cout << cudaGetErrorString(err) << std::endl;
      else {
        std::cout << c[0] << std::endl;
        std::cout << "time: " << dt  << "us" << std::endl;}
    }
    
    $ nvcc -o t11 t11.cu -DUSE_BAD
    $ ./t11
    2097152
    time: 185us
    $ nvcc -o t11 t11.cu
    $ ./t11
    2097152
    time: 15us
    $
    

    I also note, after some further testing, that if example 1 in the question is modified to use an appropriate number of blocks in the kernel launch, it runs at approximately the same speed (within about 1%) as the kernel I have outlined in my answer. So the atomic pressure does not appear to be a serious concern in my testing so far.
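
    For concreteness, the modification referred to is only a change to example 1's launch configuration so that the grid fills the GPU; the kernel's grid-stride loop already handles any n. The grid size below (a multiple of the SM count) is my own choice for illustration, not something taken from the question:

    // Example 1's kernel and device buffers (d_a, d_b, d_c) are assumed unchanged;
    // only the launch changes from <<<1, 1024>>> to a grid that covers the GPU.
    int blockSize = 1024;
    int numSMs = 0;
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
    int gridSize = 4 * numSMs;
    dotProductKernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);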