Tags: c++, vector, cuda, reduce, dot-product

Which of the following approaches is more suitable for CUDA parallelism?


I found two approaches to calculating the dot product of two vectors in CUDA. Which of the two is more suitable for CUDA parallelism?

  1. The first approach is a simple parallelization that uses a single block of threads.

    The dotProductKernel() function computes the dot product of two input vectors a and b using parallel processing on the GPU. Each thread in the single block processes a segment of the input vectors and calculates a partial sum. The partial sums are then summed up using atomicAdd() to produce the final dot product.

    #include <vector>
    #include <iostream>
    
    __global__ void dotProductKernel(int *a, int *b, int *result, int n) {
        //Get the thread index
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        //Only calculate if thread index is valid
        if (tid < n) {
           int sum = 0;
           //Perform dot product calculation for this thread's segment of the arrays
           for (int i = tid; i < n; i+= blockDim.x * gridDim.x) {
               sum += a[i] * b[i];
           }
           //Atomically add this thread's result to the shared result
           atomicAdd(result, sum);
        }
    }
    
    int main() {
        int n = 10000;
    
        // Use vectors instead of raw pointers
        std::vector<int> a(n), b(n);
        int *c = (int*) malloc(sizeof(int));
    
        // Initialize the input vectors
        for (int i = 0; i < n; i++) {
            a[i] = i+1;
            b[i] = i+1;
        }
        *c = 0;
    
    
        // Allocate memory on the GPU
        int* d_a, *d_b, *d_c;
        cudaMalloc(&d_a, a.size() * sizeof(int));
        cudaMalloc(&d_b, b.size() * sizeof(int));
        cudaMalloc(&d_c, 1 * sizeof(int));
    
        // Copy vectors to GPU
        cudaMemcpy(d_a, a.data(), a.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b.data(), b.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_c, c, 1 * sizeof(int), cudaMemcpyHostToDevice);
    
        // Launch kernel
        dotProductKernel<<<1, 1024>>>(d_a, d_b, d_c, n);
    
        // Copy result back from GPU
        cudaMemcpy(c, d_c, 1 * sizeof(int),cudaMemcpyDeviceToHost);
    
        // Free GPU memory
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
    
        // Print the result
        std::cout << "Dot product = " << *c << '\n';
    
        free(c);
    }
    
    
  2. The second approach stores per-thread partial sums in a results array and reduces them with a second kernel.

    In the dotProductKernel() function, each thread computes a partial sum of the dot product of the two input vectors a and b and stores it in a results array in global memory. The partial sums are calculated in parallel, with each thread processing a segment of the input vectors.

    In the sumResultsKernel() function, a single thread (tid == 0) sums up the partial results stored in the results array and writes the final value to the location pointed to by result. Although the kernel is launched with many threads, this final reduction step is performed by one thread and is effectively serial.

    #include <vector>
    #include <iostream>
    
    __global__ void dotProductKernel(int *a, int *b, int *results, int n) {
        //Get the thread index
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        //Only calculate if thread index is valid
        if (tid < n) {
           int sum = 0;
           //Perform dot product calculation for this thread's segment of the arrays
           for (int i = tid; i < n; i+= blockDim.x * gridDim.x) {
               sum += a[i] * b[i];
           }
           //Store this thread's result in the shared results array
           results[blockIdx.x * blockDim.x + threadIdx.x] = sum;
        }
    }
    
    __global__ void sumResultsKernel(int *results, int *result, int n) {
        //Get the thread index
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        //Only calculate if thread index is valid
        if (tid == 0) {
           int sum = 0;
           //Sum up the partial results
           for (int i = 0; i < n; i++) {
               sum += results[i];
           }
           //Store the final result
           *result = sum;
        }
    }
    
    int main() {
        int n = 10000;
    
        // Use vectors instead of raw pointers
        std::vector<int> a(n), b(n);
        int *c = (int*) malloc(sizeof(int));
    
        // Initialize the input vectors
        for (int i = 0; i < n; i++) {
            a[i] = i+1;
            b[i] = i+1;
        }
        *c = 0;
    
        // Determine the grid size and block size
        int blockSize = 1024;
        int gridSize = (n + blockSize - 1) / blockSize;
    
        // Allocate memory on the GPU
        int* d_a, *d_b, *d_results, *d_result;
        cudaMalloc(&d_a, a.size() * sizeof(int));
        cudaMalloc(&d_b, b.size() * sizeof(int));
        cudaMalloc(&d_results, gridSize * blockSize * sizeof(int));
        cudaMalloc(&d_result, 1 * sizeof(int));
    
        // Copy vectors to GPU
        cudaMemcpy(d_a, a.data(), a.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b.data(), b.size() * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemset(d_results, 0, gridSize * blockSize * sizeof(int));
        cudaMemset(d_result, 0, sizeof(int));
    
        // Launch kernel
        dotProductKernel<<<gridSize, blockSize>>>(d_a, d_b, d_results, n);
    
        // Sum up the partial results
        sumResultsKernel<<<1, 1024>>>(d_results, d_result, gridSize * blockSize);
    
        // Copy result back from GPU
        cudaMemcpy(c, d_result, 1 * sizeof(int),cudaMemcpyDeviceToHost);
    
        // Free GPU memory
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_results);
        cudaFree(d_result);
    
        // Print the result
        std::cout << "Dot product = " << *c << '\n';
    
        free(c);
    }
    

Solution

  • The only reason I know of that people use CUDA is to make their code run faster. I'm not sure what "suitable for CUDA parallelism" means, or why it would be necessary to consider something other than performance as a figure of merit.

    If we consider performance, then, the best approach will be a combination of the two methods you have shown. The first two priorities of any CUDA programmer are:

    1. expose enough parallelism; this roughly translates into "use lots of threads in your kernel launch"
    2. make efficient use of memory

    The first method makes efficient use of memory in that each thread does a running-sum reduction out of global memory using a coalesced grid-stride loop. It is inefficient, however, in that it puts unnecessary pressure on the atomic mechanism with potentially many atomic operations: even in the suboptimal single-block case shown, there are 1024 atomic calls when only one per block (or none at all) is actually needed.

    The first method is also not optimal in that it launches only 1 block. This is almost never optimal in CUDA (not enough threads); it leaves much of the GPU idle.

    To rectify these problems and come close to optimal speed, we will want to combine aspects of both methods:

    • use a running sum per thread to initially gather all values from global memory in a coalesced fashion
    • choose a grid size to "fill" the GPU being used
    • once the running product-sums per thread are complete, perform an intra-threadblock reduction. A canonical shared memory sweep style reduction is probably optimal here.
    • the preceding step will place each threadblock's partial sum in a single location. Atomically add this location to the global sum. This eliminates the need for multiple kernel launches (each of which costs on the order of 5 microseconds or more), as used in the second method and in other canonical reduction treatments.

    The following achieves these design goals:

    const int BLOCK_SIZE=512; // must be a power of 2 less than 2048, and preferably larger than 32
    template <typename T>
    __global__ void dotProductKernel(const T * __restrict__ a, const T * __restrict__ b, T * __restrict__ result, size_t n)
    {
        __shared__ T smem[BLOCK_SIZE];
        //Get the thread index
        size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        T sum = 0;
        //Perform dot product calculation for this thread's segment of the arrays
        for (size_t i = tid; i < n; i+= gridDim.x*blockDim.x) {
          sum += a[i] * b[i];
          }
        //Perform threadblock sweep reduction
        smem[threadIdx.x] = sum;
        for (int i = BLOCK_SIZE>>1; i > 0; i>>=1){
          __syncthreads();
          if (threadIdx.x < i) smem[threadIdx.x] += smem[threadIdx.x+i];}      
        //Atomically add this block's result to the global result
        if ((!threadIdx.x) && (tid < n)) atomicAdd(result, smem[0]);
    }
    

    Again, you would typically want to launch the above kernel with BLOCK_SIZE threads per block and a number of blocks that is a whole-number multiple of the number of SMs in your GPU. Some tuning of these numbers may be necessary. As a starting point, I would choose a BLOCK_SIZE of 512 and two, three, or four times the number of SMs as the number of blocks to launch.
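
    As a rough illustration (the cudaDeviceGetAttribute() query is a standard CUDA runtime call, but the 4x-SMs multiplier and the variable names are my own assumptions, not something prescribed above), a host-side launch along those lines might look like this, assuming d_a, d_b, d_result, and n are set up as in the earlier examples:

    int dev = 0;
    cudaSetDevice(dev);
    int numSMs = 0;
    // Query the number of SMs on the selected device
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
    int numBlocks = 4 * numSMs;   // 2x-4x the SM count is a reasonable starting point; tune as needed
    dotProductKernel<<<numBlocks, BLOCK_SIZE>>>(d_a, d_b, d_result, n);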

    This method will generally work well for larger dot-products. Special adaptations might be desirable for very small dot products, where the length of the vector is less than the instantaneous thread-carrying capacity of the GPU (number of SMs times maximum number of threads per SM). For example, you might choose BLOCK_SIZE of 32 for very small dot products.

    Some of these concepts are covered in units 3-4 of this online training (basic CUDA optimization goals), and some additional topics (reductions and atomics) are covered in unit 5 of that training.

    As a final note, this question seems oriented towards "learning". If the goal is to implement something for production, the usual advice is to use a library implementation when/where available: cuBLAS includes a dot product (albeit not for integer types), and you can also realize a dot product with, for example, Thrust's transform_reduce (a minimal sketch follows below).
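
    For completeness, here is a minimal Thrust sketch of a dot product. I have used thrust::inner_product for brevity; transform_reduce over a zip iterator with a multiply functor is the equivalent, slightly more verbose formulation. The vector length and fill values simply mirror the test case below and are not significant:

    #include <thrust/device_vector.h>
    #include <thrust/inner_product.h>
    #include <thrust/fill.h>
    #include <iostream>

    int main() {
        const int n = 1048576;
        thrust::device_vector<int> a(n), b(n);
        thrust::fill(a.begin(), a.end(), 1);
        thrust::fill(b.begin(), b.end(), 2);
        // inner_product performs an element-wise multiply followed by a sum
        // reduction on the device, i.e. the dot product
        int result = thrust::inner_product(a.begin(), a.end(), b.begin(), 0);
        std::cout << "Dot product = " << result << '\n';   // expect 2097152
    }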

    Responding to a question in the comments below, here is a test case comparing my method vs. the first method presented in the question:

    $ cat t11.cu
    #include <iostream>
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    unsigned long long dtime_usec(unsigned long long start=0){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    #ifdef USE_BAD
    __global__ void dotProductKernel(int *a, int *b, int *result, int n)
    {
        //Get the thread index
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        //Only calculate if thread index is valid
        if (tid < n) {
           int sum = 0;
           //Perform dot product calculation for this thread's segment of the arrays
           for (int i = tid; i < n; i+= blockDim.x * gridDim.x) {
               sum += a[i] * b[i];
           }
           //Atomically add this thread's result to the shared result
           atomicAdd(result, sum);
        }
    }
    #else
    const int BLOCK_SIZE=512; // must be a power of 2 less than 2048, and preferably larger than 32
    template <typename T>
    __global__ void dotProductKernel(const T * __restrict__ a, const T * __restrict__ b, T * __restrict__ result, size_t n)
    {
        __shared__ T smem[BLOCK_SIZE];
        //Get the thread index
        size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        T sum = 0;
        //Perform dot product calculation for this thread's segment of the arrays
        for (size_t i = tid; i < n; i+= gridDim.x*blockDim.x) {
          sum += a[i] * b[i];
          }
        //Perform threadblock sweep reduction
        smem[threadIdx.x] = sum;
        for (int i = BLOCK_SIZE>>1; i > 0; i>>=1){
          __syncthreads();
          if (threadIdx.x < i) smem[threadIdx.x] += smem[threadIdx.x+i];}
        //Atomically add this block's result to the global result
        if ((!threadIdx.x) && (tid < n)) atomicAdd(result, smem[0]);
    }
    #endif
    
    int main(){
    
      const int sz = 1048576;
      int *a, *b, *c;
      cudaMallocManaged(&a, sz*sizeof(a[0]));
      cudaMallocManaged(&b, sz*sizeof(b[0]));
      cudaMallocManaged(&c, sizeof(c[0]));
      for (int i = 0; i < sz; i++) {a[i] = 1; b[i] = 2;}
      c[0] = 0;
      cudaMemPrefetchAsync(a, sz*sizeof(a[0]), 0);
      cudaMemPrefetchAsync(b, sz*sizeof(b[0]), 0);
      cudaMemPrefetchAsync(c, sizeof(c[0]), 0);
      // warm-up
    #ifdef USE_BAD
      dotProductKernel<<<1, 1024>>>(a, b, c, sz);
    #else
      dotProductKernel<<<128, BLOCK_SIZE>>>(a, b, c, sz);
    #endif
      cudaDeviceSynchronize();
      c[0] = 0;
      cudaMemPrefetchAsync(c, sizeof(c[0]), 0);
      unsigned long long dt = dtime_usec(0);
    #ifdef USE_BAD
      dotProductKernel<<<1, 1024>>>(a, b, c, sz);
    #else
      dotProductKernel<<<128, BLOCK_SIZE>>>(a, b, c, sz);
    #endif
      cudaDeviceSynchronize();
      dt = dtime_usec(dt);
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess)
        std::cout << cudaGetErrorString(err) << std::endl;
      else {
        std::cout << c[0] << std::endl;
        std::cout << "time: " << dt  << "us" << std::endl;}
    }
    
    $ nvcc -o t11 t11.cu -DUSE_BAD
    $ ./t11
    2097152
    time: 185us
    $ nvcc -o t11 t11.cu
    $ ./t11
    2097152
    time: 15us
    $
    

    I also note, after some further testing, that if example 1 in the question is modified to use an appropriate number of blocks in the kernel launch, it runs at approximately the same speed (within about 1%) as the kernel I have outlined in my answer. So the atomic pressure does not appear to be a serious concern in my testing so far.
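
    For concreteness, the modification referred to is only a change to example 1's launch configuration so that the grid fills the GPU; the kernel's grid-stride loop already handles any n. The grid size below (a multiple of the SM count) is my own choice for illustration, not something taken from the question:

    // Example 1's kernel and device buffers (d_a, d_b, d_c) are assumed unchanged;
    // only the launch changes from <<<1, 1024>>> to a grid that covers the GPU.
    int blockSize = 1024;
    int numSMs = 0;
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
    int gridSize = 4 * numSMs;
    dotProductKernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);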