I found that there are two approaches to calculating the dot product of two vectors in CUDA:
Which of the following approaches is more suitable for CUDA parallelism?
The approach used in the given code is a simple parallelization approach using a single block of threads. The `dotProductKernel()` function computes the dot product of two input vectors `a` and `b` using parallel processing on the GPU. Each thread in the single block processes a segment of the input vectors and calculates a partial sum. The partial sums are then summed up using `atomicAdd()` to produce the final dot product.
```
#include <vector>
#include <iostream>

__global__ void dotProductKernel(int *a, int *b, int *result, int n) {
    // Get the thread index
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Only calculate if thread index is valid
    if (tid < n) {
        int sum = 0;
        // Perform dot product calculation for this thread's segment of the arrays
        for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
            sum += a[i] * b[i];
        }
        // Atomically add this thread's partial sum to the global result
        atomicAdd(result, sum);
    }
}

int main() {
    int n = 10000;
    // Use vectors instead of raw pointers
    std::vector<int> a(n), b(n);
    int *c = (int*) malloc(sizeof(int));
    // Initialize the input vectors
    for (int i = 0; i < n; i++) {
        a[i] = i + 1;
        b[i] = i + 1;
    }
    *c = 0;
    // Allocate memory on the GPU
    int *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, a.size() * sizeof(int));
    cudaMalloc(&d_b, b.size() * sizeof(int));
    cudaMalloc(&d_c, 1 * sizeof(int));
    // Copy vectors to GPU
    cudaMemcpy(d_a, a.data(), a.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b.data(), b.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, 1 * sizeof(int), cudaMemcpyHostToDevice);
    // Launch kernel (a single block of 1024 threads)
    dotProductKernel<<<1, 1024>>>(d_a, d_b, d_c, n);
    // Copy result back from GPU
    cudaMemcpy(c, d_c, 1 * sizeof(int), cudaMemcpyDeviceToHost);
    // Free GPU memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    // Print the result
    std::cout << "Dot product = " << *c << '\n';
    free(c);
}
```
This code uses a two-kernel (partial sums plus final reduction) approach. In the `dotProductKernel()` function, each thread computes a partial sum of the dot product of the two input vectors `a` and `b` and stores it in the global memory array `results`. The partial sums are calculated in parallel by multiple threads across multiple blocks, with each thread processing a segment of the input vectors. In the `sumResultsKernel()` function, a single thread then sums up the partial results stored in `results` and stores the final value in the memory location pointed to by `result`, so this final reduction step is effectively serial.
```
#include <vector>
#include <iostream>

__global__ void dotProductKernel(int *a, int *b, int *results, int n) {
    // Get the thread index
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Only calculate if thread index is valid
    if (tid < n) {
        int sum = 0;
        // Perform dot product calculation for this thread's segment of the arrays
        for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
            sum += a[i] * b[i];
        }
        // Store this thread's partial sum in the global results array
        results[tid] = sum;
    }
}

__global__ void sumResultsKernel(int *results, int *result, int n) {
    // Get the thread index
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Only the first thread performs the final summation
    if (tid == 0) {
        int sum = 0;
        // Sum up the partial results
        for (int i = 0; i < n; i++) {
            sum += results[i];
        }
        // Store the final result
        *result = sum;
    }
}

int main() {
    int n = 10000;
    // Use vectors instead of raw pointers
    std::vector<int> a(n), b(n);
    int *c = (int*) malloc(sizeof(int));
    // Initialize the input vectors
    for (int i = 0; i < n; i++) {
        a[i] = i + 1;
        b[i] = i + 1;
    }
    *c = 0;
    // Determine the grid size and block size
    int blockSize = 1024;
    int gridSize = (n + blockSize - 1) / blockSize;
    // Allocate memory on the GPU
    int *d_a, *d_b, *d_results, *d_result;
    cudaMalloc(&d_a, a.size() * sizeof(int));
    cudaMalloc(&d_b, b.size() * sizeof(int));
    cudaMalloc(&d_results, gridSize * blockSize * sizeof(int));
    cudaMalloc(&d_result, 1 * sizeof(int));
    // Copy vectors to GPU
    cudaMemcpy(d_a, a.data(), a.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b.data(), b.size() * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(d_results, 0, gridSize * blockSize * sizeof(int));
    cudaMemset(d_result, 0, sizeof(int));
    // Launch kernel to compute per-thread partial sums
    dotProductKernel<<<gridSize, blockSize>>>(d_a, d_b, d_results, n);
    // Sum up the partial results
    sumResultsKernel<<<1, 1024>>>(d_results, d_result, gridSize * blockSize);
    // Copy result back from GPU
    cudaMemcpy(c, d_result, 1 * sizeof(int), cudaMemcpyDeviceToHost);
    // Free GPU memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_results);
    cudaFree(d_result);
    // Print the result
    std::cout << "Dot product = " << *c << '\n';
    free(c);
}
```
The only reason I know of that people use CUDA is to make their code run faster. I'm not sure what "suitable for CUDA parallelism" means, or why it would be necessary to consider something other than performance as a figure of merit.
If we consider performance, then, the best approach will be a combination of the two methods you have shown. In CUDA, the first two priorities of any CUDA programmer are:

1. Expose enough parallelism (roughly: launch enough threads/blocks) to keep the GPU busy.
2. Make efficient use of memory (for example, coalesced access to global memory, and avoiding unnecessary traffic such as large numbers of atomic operations).
The first method makes efficient use of memory in that it does a running-sum reduction out of global memory into each thread using a coalesced grid-stride loop. It makes inefficient use of memory by unnecessarily burdening the atomic mechanism with a potentially large number of atomics. Even in the suboptimal case shown, there are 1024 atomic calls when only one (or zero) is needed.
The first method is also not optimal in that it launches only 1 block. This is almost never optimal in CUDA (not enough threads); it leaves much of the GPU idle.
To rectify these problems and come close to optimal speed, we will want to combine aspects of both methods:

- a grid-stride loop (as in the first method), so that global memory loads are coalesced and the kernel can be launched with enough blocks to fill the GPU;
- a per-threadblock reduction in shared memory, so that each block collapses its threads' partial sums into a single value;
- a single `atomicAdd()` per block, rather than one per thread, to combine the block results into the final answer.
The following achieves these design goals:
```
const int BLOCK_SIZE = 512; // must be a power of 2 less than 2048, and preferably larger than 32

template <typename T>
__global__ void dotProductKernel(const T * __restrict__ a, const T * __restrict__ b, T * __restrict__ result, size_t n)
{
    __shared__ T smem[BLOCK_SIZE];
    // Get the thread index
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    T sum = 0;
    // Perform dot product calculation for this thread's segment of the arrays
    for (size_t i = tid; i < n; i += gridDim.x * blockDim.x) {
        sum += a[i] * b[i];
    }
    // Perform threadblock sweep reduction in shared memory
    smem[threadIdx.x] = sum;
    for (int i = BLOCK_SIZE >> 1; i > 0; i >>= 1) {
        __syncthreads();
        if (threadIdx.x < i) smem[threadIdx.x] += smem[threadIdx.x + i];
    }
    // Atomically add this block's result to the global result
    if ((!threadIdx.x) && (tid < n)) atomicAdd(result, smem[0]);
}
```
Again, you would typically want to launch the above kernel with a number of threads per block defined by `BLOCK_SIZE` and a number of blocks that is a whole-number positive multiple of the number of SMs in your GPU. Some tuning of these numbers may be necessary. For a starting point, I would choose a `BLOCK_SIZE` of 512 and either two, three, or four times the number of SMs as the number of blocks to launch.
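For example, a minimal host-side launch sketch along those lines (this assumes the kernel and `BLOCK_SIZE` above; `d_a`, `d_b`, `d_result`, and `n` are hypothetical names for already-allocated device data, and the factor of 3 is just one of the starting points mentioned):

```
// Pick the block count from the SM count of the current GPU.
// Assumes d_a, d_b, d_result (device int pointers) and n are already set up.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);        // properties of device 0
int numSMs = prop.multiProcessorCount;    // number of SMs on this GPU
int numBlocks = 3 * numSMs;               // e.g. 3 blocks per SM as a starting point
dotProductKernel<<<numBlocks, BLOCK_SIZE>>>(d_a, d_b, d_result, n);
cudaDeviceSynchronize();                  // wait for the result before reading it
```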
This method will generally work well for larger dot products. Special adaptations might be desirable for very small dot products, where the length of the vector is less than the instantaneous thread-carrying capacity of the GPU (number of SMs times maximum number of threads per SM). For example, you might choose a `BLOCK_SIZE` of 32 for very small dot products.
Some of these concepts will be covered in units 3-4 of this online training (basic CUDA optimization goals), and some additional topics (reductions and atomics) are covered in unit 5 of that training.
As a final note, this question seems oriented towards "learning". If the goal is to implement something for production, the usual advice is to use a library implementation when/where available; CUBLAS includes a dot product, albeit not for integer types. You can also realize a dot product with, for example, `thrust::transform_reduce`.
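For illustration, here is a minimal sketch of the Thrust route. It uses `thrust::inner_product`, which is the most direct way to express a dot product in Thrust (a `thrust::transform_reduce` formulation would be equivalent but more verbose); the sizes and values simply mirror the test case further below (all 1s dotted with all 2s):

```
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <iostream>

int main() {
    const int n = 1048576;
    thrust::device_vector<int> a(n, 1);   // a = all 1s
    thrust::device_vector<int> b(n, 2);   // b = all 2s
    // multiply-and-sum is performed on the device; expected result is 2*n = 2097152
    int dot = thrust::inner_product(a.begin(), a.end(), b.begin(), 0);
    std::cout << "Dot product = " << dot << std::endl;
}
```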
Responding to a question in the comments below, here is a test case comparing my method vs. the first method presented in the question:
```
$ cat t11.cu
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start=0){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

#ifdef USE_BAD
__global__ void dotProductKernel(int *a, int *b, int *result, int n)
{
    //Get the thread index
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    //Only calculate if thread index is valid
    if (tid < n) {
        int sum = 0;
        //Perform dot product calculation for this thread's segment of the arrays
        for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
            sum += a[i] * b[i];
        }
        //Atomically add this thread's result to the shared result
        atomicAdd(result, sum);
    }
}
#else
const int BLOCK_SIZE=512; // must be a power of 2 less than 2048, and preferably larger than 32

template <typename T>
__global__ void dotProductKernel(const T * __restrict__ a, const T * __restrict__ b, T * __restrict__ result, size_t n)
{
    __shared__ T smem[BLOCK_SIZE];
    //Get the thread index
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    T sum = 0;
    //Perform dot product calculation for this thread's segment of the arrays
    for (size_t i = tid; i < n; i += gridDim.x*blockDim.x) {
        sum += a[i] * b[i];
    }
    //Perform threadblock sweep reduction
    smem[threadIdx.x] = sum;
    for (int i = BLOCK_SIZE>>1; i > 0; i>>=1){
        __syncthreads();
        if (threadIdx.x < i) smem[threadIdx.x] += smem[threadIdx.x+i];
    }
    //Atomically add this block's result to the global result
    if ((!threadIdx.x) && (tid < n)) atomicAdd(result, smem[0]);
}
#endif

int main(){
    const int sz = 1048576;
    int *a, *b, *c;
    cudaMallocManaged(&a, sz*sizeof(a[0]));
    cudaMallocManaged(&b, sz*sizeof(b[0]));
    cudaMallocManaged(&c, sizeof(c[0]));
    for (int i = 0; i < sz; i++) {a[i] = 1; b[i] = 2;}
    c[0] = 0;
    cudaMemPrefetchAsync(a, sz*sizeof(a[0]), 0);
    cudaMemPrefetchAsync(b, sz*sizeof(b[0]), 0);
    cudaMemPrefetchAsync(c, sizeof(c[0]), 0);
    // warm-up
#ifdef USE_BAD
    dotProductKernel<<<1, 1024>>>(a, b, c, sz);
#else
    dotProductKernel<<<128, BLOCK_SIZE>>>(a, b, c, sz);
#endif
    cudaDeviceSynchronize();
    c[0] = 0;
    cudaMemPrefetchAsync(c, sizeof(c[0]), 0);
    unsigned long long dt = dtime_usec(0);
#ifdef USE_BAD
    dotProductKernel<<<1, 1024>>>(a, b, c, sz);
#else
    dotProductKernel<<<128, BLOCK_SIZE>>>(a, b, c, sz);
#endif
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cout << cudaGetErrorString(err) << std::endl;
    else {
        std::cout << c[0] << std::endl;
        std::cout << "time: " << dt << "us" << std::endl;
    }
}
$ nvcc -o t11 t11.cu -DUSE_BAD
$ ./t11
2097152
time: 185us
$ nvcc -o t11 t11.cu
$ ./t11
2097152
time: 15us
$
```
I also note, after some further testing, that if example 1 in the question is modified to have an appropriate number of blocks in the kernel launch, it will run at approximately the same speed (within about 1%) as the kernel I have outlined in my answer. So the atomic pressure does not appear to be a serious concern in my testing so far.
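Concretely, the modification referred to is just the launch configuration of the `USE_BAD` path in the test case above; 128 blocks here simply mirrors the block count used for the other kernel and is a hypothetical choice you would tune to your GPU:

```
// Modified launch of the question's first kernel (USE_BAD path in t11.cu above):
// launch many blocks instead of one; the grid-stride loop keeps the result correct.
dotProductKernel<<<128, 1024>>>(a, b, c, sz);   // was <<<1, 1024>>>
```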