Tags: parallel-processing, cuda, nvidia, pi

Binary Matrix Reduction in CUDA


I have to traverse all cells of an imaginary m * n matrix and add 1 for every cell that meets a certain condition.

My naive solution was as follows:

#include <stdio.h>

__global__ void calculate_pi(int center, int *count) {
    int x = threadIdx.x;
    int y = blockIdx.x;

    if (x*x + y*y <= center*center) {
        *count++;
    }
}

int main() {
    int interactions;
    printf("Enter the number of interactions: ");
    scanf("%d", &interactions);

    int l = sqrt(interactions);

    int h_count = 0;
    int *d_count;

    cudaMalloc(&d_count, sizeof(int));
    cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);

    calculate_pi<<<l,l>>>(l/2, d_count);

    cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_count);

    printf("Sum: %d\n", h_count);

    return 0;
}

In my use case, the value of interactions can be very large, making it impossible to allocate l * l of space.

Can someone help me? Any suggestions are welcome.


Solution

  • There are at least 3 problems with your code:

    1. Your kernel code will not work correctly with an ordinary add here:

      *count++;
      

      this is because multiple threads are trying to do this at the same time, and CUDA does not automatically sort that out for you. (Note in passing that, due to operator precedence, *count++ actually increments the pointer rather than the value it points to; the ordinary, but still racy, increment would be (*count)++.) For the purpose of this explanation, we will fix this with an atomicAdd(), although other methods are possible; a minimal sketch of one alternative appears right after this list.

    2. The ampersand doesn't belong here:

      cudaMemcpy(&d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
                 ^
      

      I assume that is just a typo, since you did it correctly on the subsequent cudaMemcpy operation:

      cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
      
    3. This methodology (effectively creating a square array of threads, using threadIdx.x for one dimension and blockIdx.x for the other) will only work up to an interactions value that yields an l value of 1024 or less, because CUDA threadblocks are limited to 1024 threads and you are using l as the threadblock size in your kernel launch. To fix this you would want to learn how to create a CUDA 2D grid of arbitrary dimensions, and adjust your kernel launch and in-kernel indexing calculations accordingly. For now we will just make sure that the calculated l value is in range for your code design.
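
    Regarding the "other methods" mentioned in item 1: one common alternative to a per-thread global atomicAdd() is to let each block accumulate its hits in shared memory and then issue a single global atomicAdd() per block. Here is a minimal sketch of such a kernel (my illustration, not part of your code; it assumes the same <<<l,l>>> launch used in the example below):

    __global__ void calculate_pi_blockreduce(int center, int *count) {
        __shared__ int block_count;              // per-block partial count
        if (threadIdx.x == 0) block_count = 0;
        __syncthreads();

        int x = threadIdx.x;
        int y = blockIdx.x;
        if (x*x + y*y <= center*center)
            atomicAdd(&block_count, 1);          // shared-memory atomic (cheap)

        __syncthreads();
        if (threadIdx.x == 0)
            atomicAdd(count, block_count);       // one global atomic per block
    }

    The worked examples below stick with the simpler per-thread atomicAdd(), which is perfectly adequate at these problem sizes.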

    Here's an example addressing the above issues:

    $ cat t1590.cu
    #include <stdio.h>
    
    __global__ void calculate_pi(int center, int *count) {
        int x = threadIdx.x;
        int y = blockIdx.x;
    
        if (x*x + y*y <= center*center) {
            atomicAdd(count, 1);
        }
    }
    
    int main() {
        int interactions;
        printf("Enter the number of interactions: ");
        scanf("%d", &interactions);
    
        int l = sqrt(interactions);
        if ((l > 1024) || (l < 1)) {printf("Error: interactions out of range\n"); return 0;}
        int h_count = 0;
        int *d_count;
    
        cudaMalloc(&d_count, sizeof(int));
        cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
    
        calculate_pi<<<l,l>>>(l/2, d_count);
    
        cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(d_count);
        cudaError_t err = cudaGetLastError();
        if (err == cudaSuccess){
          printf("Sum: %d\n", h_count);
          printf("fraction satisfying test:  %f\n", h_count/(float)interactions);
          }
        else
          printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 0;
    }
    $ nvcc -o t1590 t1590.cu
    $ ./t1590
    Enter the number of interactions: 1048576
    Sum: 206381
    fraction satisfying test:  0.196820
    $
    

    We see that the code reports a calculated fraction of about 0.2. Does this appear to be correct? It does, based on what your test actually measures. You are effectively creating a grid of dimensions l x l, and your test asks, in effect, "which points in that grid lie within a circle centered at the origin (corner) of the grid, with radius l/2?"

    Pictorially, that looks something like this:

    (figure: the l x l grid with a quarter circle of radius l/2 centered at the grid's origin corner; the red shaded area marks the points inside the circle)

    and the red shaded area is a quarter circle of radius l/2, whose area is (1/4)*pi*(l/2)^2 = pi*l^2/16. That is a fraction of pi/16, roughly 0.196, of the total l x l area, so the measured 0.196820 (and the rough estimate of 0.2) is exactly what we should expect.
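
    Since the kernel is named calculate_pi, presumably the end goal is an estimate of pi itself: that is just 16 times the measured fraction (16 * 0.196820 is about 3.149 here). If you want the program to print it, a small host-side addition along these lines would do (my assumption about the goal, not part of your code; note the denominator is l*l, the actual number of grid points, which equals interactions only when interactions is a perfect square):

        double frac = h_count / ((double)l * l);   // fraction of the l*l grid inside the quarter circle
        printf("pi estimate: %f\n", 16.0 * frac);  // quarter circle of radius l/2  =>  fraction ~= pi/16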

    As a bonus, here is a version of the code that reduces the restriction listed in item 3 above:

    #include <stdio.h>
    
    __global__ void calculate_pi(int center, int *count) {
        int x = threadIdx.x+blockDim.x*blockIdx.x;
        int y = threadIdx.y+blockDim.y*blockIdx.y;
    
        if (x*x + y*y <= center*center) {
            atomicAdd(count, 1);
        }
    }
    
    int main() {
        int interactions;
        printf("Enter the number of interactions: ");
        scanf("%d", &interactions);
    
        int l = sqrt(interactions);
        int h_count = 0;
        int *d_count;
        const int bs = 32;
        dim3 threads(bs, bs);
        dim3 blocks((l+threads.x-1)/threads.x, (l+threads.y-1)/threads.y);
    
        cudaMalloc(&d_count, sizeof(int));
        cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);
    
        calculate_pi<<<blocks,threads>>>(l/2, d_count);
    
        cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(d_count);
        cudaError_t err = cudaGetLastError();
        if (err == cudaSuccess){
          printf("Sum: %d\n", h_count);
          printf("fraction satisfying test:  %f\n", h_count/(float)interactions);
          }
        else
          printf("CUDA error: %s\n", cudaGetErrorString(err));
        return 0;
    }
    

    This launches a 2D grid based on l, and should work up to at least 1 billion interactions. (For 1 billion interactions, l is about 31623, giving a grid of roughly 989 x 989 blocks of 32 x 32 threads each, comfortably within CUDA's grid dimension limits.)