Search code examples
Tags: c, cuda, gpu, distributed-computing

CUDA + count int element occurrences using C


On the host side I'm reading in a 128 x 128 integer array with random values between 0-31. I have an Occurrences array that stores the values 0-31 and then on the device I am trying to execute a kernel that loops through the values in the 128 x 128 array and then counts the number of times 0-31 appears.

I am having issues with how to split up the blocks/threads in CUDA and how to get the kernel to provide communication back to the host and print out the number of occurrences of every element. This is my first time using CUDA and I would appreciate any constructive advice! Here is my code so far:

 #include <stdio.h>
#include <stdlib.h>
#include <cuda.h>


#define MAXR 16
#define MAXC 16
#define N 256
// Kernel as posted in the question: one thread per element of arrayONE_d
// (N = 256 threads total); each thread scans the 32 candidate bin values in
// occurrences_d and is meant to bump the matching bin in occurrences_final_d.
//
// NOTE(review): this kernel is broken in several ways (see inline comments):
//  - `count` is a per-thread local reset to 0 on every j iteration, so a bin
//    can only ever be written with the value 1, never an accumulated total.
//  - many threads write occurrences_final_d[j] concurrently with no atomics,
//    so updates race and are lost.
//  - occurrences_d is read here but the loop that would initialize it (both
//    here and in main) is commented out — its contents are indeterminate.
//  - the braces at the end do not balance; see the note at the final line.
__global__ void count(int *arrayONE_d, int *occurrences_d, int *occurrences_final_d) {

    int count = 0;
    //provide unique thread ID
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // NOTE(review): k is declared but never used (the loop using it is commented out).
    int k;
    //for(k=0; k < 32;k++) {
    //  occurrences_d[k]=k;
//  }


    if(idx < N) {
        //for(k=0; k < MAXR*MAXC; k++) {
    for(int j=0; j<32; j++) {
            // BUG(review): resetting count inside the j loop means the write
            // below can only ever store 1, regardless of how many matches exist.
            count =0;
        if(arrayONE_d[idx]==occurrences_d[j]){

            count+=1;
            // BUG(review): unsynchronized write — every matching thread races
            // to store its own (always-1) count into the same bin.
            occurrences_final_d[j] =count;
        }
        else {}


    }
    }
    //occurrences_final_d[0] = 77;
    }
// NOTE(review): unmatched extra '}' — the kernel body already closed on the
// line above; as posted this is a syntax error.
}


// Host driver as posted in the question: reads a 16x16 matrix from a text
// file, flattens it, copies it to the device, launches the counting kernel,
// and prints the per-bin results.
//
// NOTE(review): the central defect (explained in the answer below) is a
// misuse of pointers — arrayONE_h and occurrences_final_h are arrays of
// int* rather than int, so the cudaMemcpy calls ship host POINTER VALUES to
// the device instead of the matrix data. See inline comments.
int main(void) {



    //const int N = MAXR*MAXC;

    int arr1_h[MAXR][MAXC];
    //int *occurrences_h[0][32];
    //creating arrays for the device (GPU)
    //int *arr1_d;
    int occurrences_h[32];
    int *occurrences_d;

    // BUG(review): array of 32 POINTERS, not 32 ints — 32*sizeof(int) below
    // does not even cover it on a 64-bit host, and its contents are addresses.
    int *occurrences_final_h[32] = {0};
    int *occurrences_final_d;

    // BUG(review): likewise an array of 256 pointers, not 256 ints.
    int *arrayONE_h[256] = {0};
    int *arrayONE_d;

    int i, j;

    // allocating memory for the arrays on the device
    cudaMalloc( (void**) &arrayONE_d, MAXR*MAXC*sizeof(int)); // change to 16384 when using 128x128
    cudaMalloc( (void**) &occurrences_d,  32* sizeof(int));
    cudaMalloc( (void**) &occurrences_final_d, 32*sizeof(int));

    // NOTE(review): C block comments do not nest — the inner /* below is just
    // text, so this whole region (including the occurrences_h init loop) is
    // one comment ending at the */ line. occurrences_h is therefore NEVER
    // initialized before being copied to the device.
    /*
    for(i=0; i < 32; i++) {

        occurrences_h[i] = i;

    }
/*
 *
 */
    //Reading in matrix from .txt file and storing it in arr1 on the host (CPU)
    FILE *fp;
    // NOTE(review): fopen result is never checked for NULL and fp is never
    // fclosed; a missing arrays16.txt makes the fscanf below crash.
    fp =fopen("arrays16.txt","r");

     // this loop takes the information from .txt file and puts it into arr1 matrix
    for(i=0;i<MAXR;i++) {


        for(j=0;j<MAXC;j++)
        {
            fscanf(fp,"%d\t", &arr1_h[i][j]);
        }

    }

    // NOTE(review): the inner printf is commented out, so this loop only
    // emits blank lines.
    for(i=0;i<MAXR;i++) {
        printf("\n");

        for(j=0;j<MAXC;j++) {
            //printf("d\t", arr1_h[i][j]);
        }

        printf("\n\n");
    }


    int x,y;
    int z=0;
// this loop flattens the 2d array and makes it a 1d array of length MAXR*MAXC
    for(x=0;x<MAXR;x++)
    {
        for(y=0;y<MAXC;y++)
        {
            //  printf("**%d   ",arr1_h[x][y]);

            // BUG(review): stores the ADDRESS of each element, not its value —
            // this "flattened array" is a table of host pointers.
            arrayONE_h[z]= &arr1_h[x][y];
            z++;

        }
    }


    for(x=0; x < 256; x++) {
        printf("%d\n", *arrayONE_h[x]);
        //return 0;

    }

    int length = sizeof(arrayONE_h)/sizeof(arrayONE_h[0]);

    printf("\n\n");
    printf("**LENGTH = %d", length);

    // copying the arrays/memory from the host to the device (GPU)
    // BUG(review): copies MAXR*MAXC*sizeof(int) bytes of host pointer values
    // (and, for occurrences_h, uninitialized ints) to the device — the kernel
    // never sees the actual matrix data.
    cudaMemcpy(arrayONE_d, &arrayONE_h, MAXR*MAXC*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(occurrences_d, &occurrences_h, 32*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(occurrences_final_d, &occurrences_final_h, 32*sizeof(int), cudaMemcpyHostToDevice);

    // how many blocks we will allocate
    //dim3 DimGrid();
    //how many threads per block we will allocate
    dim3 DimBlock(256);

    //kernel launch against the GPU
    count<<<1, DimBlock>>>(arrayONE_d,occurrences_d,occurrences_final_d);

    //copy the arrays post-computation from the device back to the host (CPU)
    cudaMemcpy(&occurrences_final_h, occurrences_final_d, 32*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&occurrences_h, occurrences_d, 32*sizeof(int), cudaMemcpyDeviceToHost);

    // some error checking - run this with cuda-memcheck when executing your code
    cudaError_t errSync  = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));

    //free up the memory of the device arrays
    cudaFree(arrayONE_d);
    cudaFree(occurrences_d);
    cudaFree(occurrences_final_d);

    //print out the number of occurrences of each 0-31 value
    for(i=0;i<32;i++) {
        printf("\n");

        // BUG(review): occurrences_final_h[i] is an int* printed with %d —
        // undefined behavior; it was clobbered with device bytes above anyway.
        printf("%d\n",occurrences_final_h[i]);

    }

}

Solution

  • As I mentioned in the comments, your understanding of pointers is flawed. I've made changes at many places in your code to address this. I've marked most of them with the comment // mod but I may have missed some.

    In addition, your kernel simply cannot keep track of elements when multiple threads can update the same location. One way to sort this out is to use atomics (which I've demonstrated.) There are various other approaches such as parallel reduction, but none of these are trivial changes to the kernel. In addition, your kernel logic was broken in a few ways.

    What follows then is the smallest number of modifications I could make to your code to get something sensible. There are a few compile switches you can use to explore different kernel behavior:

    • no switch - close to your kernel, but it will not work correctly
    • -DUSE_ATOMICS will demonstrate a modification to your kernel to get it to count correctly.
    • -DUSE_ALT_KERNEL explores a different approach to kernel logic: assign one thread per histogram bin, and have each thread traverse the entire array, keeping track of elements that belong to that bin. Since only one thread is writing to each bin result, there is no need for atomics. However we can only have as many threads (with this trivial realization) as there are bins. Without too much difficulty this method could probably be extended to one warp per bin, using warp shuffle to do a final warp-level reduction before having one thread write the final results to the bin. This would improve memory access efficiency somewhat. However this will also introduce complexity into the kernel that you've probably not learned yet.

    Here is the code:

    $ cat t316.cu
     #include <stdio.h>
    #include <stdlib.h>
    #include <cuda.h>
    
    
    #define MAXR 16
    #define MAXC 16
    #define BINS 32
    #define N (MAXR*MAXC)
    // Answer's kernel, compile-time selectable between two strategies:
    //  - default / -DUSE_ATOMICS: one thread per input element (launch with
    //    >= N threads); each thread scans the 32 bin values in occurrences_d
    //    and bumps the matching slot of occurrences_final_d. Without
    //    USE_ATOMICS the increment races across threads (kept deliberately to
    //    demonstrate the failure); with USE_ATOMICS atomicAdd serializes the
    //    per-bin updates and the counts come out right.
    //  - -DUSE_ALT_KERNEL: one thread per histogram bin (launch with >= BINS
    //    threads); each thread walks the whole N-element array counting
    //    matches for its own bin, so no atomics are needed — only one thread
    //    ever writes each bin.
    __global__ void count(int *arrayONE_d, int *occurrences_d, int *occurrences_final_d) {
    
        //provide unique thread ID
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
    #ifndef USE_ALT_KERNEL
        if(idx < N) {
            //for(k=0; k < MAXR*MAXC; k++) {
        for(int j=0; j<32; j++) {
            if(arrayONE_d[idx]==occurrences_d[j]){
    #ifndef USE_ATOMICS
                // racy read-modify-write: updates from concurrent threads are
                // lost (this variant is shown to produce wrong results below)
                occurrences_final_d[j]++;
    #else
             atomicAdd(occurrences_final_d+j, 1);
    #endif
    
            }
            else {}
    
    
        }
        }
    #else
       // use one thread per histo bin
       if (idx < BINS){
         int count = 0;
         int myval = occurrences_d[idx];
         for (int i = 0; i < N; i++) if (arrayONE_d[i] == myval) count++;
         occurrences_final_d[idx] = count;
         }
    
    #endif
        }
    
    
    // Answer's host driver ("// mod" marks the answerer's changes): the three
    // host arrays are now plain int arrays (not arrays of pointers), the file
    // read is replaced by synthesized data (arr1_h[i][j] = j, so each value
    // 0..15 occurs exactly MAXR = 16 times), and the cudaMemcpy calls pass the
    // arrays directly so real data reaches the device.
    int main(void) {
    
    
    
        //const int N = MAXR*MAXC;
    
        int arr1_h[MAXR][MAXC];
        //int *occurrences_h[0][32];
        //creating arrays for the device (GPU)
        //int *arr1_d;
        int occurrences_h[32]; // mod
        int *occurrences_d;
    
        int occurrences_final_h[32] = {0};  // mod
        int *occurrences_final_d;
    
        int arrayONE_h[256] = {0};  // mod
        int *arrayONE_d;
    
        int i, j;
    
        // allocating memory for the arrays on the device
        cudaMalloc( (void**) &arrayONE_d, MAXR*MAXC*sizeof(int)); // change to 16384 when using 128x128
        cudaMalloc( (void**) &occurrences_d,  32* sizeof(int));
        cudaMalloc( (void**) &occurrences_final_d, 32*sizeof(int));
    
        // NOTE(review): this init loop is still commented out, so occurrences_h
        // is copied to the device uninitialized below — the bin values the
        // kernel compares against are indeterminate; verify before relying on
        // the printed counts.
        /*
        for(i=0; i < 32; i++) {
    
            occurrences_h[i] = i;
    
        }
     */
        //Reading in matrix from .txt file and storing it in arr1 on the host (CPU)
    
    //    FILE *fp;
    //    fp =fopen("arrays16.txt","r");
    
         // this loop takes the information from .txt file and puts it into arr1 matrix
        for(i=0;i<MAXR;i++) {
    
    
            for(j=0;j<MAXC;j++)
            {
    //            fscanf(fp,"%d\t", &arr1_h[i][j]);
                  arr1_h[i][j] = j;  // mod
            }
    
        }
    
        // NOTE(review): this loop is now a no-op (its printf is commented out).
        for(i=0;i<MAXR;i++) {
    
            for(j=0;j<MAXC;j++) {
                //printf("d\t", arr1_h[i][j]);
            }
    
        }
    
    
        int x,y;
        int z=0;
    // this loop flattens the 2d array and makes it a 1d array of length MAXR*MAXC
        for(x=0;x<MAXR;x++)
        {
            for(y=0;y<MAXC;y++)
            {
                //  printf("**%d   ",arr1_h[x][y]);
    
                // copies the VALUE now, not the address (the question's bug)
                arrayONE_h[z]= arr1_h[x][y];  // mod
                z++;
    
            }
        }
    
    
        for(x=0; x < 256; x++) {
    //        printf("%d\n", arrayONE_h[x]);  // mod
            //return 0;
    
        }
    
        int length = sizeof(arrayONE_h)/sizeof(arrayONE_h[0]);
    
        printf("**LENGTH = %d\n", length);
    
        // copying the arrays/memory from the host to the device (GPU)
        // arrays decay to pointers here — no & needed (the question's bug)
        cudaMemcpy(arrayONE_d, arrayONE_h, MAXR*MAXC*sizeof(int), cudaMemcpyHostToDevice);  //mod
        cudaMemcpy(occurrences_d, occurrences_h, 32*sizeof(int), cudaMemcpyHostToDevice);   // mod
        cudaMemcpy(occurrences_final_d, occurrences_final_h, 32*sizeof(int), cudaMemcpyHostToDevice); // mod
    
        // how many blocks we will allocate
        //dim3 DimGrid();
        //how many threads per block we will allocate
    #ifndef USE_ALT_KERNEL
        dim3 DimBlock(N);
    #else
        // alt kernel uses one thread per bin, so only BINS threads are needed
        dim3 DimBlock(BINS);
    #endif
        //kernel launch against the GPU
        count<<<1, DimBlock>>>(arrayONE_d,occurrences_d,occurrences_final_d);
    
        //copy the arrays post-computation from the device back to the host (CPU)
        // blocking cudaMemcpy also synchronizes with the kernel launch above
        cudaMemcpy(occurrences_final_h, occurrences_final_d, 32*sizeof(int), cudaMemcpyDeviceToHost); // mod
        cudaMemcpy(occurrences_h, occurrences_d, 32*sizeof(int), cudaMemcpyDeviceToHost);  // mod
    
        // some error checking - run this with cuda-memcheck when executing your code
        cudaError_t errSync  = cudaGetLastError();
        cudaError_t errAsync = cudaDeviceSynchronize();
        if (errSync != cudaSuccess)
            printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
        if (errAsync != cudaSuccess)
            printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
    
        //free up the memory of the device arrays
        cudaFree(arrayONE_d);
        cudaFree(occurrences_d);
        cudaFree(occurrences_final_d);
    
        //print out the number of occurrences of each 0-31 value
        for(i=0;i<32;i++) {
            printf("%d ",occurrences_final_h[i]);
    
        }
        printf("\n");
    }
    $ nvcc -o t316 t316.cu
    $ cuda-memcheck ./t316
    ========= CUDA-MEMCHECK
    **LENGTH = 256
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    ========= ERROR SUMMARY: 0 errors
    $ nvcc -o t316 t316.cu -DUSE_ATOMICS
    $ ./t316
    **LENGTH = 256
    16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
    $ nvcc -o t316 t316.cu -DUSE_ALT_KERNEL
    $ cuda-memcheck ./t316
    ========= CUDA-MEMCHECK
    **LENGTH = 256
    16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
    ========= ERROR SUMMARY: 0 errors
    $
    

    In the above output we see that the base kernel produces incorrect results. The atomics kernel and the alternate kernel produce correct results.

    (Your code has been modified to use synthesized data so that it does not need to open a file.)