Tags: cuda, nvidia, gpgpu

Computing the mean of 2000 2D-arrays with CUDA C


I have 2000 2D-arrays (each array is 1000x1000). I need to compute the mean of each one and store the results in a single vector of 2000 elements.

I tried doing this by launching the kernel once per 2D-array, but that is naive, and I would like to do the whole computation in one go.
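
The naive version is essentially a host-side loop of kernel launches, one per array, roughly like this sketch (array_mean stands in for a hypothetical kernel that reduces one array to its mean, and d_mats / d_means are assumed device buffers):

// Naive approach (sketch): one launch per 2D-array. array_mean is a
// hypothetical kernel that reduces one 1000x1000 array to its mean using
// a single block; d_mats holds all 2000 arrays contiguously.
for (int a = 0; a < 2000; a++)
    array_mean<<<1, 256>>>(d_mats + (size_t)a*1000*1000, d_means + a);
cudaDeviceSynchronize();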

What I have so far is a kernel that handles one 2D-array. I want to extend it so that a single kernel processes all 2000 2D-arrays:

#include <stdio.h>
#include <cuda.h>
#include <time.h>

void init_mat(float *a, const int N, const int M);
void print_mat(float *a, const int N, const int M, const char *d);
void print_array(float *a, const int N, const char *d);

const int threadsPerBlock=256;

__global__
void kernel(float *mat, float *out, const int N, const int M){

    __shared__ float cache[threadsPerBlock];

    int tid=threadIdx.x+blockIdx.x*blockDim.x;
    int cacheIndex = threadIdx.x;
    float sum=0;
    
    // each thread sums one column of the matrix
    if (tid<M) {
        for(int i=0; i<N; i++)
            sum += mat[(i*M)+tid];
        cache[cacheIndex] = sum;
        out[tid] = cache[cacheIndex];
    }

    __syncthreads();

    // tree reduction over the shared-memory cache
    int i = blockDim.x/2;
    while(i != 0) {
        if(cacheIndex<i)
            cache[cacheIndex] += cache[cacheIndex +i];
        __syncthreads();
        i /= 2;
    }

   if (cacheIndex == 0)
      out[blockIdx.x]=cache[0];
}

int main (void) {
   srand( time(NULL) );

   float *b, *c;
   float *dev_b, *dev_c;

   int N=1000;
   int M=1000;

   b=(float*)malloc(sizeof(float)*N*M);
   c=(float*)malloc(sizeof(float)*M);

   init_mat(b, N, M);

   printf("<<<<<<<<<< initial data:\n");

   print_mat(b, N, M, "matrix");

   cudaMalloc((void**)&dev_b, sizeof(float)*N*M);
   cudaMalloc((void**)&dev_c, sizeof(float)*M);

   cudaMemcpy(dev_b, b, sizeof(float)*N*M, cudaMemcpyHostToDevice);

   printf("\n\nRunning Kernel...\n\n");
   kernel<<<M/256+1, 256>>>(dev_b, dev_c, N, M);


   cudaMemcpy(c, dev_c, sizeof(float)*M, cudaMemcpyDeviceToHost);

   cudaFree(dev_b);
   cudaFree(dev_c);

   printf(">>>>>>>>>> final data:\n");
   print_array(c, M, "out-vector");
   free(b);
   free(c);

   return 0;
}

void init_mat(float *a, const int N, const int M) {
   int i, j;
   for(i=0; i<N; i++)
      for(j=0; j<M; j++)
         a[i*M+j] = rand() % 100 + 1;
}

void print_mat(float *a, const int N, const int M, const char *d) {
   int i, j;
   for(i=0; i<N; i++){
      printf("\n%s[%d]:", d, i);
      for (j=0; j<M; j++)
         printf("\t%6.4f", a[i*M+j]);
    }
    printf("\n");
}

void print_array(float *a, const int N, const char *d) {
   int i;
   for(i=0; i<N; i++)
      printf("\n%s[%d]: %f",d, i, a[i]);
    printf("\n");
}

Solution

  • For a reasonably large number of arrays (e.g. 2000) and reasonably large arrays (e.g. 1000x1000, as in the question), the GPU can be fairly efficient if we assign one block to perform the sum reduction (and mean calculation) for each array. With 2000 arrays, we therefore launch 2000 blocks.

    To handle arbitrarily sized arrays with a fixed number of threads per block, we borrow the idea of the grid-striding loop, except that each block uses a block-striding loop to load the data belonging to its assigned array: the threads of each block "stride" through that array until all of its elements have been loaded.

    Apart from this, the main reduction operation is similar to what you have written, and computing the mean is then trivial: once the sum is available from the reduction, we divide by the array size before writing the result to global memory.

    Here is a worked example. If you compile with -DMEAN, the code computes the mean of each array; if you omit that switch, it computes the sum. Let N be the number of arrays and K the size of each array.

    $ cat t1285.cu
    #include <stdio.h>
    
    const size_t N = 1000; // number of arrays
    const size_t K = 1000; // size of each array
    const int nTPB = 256; // number of threads per block,  must be a power-of-2
    typedef float mytype; // type of data to be summed
    
    // produce the sum or mean of each array
    template <typename T>
    __global__ void breduce(const T * __restrict__ idata, T * __restrict__ odata, const int bsize){
    
      __shared__ T sdata[nTPB];
    
      T sum = 0;
      //block-striding loop
      size_t offset = (size_t)blockIdx.x*bsize + threadIdx.x;  // cast guards against unsigned-int overflow for large N*K
      while (offset < ((size_t)blockIdx.x+1)*bsize){
        sum += idata[offset];
        offset += blockDim.x;}
      sdata[threadIdx.x] = sum;
      __syncthreads();
      //shared memory reduction sweep
      for (int i = nTPB>>1; i > 0; i>>=1){
        if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x+i];
        __syncthreads();}
      // write output sum for this block/array
    #ifndef MEAN
      if (!threadIdx.x) odata[blockIdx.x] = sdata[0];
    #else
      if (!threadIdx.x) odata[blockIdx.x] = sdata[0]/bsize;
    #endif
    }
    
    int main(){
    
      mytype *h_idata, *h_odata, *d_idata, *d_odata;
      h_idata=(mytype *)malloc(N*K*sizeof(mytype));
      h_odata=(mytype *)malloc(N*sizeof(mytype));
      cudaMalloc(&d_idata, N*K*sizeof(mytype));
      cudaMalloc(&d_odata, N*sizeof(mytype));
      for (size_t i = 0; i < N; i++)
        for (size_t j = 0; j < K; j++)
          h_idata[i*K+j] = 1 + (i&1); // fill alternating arrays with 1 and 2
      memset(h_odata, 0, N*sizeof(mytype));  // zero out
      cudaMemset(d_odata, 0, N*sizeof(mytype)); // zero out
      cudaMemcpy(d_idata, h_idata, N*K*sizeof(mytype), cudaMemcpyHostToDevice);
      breduce<<<N, nTPB>>>(d_idata, d_odata, K);
      cudaMemcpy(h_odata, d_odata, N*sizeof(mytype), cudaMemcpyDeviceToHost);
      // validate
      for (size_t i = 0; i < N; i++)
    #ifndef MEAN
        if (h_odata[i] != (K*(1 + (i&1)))) {printf("mismatch at %zu, was: %f, should be: %f\n", i, (float)h_odata[i], (float)(K*(1 + (i&1)))); return 1;}
    #else
        if (h_odata[i] != ((1 + (i&1)))) {printf("mismatch at %zu, was: %f, should be: %f\n", i, (float)h_odata[i], (float)((1 + (i&1)))); return 1;}
    #endif
      return 0;
    }
    
    $ nvcc -arch=sm_35 -o t1285 t1285.cu -DMEAN
    $ cuda-memcheck ./t1285
    ========= CUDA-MEMCHECK
    ========= ERROR SUMMARY: 0 errors
    $ nvcc -arch=sm_35 -o t1285 t1285.cu
    $ cuda-memcheck ./t1285
    ========= CUDA-MEMCHECK
    ========= ERROR SUMMARY: 0 errors
    $
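
    To match the dimensions in the question, only the size constants need to change; the block-striding loop already handles an arbitrary per-array size. An untested sketch of the modified constants:

    const size_t N = 2000;        // 2000 arrays
    const size_t K = 1000*1000;   // each array is 1000x1000, stored contiguously

    When summing a million float values per array, you may also want typedef double mytype for extra accumulation accuracy, depending on your data.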