
CUDA atomic function usage with volatile shared memory


I have a CUDA kernel that needs to use an atomic function on volatile shared integer memory. However, when I try to declare the shared memory as volatile and use it in an atomic function, I get an error message.

Below is some minimalist code that reproduces the error. Please note that the following kernel does nothing useful and is not a sensible use of volatile shared memory (or even of shared memory at all), but it does reproduce the error.

The code uses atomic functions on shared memory, so it needs to be compiled for compute capability 1.2 or higher (in Visual Studio 2010, right-click on your project, go to "Properties -> Configuration Properties -> CUDA C/C++ -> Device", and enter "compute_12,sm_12" in the "Code Generation" line). The code should otherwise compile as is.
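If you compile from the command line with nvcc instead of Visual Studio, an equivalent invocation would look something like the following (the file name is just an example, and this assumes a toolkit version that still supports sm_12):

nvcc -arch=sm_12 -o atomic_volatile atomic_volatile.cu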

#include <cstdlib>
#include <cuda_runtime.h>

static int const X_THRDS_PER_BLK = 32;
static int const Y_THRDS_PER_BLK = 8;

__global__ void KernelWithSharedMemoryAndAtomicFunction(int * d_array, int numTotX, int numTotY)
{
              __shared__ int s_blk[Y_THRDS_PER_BLK][X_THRDS_PER_BLK]; // compiles
   //volatile __shared__ int s_blk[Y_THRDS_PER_BLK][X_THRDS_PER_BLK]; // will not compile

   int tx = threadIdx.x;
   int ty = threadIdx.y;

   int mx = blockIdx.x*blockDim.x + threadIdx.x;
   int my = blockIdx.y*blockDim.y + threadIdx.y;
   int mi = my*numTotX + mx;

   if (mx < numTotX && my < numTotY)
   {
      s_blk[ty][tx] = d_array[mi];

      __syncthreads();

      atomicMin(&s_blk[ty][tx], 4); // will compile with volatile shared memory only if this line is commented out

      __syncthreads();

      d_array[mi] = s_blk[ty][tx];
   }
}

int main(void)
{
   // Declare and initialize some array on host
   int const NUM_TOT_X = 4*X_THRDS_PER_BLK;
   int const NUM_TOT_Y = 6*Y_THRDS_PER_BLK;

   int * h_array = (int *)malloc(NUM_TOT_X*NUM_TOT_Y*sizeof(int));

   for (int i = 0; i < NUM_TOT_X*NUM_TOT_Y; ++i) h_array[i] = i;

   // Copy array to device
   int * d_array;
   cudaMalloc((void **)&d_array, NUM_TOT_X*NUM_TOT_Y*sizeof(int));

   cudaMemcpy(d_array, h_array, NUM_TOT_X*NUM_TOT_Y*sizeof(int), cudaMemcpyHostToDevice);

   // Declare block and thread variables
   dim3 thdsPerBlk;
   dim3 blks;

   thdsPerBlk.x = X_THRDS_PER_BLK;
   thdsPerBlk.y = Y_THRDS_PER_BLK;
   thdsPerBlk.z = 1;

   blks.x = (NUM_TOT_X + X_THRDS_PER_BLK - 1)/X_THRDS_PER_BLK;
   blks.y = (NUM_TOT_Y + Y_THRDS_PER_BLK - 1)/Y_THRDS_PER_BLK;
   blks.z = 1;

   // Run kernel
   KernelWithSharedMemoryAndAtomicFunction<<<blks, thdsPerBlk>>>(d_array, NUM_TOT_X, NUM_TOT_Y);

   // Cleanup
   free    (h_array);
   cudaFree(d_array);

   return 0;
}

Anyway, if you comment out the "s_blk" declaration towards the top of the kernel and uncomment the commented-out declaration immediately following it, then you should get the following error:

error : no instance of overloaded function "atomicMin" matches the argument list

I do not understand why declaring the shared memory as volatile should change its type, which is what this error message seems to indicate, nor why that would prevent it from being used with atomic operations.

Can anyone please provide any insight?

Thanks,

Aaron


Solution

  • Just replace
    atomicMin(&s_blk[ty][tx], 4);
    with
    atomicMin((int *)&s_blk[ty][tx], 4);

    The cast converts &s_blk[ty][tx] from volatile int * to int * so that it matches the parameter type of atomicMin(). None of the atomicMin() overloads accepts a volatile-qualified pointer, and C++ will not implicitly discard the volatile qualifier, which is why the compiler reports that no overload matches the argument list.
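For completeness, here is a sketch of the kernel with the fix applied; it is identical to the kernel above except for the volatile declaration and the cast in the atomicMin call:

__global__ void KernelWithSharedMemoryAndAtomicFunction(int * d_array, int numTotX, int numTotY)
{
   volatile __shared__ int s_blk[Y_THRDS_PER_BLK][X_THRDS_PER_BLK]; // now compiles

   int tx = threadIdx.x;
   int ty = threadIdx.y;

   int mx = blockIdx.x*blockDim.x + threadIdx.x;
   int my = blockIdx.y*blockDim.y + threadIdx.y;
   int mi = my*numTotX + mx;

   if (mx < numTotX && my < numTotY)
   {
      s_blk[ty][tx] = d_array[mi];

      __syncthreads();

      // Cast away volatile so the address matches atomicMin's int * parameter.
      atomicMin((int *)&s_blk[ty][tx], 4);

      __syncthreads();

      d_array[mi] = s_blk[ty][tx];
   }
}

Casting away the volatile qualifier does not undermine the intent of declaring the array volatile here: the atomic operation always reads and writes the shared-memory location itself rather than a cached or registered copy.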