I have a CUDA kernel that needs to use an atomic function on volatile shared integer memory. However, when I try to declare the shared memory as volatile and use it in an atomic function, I get an error message.
Below is some minimalist code that reproduces the error. Please note that the following kernel does nothing and horribly abuses why you would ever want to declare shared memory as volatile (or even use shared memory at all). But it does reproduce the error.
The code uses atomic functions on shared memory, so, to run it, you probably need to compile with "arch12" or higher (in Visual Studio 2010, right click on your project and go to "Properties -> Configuration Properties -> CUDA C/C++ -> Device" and enter "compute_12,sm_12" in the "Code Generation" line). The code should otherwise compile as is.
#include <cstdlib>
#include <cuda_runtime.h>
static int const X_THRDS_PER_BLK = 32;
static int const Y_THRDS_PER_BLK = 8;
__global__ void KernelWithSharedMemoryAndAtomicFunction(int * d_array, int numTotX, int numTotY)
{
__shared__ int s_blk[Y_THRDS_PER_BLK][X_THRDS_PER_BLK]; // compiles
//volatile __shared__ int s_blk[Y_THRDS_PER_BLK][X_THRDS_PER_BLK]; // will not compile
int tx = threadIdx.x;
int ty = threadIdx.y;
int mx = blockIdx.x*blockDim.x + threadIdx.x;
int my = blockIdx.y*blockDim.y + threadIdx.y;
int mi = my*numTotX + mx;
if (mx < numTotX && my < numTotY)
{
s_blk[ty][tx] = d_array[mi];
__syncthreads();
atomicMin(&s_blk[ty][tx], 4); // will compile with volatile shared memory only if this line is commented out
__syncthreads();
d_array[mi] = s_blk[ty][tx];
}
}
int main(void)
{
// Declare and initialize some array on host
int const NUM_TOT_X = 4*X_THRDS_PER_BLK;
int const NUM_TOT_Y = 6*Y_THRDS_PER_BLK;
int * h_array = (int *)malloc(NUM_TOT_X*NUM_TOT_Y*sizeof(int));
for (int i = 0; i < NUM_TOT_X*NUM_TOT_Y; ++i) h_array[i] = i;
// Copy array to device
int * d_array;
cudaMalloc((void **)&d_array, NUM_TOT_X*NUM_TOT_Y*sizeof(int));
cudaMemcpy(d_array, h_array, NUM_TOT_X*NUM_TOT_Y*sizeof(int), cudaMemcpyHostToDevice);
// Declare block and thread variables
dim3 thdsPerBlk;
dim3 blks;
thdsPerBlk.x = X_THRDS_PER_BLK;
thdsPerBlk.y = Y_THRDS_PER_BLK;
thdsPerBlk.z = 1;
blks.x = (NUM_TOT_X + X_THRDS_PER_BLK - 1)/X_THRDS_PER_BLK;
blks.y = (NUM_TOT_Y + Y_THRDS_PER_BLK - 1)/Y_THRDS_PER_BLK;
blks.z = 1;
// Run kernel
KernelWithSharedMemoryAndAtomicFunction<<<blks, thdsPerBlk>>>(d_array, NUM_TOT_X, NUM_TOT_Y);
// Cleanup
free (h_array);
cudaFree(d_array);
return 0;
}
Anyway, if you comment out the "s_blk" declaration towards the top of the kernel and uncomment the commented-out declaration immediately following it, then you should get the following error:
error : no instance of overloaded function "atomicMin" matches the argument list
I do not understand why declaring the shared memory as volatile would affect its type, as (I think) this error message is indicating, nor why it cannot be used with atomic operations.
Can anyone please provide any insight?
Thanks,
Aaron
Just replace
atomicMin(&s_blk[ty][tx], 4);
by
atomicMin((int *)&s_blk[ty][tx], 4);
.
It typecasts &s_blk[ty][tx]
so it matches the argument of atomicMin(..)
.