Search code examples
Tags: c, image, cuda, histogram, gpu-shared-memory

Shared memory declaration inside device


How many shared memory declarations are allowed inside the device kernel in CUDA?

Can we do something like this:

extern __shared__ float a[];
extern __shared__ float b[];

I wish to have 2 arrays of different sizes. For instance in a 1024x768 image. I can do parallel minimization by first minimizing across rows and then across columns. So to store intermediate values i will require

sizeof(a)/sizeof(float) == 768

and

sizeof(b)/sizeof(float) == 1024

Or should I just initialize one long 1D shared array and append a and b?


Solution

  • You can have as many shared memory declarations as you like. However, the runtime only allocates a single shared memory buffer, and each shared memory array will be allocated the same address (i.e. the starting address of the shared memory allocation). So, for example, this:

    #include <cstdio>
    
    // NOTE: all three extern __shared__ declarations alias the SAME
    // dynamic shared memory allocation -- each array name resolves to
    // the base address of the per-block shared memory buffer, which is
    // exactly what this example demonstrates.
    extern __shared__ int a[];
    extern __shared__ int b[];
    extern __shared__ int c[];
    
    // Prints the base address of each aliased shared array.
    // Launch with any non-zero dynamic shared memory size.
    __global__
    void kernel(void)
    {
        int * a0 = &a[0];
        int * b0 = &b[0];
        int * c0 = &c[0];
    
        // %p is the only correct conversion for pointer arguments;
        // %#x expects unsigned int and truncates 64-bit addresses.
        printf("a0 = %p \n", (void *)a0);
        printf("b0 = %p \n", (void *)b0);
        printf("c0 = %p \n", (void *)c0);
    }
    
    int main()
    {
        // Third launch-config parameter = dynamic shared memory size in BYTES.
        kernel<<<1,1,1024>>>();
        // Surface any launch/execution error before tearing the context down.
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess)
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        cudaDeviceReset();
    
        return 0;
    }
    

    does this:

    $ nvcc -arch=sm_30 -run extshm.cu 
    a0 = 0x1000000 
    b0 = 0x1000000 
    c0 = 0x1000000 
    

    If you wanted to have two shared arrays, then on any supported (i.e. compute capability >= 2.0) GPU, you can do something like this:

    #include <cstdio>
    
    // Single dynamic shared memory allocation, manually partitioned into
    // three logical int arrays: a[0..1023], b = a+1024 (768 ints),
    // c = a+1024+768 (512 ints).
    extern __shared__ int a[];
    
    // Prints the base address of each logical sub-array carved out of
    // the one dynamic shared memory buffer.
    __global__
    void kernel(void)
    {
        int * a0 = &a[0];
        int * b0 = &a[1024];
        int * c0 = &a[1024+768];
    
        // %p is the only correct conversion for pointer arguments;
        // %#x expects unsigned int and truncates 64-bit addresses.
        printf("a0 = %p \n", (void *)a0);
        printf("b0 = %p \n", (void *)b0);
        printf("c0 = %p \n", (void *)c0);
    }
    
    int main()
    {
        // The third launch-config parameter is a size in BYTES, not
        // elements: three int arrays of 1024, 768 and 512 elements need
        // (1024+768+512)*sizeof(int) bytes. Passing the bare element
        // count (as the original did) under-allocates by a factor of 4.
        kernel<<<1,1,(1024+768+512)*sizeof(int)>>>();
        // Surface any launch/execution error before tearing the context down.
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess)
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        cudaDeviceReset();
    
        return 0;
    }
    

    which gives:

    nvcc -arch=sm_30 -run extshm2.cu 
    a0 = 0x1000000 
    b0 = 0x1001000 
    c0 = 0x1001c00 
    

    The latter is what you are looking for, I think.