Search code examples
Tags: c, image, cuda, histogram, gpu-shared-memory

Shared memory declaration inside device


How many shared memory declarations are allowed inside the device kernel in CUDA?

Can we do something like this:

extern __shared__ float a[];
extern __shared__ float b[];

I wish to have 2 arrays of different sizes. For instance in a 1024x768 image. I can do parallel minimization by first minimizing across rows and then across columns. So to store intermediate values i will require

sizeof(a)/sizeof(float) == 768

and

sizeof(b)/sizeof(float) == 1024

Or should I just initialize one long 1D shared array and append a and b?


Solution

  • You can have as many shared memory declarations as you like. However, the runtime only allocates a single shared memory buffer, and each shared memory array will be allocated the same address (i.e. the starting address of the shared memory allocation). So, for example, this:

    #include <cstdio>
    
    // NOTE: all three extern __shared__ declarations alias the SAME
    // dynamic shared memory allocation -- each array name resolves to
    // the base address of the per-block shared memory buffer, which is
    // exactly what this example demonstrates.
    extern __shared__ int a[];
    extern __shared__ int b[];
    extern __shared__ int c[];
    
    // Prints the base address of each aliased shared array.
    // Launch with any non-zero dynamic shared memory size.
    __global__
    void kernel(void)
    {
        int * a0 = &a[0];
        int * b0 = &b[0];
        int * c0 = &c[0];
    
        // %p is the only correct conversion for pointer arguments;
        // %#x expects unsigned int and truncates 64-bit addresses.
        printf("a0 = %p \n", (void *)a0);
        printf("b0 = %p \n", (void *)b0);
        printf("c0 = %p \n", (void *)c0);
    }
    
    int main()
    {
        // Third launch-config parameter = dynamic shared memory size in BYTES.
        kernel<<<1,1,1024>>>();
        // Surface any launch/execution error before tearing the context down.
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess)
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        cudaDeviceReset();
    
        return 0;
    }
    

    does this:

    $ nvcc -arch=sm_30 -run extshm.cu 
    a0 = 0x1000000 
    b0 = 0x1000000 
    c0 = 0x1000000 
    

    If you wanted to have two shared arrays, then on any supported (i.e. compute capability >= 2.0) GPU, you can do something like this:

    #include <cstdio>
    
    // Single dynamic shared memory allocation, manually partitioned into
    // three logical int arrays: a[0..1023], b = a+1024 (768 ints),
    // c = a+1024+768 (512 ints).
    extern __shared__ int a[];
    
    // Prints the base address of each logical sub-array carved out of
    // the one dynamic shared memory buffer.
    __global__
    void kernel(void)
    {
        int * a0 = &a[0];
        int * b0 = &a[1024];
        int * c0 = &a[1024+768];
    
        // %p is the only correct conversion for pointer arguments;
        // %#x expects unsigned int and truncates 64-bit addresses.
        printf("a0 = %p \n", (void *)a0);
        printf("b0 = %p \n", (void *)b0);
        printf("c0 = %p \n", (void *)c0);
    }
    
    int main()
    {
        // The third launch-config parameter is a size in BYTES, not
        // elements: three int arrays of 1024, 768 and 512 elements need
        // (1024+768+512)*sizeof(int) bytes. Passing the bare element
        // count (as the original did) under-allocates by a factor of 4.
        kernel<<<1,1,(1024+768+512)*sizeof(int)>>>();
        // Surface any launch/execution error before tearing the context down.
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess)
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        cudaDeviceReset();
    
        return 0;
    }
    

    which gives:

    nvcc -arch=sm_30 -run extshm2.cu 
    a0 = 0x1000000 
    b0 = 0x1001000 
    c0 = 0x1001c00 
    

    The latter is what you are looking for, I think.