It is a mystery to me how shared memory on CUDA devices works. I was curious to count the threads that have access to the same shared memory. For this I wrote a simple program:
#include <cuda_runtime.h>
#include <stdio.h>
#define nblc 13
#define nthr 1024
//------------------------@device--------------------
__device__ int inwarpD[nblc];
__global__ void kernel(){
// One copy of this variable exists per block; all nthr threads of the block see it.
__shared__ int mywarp;
// Race condition: every thread zeroes and then read-modify-writes mywarp with
// no atomics and no synchronization, so the final value is whatever one
// thread's store happens to leave behind.
mywarp=0;
for (int i=0;i<5;i++) mywarp += (10000*threadIdx.x+1);
__syncthreads();
// Each block publishes its (race-determined) result to global memory.
inwarpD[blockIdx.x]=mywarp;
}
//------------------------@host-----------------------
int main(int argc, char **argv){
// Host-side buffer for one result per block.
int inwarpH[nblc];
// NOTE(review): hard-coded device ordinal and unchecked return value — this
// fails silently on machines with fewer than 3 GPUs.
cudaSetDevice(2);
kernel<<<nblc, nthr>>>();
// Blocking copy issued in the default stream, so it implicitly waits for the
// kernel to finish before copying inwarpD back to the host.
cudaMemcpyFromSymbol(inwarpH, inwarpD, nblc*sizeof(int), 0, cudaMemcpyDeviceToHost);
for (int i=0;i<nblc;i++) printf("%i : %i\n",i, inwarpH[i]);
}
and ran it on a K80 GPU. Since several threads have access to the same shared-memory variable, I was expecting that this variable would be updated 5*nthr
times, albeit not in the same cycle because of bank conflicts. However, the output indicates that the mywarp
shared variable was updated only 5 times. In each block a different thread accomplished this task:
0 : 35150005
1 : 38350005
2 : 44750005
3 : 38350005
4 : 51150005
5 : 38350005
6 : 38350005
7 : 38350005
8 : 51150005
9 : 44750005
10 : 51150005
11 : 38350005
12 : 38350005
Instead, I was expecting
523776*10000 + 5*1024 = 5237765120
for each block. Can someone kindly explain to me where my understanding of shared memory fails? I would also like to know how it would be possible for all threads in one block to access (update) the same shared variable. I know it is not possible in the same MP cycle. Serialization is fine for me because it is going to be a rare event.
Let's walk through the PTX that it generates.
//Declare some registers
.reg .s32 %r<5>;
.reg .s64 %rd<4>;
// the __shared__ variable, demoted to a mangled per-block symbol
.shared .align 4 .u32 _Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp;
//load tid.x into register r1
mov.u32 %r1, %tid.x;
//multiply tid by 50000, add 5, and store in r2 (the 5-iteration loop was folded away)
mad.lo.s32 %r2, %r1, 50000, 5;
//store result in shared memory — every thread stores here, which is the race
st.shared.u32 [_Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp], %r2;
///synchronize
bar.sync 0;
//load from shared memory and store in r3
ld.shared.u32 %r3, [_Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp];
//compute &inwarpD[blockIdx.x]: block index times sizeof(int), added to the base address
mov.u32 %r4, %ctaid.x;
mul.wide.u32 %rd1, %r4, 4;
mov.u64 %rd2, inwarpD;
add.s64 %rd3, %rd2, %rd1;
//store r3 in global memory
st.global.u32 [%rd3], %r3;
ret;
So basically
for (int i=0;i<5;i++)
mywarp += (10000*threadIdx.x+1);
is being optimized down to
mywarp=50000*threadIdx.x+5
so you're not experiencing a bank conflict. You are experiencing a race condition.