I like to do CUDA synchronization for multiple blocks. It is not for each block where __syncthreads() can easily handle it.
I saw there are exiting discussions on this topic, for example cuda block synchronization, and I like the simple solution brought up by @johan, https://stackoverflow.com/a/67252761/3188690, essentially it uses a 64 bits counter to track the synchronized blocks.
However, I wrote the following code trying to accomplish the similar job but meet a problem. Here I used the term environment
so that the wkNumberEnvs
of blocks within this environment shall be synchronized. It has a counter. I used atomicAdd()
to count how many blocks have already been synchronized themselves, once the number of sync blocks == wkBlocksPerEnv
, I know all blocks finished sync and it is free to go. However, it has a strange outcome that I am not sure why.
The problem comes from this while loop. Since the first threads of all blocks are doing the atomicAdd, there is a while loop to check until the condition meets. But I find that some blocks will be stuck into the endless loop, which I am not sure why the condition cannot be met eventually? And if I printf some messages either in *** I can print here 1
or *** I can print here 2
, there is no endless loop and everything is perfect. I do not see something obvious.
const int wkBlocksPerEnv = 2;
__device__ int env_sync_block_count[wkNumberEnvs];
__device__ void syncthreads_for_env(){
// sync threads for each block so all threads in this block finished the previous tasks
__syncthreads();
// sync threads for wkBlocksPerEnv blocks for each environment
if(wkBlocksPerEnv > 1){
const int kThisEnvId = get_env_scope_block_id(blockIdx.x);
if (threadIdx.x == 0){
// incrementing env_sync_block_count by 1
atomicAdd(&env_sync_block_count[kThisEnvId], 1);
// *** I can print here 1
while(env_sync_block_count[kThisEnvId] != wkBlocksPerEnv){
// *** I can print here 2
}
// Do the next job ...
}
}
There are two potential issues with your code. Caching and block scheduling.
Caching can prevent you from observing an updated value during the while loop.
Block scheduling can cause a dead-lock if you wait for an update of a block which has not yet been scheduled. Since CUDA does not guarantee a specific order of scheduled blocks, the only way to prevent this dead-lock is to limit the number of blocks in the grid such that all blocks can run simultaneously.
Following code shows how you could synchronize multiple blocks while avoiding above issues. I adapted the code from the multi-grid synchronization given in the CUDA-sample conjugateGradientMultiDeviceCG
https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu#L186
On pre-Volta devices, it uses volatile memory accesses. Volta and later uses acquire/release semantics. Grid size is limited by querying device properties.
#include <cassert>
#include <cstdio>
constexpr int wkBlocksPerEnv = 13;
__device__
int getEnv(int blockId){
return blockId / wkBlocksPerEnv;
}
__device__
int getRankInEnv(int blockId){
return blockId % wkBlocksPerEnv;
}
__device__
unsigned char load_arrived(unsigned char *arrived) {
#if __CUDA_ARCH__ < 700
return *(volatile unsigned char *)arrived;
#else
unsigned int result;
asm volatile("ld.acquire.gpu.global.u8 %0, [%1];"
: "=r"(result)
: "l"(arrived)
: "memory");
return result;
#endif
}
__device__
void store_arrived(unsigned char *arrived,
unsigned char val) {
#if __CUDA_ARCH__ < 700
*(volatile unsigned char *)arrived = val;
#else
unsigned int reg_val = val;
asm volatile(
"st.release.gpu.global.u8 [%1], %0;" ::"r"(reg_val) "l"(arrived)
: "memory");
// Avoids compiler warnings from unused variable val.
(void)(reg_val = reg_val);
#endif
}
#if 0
//wrong implementation which does not synchronize. to check that kernel assert does trigger without proper synchronization
__device__
void syncthreads_for_env(unsigned char* temp){
}
#else
//temp must have at least size sizeof(unsigned char) * total_number_of_blocks in grid
__device__
void syncthreads_for_env(unsigned char* temp){
__syncthreads();
const int env = getEnv(blockIdx.x);
const int blockInEnv = getRankInEnv(blockIdx.x);
unsigned char* const mytemp = temp + env * wkBlocksPerEnv;
if(threadIdx.x == 0){
if(blockInEnv == 0){
// Leader block waits for others to join and then releases them.
// Other blocks in env can arrive in any order, so the leader have to wait for
// all others.
for (int i = 0; i < wkBlocksPerEnv - 1; i++) {
while (load_arrived(&mytemp[i]) == 0)
;
}
for (int i = 0; i < wkBlocksPerEnv - 1; i++) {
store_arrived(&mytemp[i], 0);
}
__threadfence();
}else{
// Other blocks in env note their arrival and wait to be released.
store_arrived(&mytemp[blockInEnv - 1], 1);
while (load_arrived(&mytemp[blockInEnv - 1]) == 1)
;
}
}
__syncthreads();
}
#endif
__global__
void kernel(unsigned char* synctemp, int* array){
const int env = getEnv(blockIdx.x);
const int blockInEnv = getRankInEnv(blockIdx.x);
if(threadIdx.x == 0){
array[blockIdx.x] = 1;
}
syncthreads_for_env(synctemp);
if(threadIdx.x == 0){
int sum = 0;
for(int i = 0; i < wkBlocksPerEnv; i++){
sum += array[env * wkBlocksPerEnv + i];
}
assert(sum == wkBlocksPerEnv);
}
}
int main(){
const int smem = 0;
const int blocksize = 128;
int deviceId = 0;
int numSMs = 0;
int maxBlocksPerSM = 0;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&maxBlocksPerSM,
kernel,
blocksize,
smem
);
int maxBlocks = maxBlocksPerSM * numSMs;
maxBlocks -= maxBlocks % wkBlocksPerEnv; //round down to nearest multiple of wkBlocksPerEnv
printf("wkBlocksPerEnv %d, maxBlocks: %d\n", wkBlocksPerEnv, maxBlocks);
int* d_array;
unsigned char* d_synctemp;
cudaMalloc(&d_array, sizeof(int) * maxBlocks);
cudaMalloc(&d_synctemp, sizeof(unsigned char) * maxBlocks);
cudaMemset(d_synctemp, 0, sizeof(unsigned char) * maxBlocks);
kernel<<<maxBlocks, blocksize>>>(d_synctemp, d_array);
cudaFree(d_synctemp);
cudaFree(d_array);
return 0;
}