
How to properly synchronize multiple CUDA blocks


I would like to synchronize multiple CUDA blocks. This is not per-block synchronization, which __syncthreads() handles easily.

I saw there are existing discussions on this topic, for example cuda block synchronization, and I like the simple solution brought up by @johan, https://stackoverflow.com/a/67252761/3188690, which essentially uses a 64-bit counter to track the synchronized blocks.
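
For context, the idea there is roughly the following (my own paraphrase, not the exact code from that answer): each block bumps a global counter on arrival and spins until every block has arrived, which only works if all blocks of the grid are resident on the GPU at the same time.

__device__ unsigned long long arrived_count = 0;

// One-shot barrier across all blocks of the grid (paraphrased sketch).
__device__ void grid_barrier_once(){
    __syncthreads();
    if (threadIdx.x == 0){
        atomicAdd(&arrived_count, 1ULL);
        // volatile load so the counter is re-read from memory on each iteration
        while (*(volatile unsigned long long *)&arrived_count < gridDim.x)
            ;
    }
    __syncthreads();
}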

However, I wrote the following code trying to accomplish a similar job, but ran into a problem. Here I use the term environment, meaning that the wkBlocksPerEnv blocks within one environment shall be synchronized. Each environment has a counter. I use atomicAdd() to count how many blocks have finished their synchronization; once the number of synced blocks == wkBlocksPerEnv, I know all blocks in the environment are done and everyone is free to go. However, it has a strange outcome that I cannot explain.

The problem comes from the while loop. The first thread of each block does the atomicAdd, then spins in the while loop until the condition is met. But I find that some blocks get stuck in an endless loop, and I am not sure why the condition is never met eventually. Strangely, if I printf a message at either *** I can print here 1 or *** I can print here 2, the endless loop goes away and everything works. I do not see anything obvious.

const int wkNumberEnvs = 2;
const int wkBlocksPerEnv = 2;

__device__ int env_sync_block_count[wkNumberEnvs];

__device__ void syncthreads_for_env(){
    // sync the threads of this block so all of them have finished the previous tasks
    __syncthreads();

    // sync the wkBlocksPerEnv blocks of each environment
    if(wkBlocksPerEnv > 1){
        const int kThisEnvId = get_env_scope_block_id(blockIdx.x);

        if (threadIdx.x == 0){
            // increment env_sync_block_count by 1
            atomicAdd(&env_sync_block_count[kThisEnvId], 1);
            // *** I can print here 1
            while(env_sync_block_count[kThisEnvId] != wkBlocksPerEnv){
                // *** I can print here 2
            }
        }
    }

    // Do the next job ...
}

Solution

  • There are two potential issues with your code: caching and block scheduling.

    Caching can prevent you from observing an updated value during the while loop: the load in the loop is neither volatile nor atomic, so the compiler is free to keep the value in a register and never re-read it from memory.
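
    As an illustration, the spin loop from the question could be made to re-read the counter in one of the following ways (hypothetical variants using standard CUDA idioms; note that this alone does not fix the scheduling issue described next):

    // Variant 1: volatile load, so the compiler must re-read memory each iteration.
    while (*(volatile int *)&env_sync_block_count[kThisEnvId] != wkBlocksPerEnv)
        ;

    // Variant 2: atomicAdd of 0 is an atomic read that always goes to memory.
    while (atomicAdd(&env_sync_block_count[kThisEnvId], 0) != wkBlocksPerEnv)
        ;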

    Block scheduling can cause a deadlock if you wait for an update from a block which has not yet been scheduled. Since CUDA does not guarantee any specific scheduling order of blocks, the only way to prevent this deadlock is to limit the number of blocks in the grid such that all of them can be resident on the GPU simultaneously (see the occupancy query in main() below).

    The following code shows how you could synchronize multiple blocks while avoiding the above issues. I adapted the code from the multi-grid synchronization in the CUDA sample conjugateGradientMultiDeviceCG: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu#L186

    On pre-Volta devices, it uses volatile memory accesses; Volta and later use acquire/release semantics. The grid size is limited by querying device properties.

    
    #include <cassert>
    #include <cstdio>
    
    constexpr int wkBlocksPerEnv = 13;
    
    __device__
    int getEnv(int blockId){
        return blockId / wkBlocksPerEnv;
    }
    
    __device__
    int getRankInEnv(int blockId){
        return blockId % wkBlocksPerEnv;
    }
    
    __device__
    unsigned char load_arrived(unsigned char *arrived) {
    #if __CUDA_ARCH__ < 700
        // Pre-Volta: a volatile load forces a fresh read from memory.
        return *(volatile unsigned char *)arrived;
    #else
        // Volta and later: load with acquire semantics, GPU scope.
        unsigned int result;
        asm volatile("ld.acquire.gpu.global.u8 %0, [%1];"
                     : "=r"(result)
                     : "l"(arrived)
                     : "memory");
        return result;
    #endif
    }
    
    __device__
    void store_arrived(unsigned char *arrived, unsigned char val) {
    #if __CUDA_ARCH__ < 700
        // Pre-Volta: a volatile store writes the value through to memory.
        *(volatile unsigned char *)arrived = val;
    #else
        // Volta and later: store with release semantics, GPU scope.
        unsigned int reg_val = val;
        asm volatile(
            "st.release.gpu.global.u8 [%1], %0;" ::"r"(reg_val), "l"(arrived)
            : "memory");

        // Avoids compiler warnings from unused variable val.
        (void)(reg_val = reg_val);
    #endif
    }
    
    #if 0
    //wrong implementation which does not synchronize. to check that kernel assert does trigger without proper synchronization
    __device__ 
    void syncthreads_for_env(unsigned char* temp){
    
    }
    #else
    //temp must have at least size sizeof(unsigned char) * total_number_of_blocks in grid
    __device__ 
    void syncthreads_for_env(unsigned char* temp){
        __syncthreads();
        const int env = getEnv(blockIdx.x);
        const int blockInEnv = getRankInEnv(blockIdx.x);
        unsigned char* const mytemp = temp + env * wkBlocksPerEnv;
    
        if(threadIdx.x == 0){
            if(blockInEnv == 0){
                // Leader block waits for the others to join and then releases them.
                // The other blocks in the env can arrive in any order, so the
                // leader has to wait for all of them.
                for (int i = 0; i < wkBlocksPerEnv - 1; i++) {
                    while (load_arrived(&mytemp[i]) == 0)
                        ;
                }
                for (int i = 0; i < wkBlocksPerEnv - 1; i++) {
                    store_arrived(&mytemp[i], 0);
                }
                __threadfence();
            }else{
                // Other blocks in env note their arrival and wait to be released.
                store_arrived(&mytemp[blockInEnv - 1], 1);
                while (load_arrived(&mytemp[blockInEnv - 1]) == 1)
                    ;
            }
        }
    
        __syncthreads();
    }
    #endif
    
    __global__
    void kernel(unsigned char* synctemp, int* array){
        const int env = getEnv(blockIdx.x);
    
        if(threadIdx.x == 0){
            array[blockIdx.x] = 1;
        }
    
        syncthreads_for_env(synctemp);
        
        if(threadIdx.x == 0){
            int sum = 0;
            for(int i = 0; i < wkBlocksPerEnv; i++){
                sum += array[env * wkBlocksPerEnv + i];
            }
            assert(sum == wkBlocksPerEnv);
        }
    }
    
    
    int main(){
        
        const int smem = 0;
        const int blocksize = 128;
    
        int deviceId = 0;
        int numSMs = 0;
        int maxBlocksPerSM = 0;
    
        cudaGetDevice(&deviceId);
        cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId);
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &maxBlocksPerSM,
            kernel,
            blocksize, 
            smem
        );
    
        int maxBlocks = maxBlocksPerSM * numSMs;
        maxBlocks -= maxBlocks % wkBlocksPerEnv; //round down to nearest multiple of wkBlocksPerEnv
        printf("wkBlocksPerEnv %d, maxBlocks: %d\n", wkBlocksPerEnv, maxBlocks);
    
        int* d_array;
        unsigned char* d_synctemp;
        cudaMalloc(&d_array, sizeof(int) * maxBlocks);
    
        cudaMalloc(&d_synctemp, sizeof(unsigned char) * maxBlocks);
        cudaMemset(d_synctemp, 0, sizeof(unsigned char) * maxBlocks);
    
        kernel<<<maxBlocks, blocksize>>>(d_synctemp, d_array);
        // Synchronize so that a failing device-side assert is reported here.
        cudaDeviceSynchronize();

        cudaFree(d_synctemp);
        cudaFree(d_array);
    
        return 0;
    }
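
    To try it out, the example can be compiled with nvcc; any sm_70 or newer architecture exercises the acquire/release path (the file name sync_env.cu is just a placeholder):

    nvcc -arch=sm_70 sync_env.cu -o sync_env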