
Thread block clusters and distributed shared memory not working as intended


I have written a simple CUDA program that performs an array reduction using thread block clusters and distributed shared memory. I am compiling it with CUDA 12.0 and running it on a Hopper GPU. Below is the code I use:

#include <stdio.h>
#include <cooperative_groups.h>

#define CLUSTER_SIZE 4
#define BLOCK_SIZE 32

namespace cg = cooperative_groups;

__global__ void __cluster_dims__(CLUSTER_SIZE, 1, 1) 
cluster_reduce_sum(int n, float *arr, float *sum) 
{
    __shared__ float shared_mem[BLOCK_SIZE];
    __shared__ float cluster_sum;

    cluster_sum = 0.0f;

    cg::cluster_group cluster = cg::this_cluster();
    unsigned int cluster_block_rank = cluster.block_rank();
    unsigned int cluster_size = cluster.dim_blocks().x;
    
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;


    shared_mem[threadIdx.x] = 0.0f;
    if (idx < n) {
        shared_mem[threadIdx.x] = arr[idx];
    }

    __syncthreads();

    for (int offset = BLOCK_SIZE / 2; offset; offset /= 2) {
        if (threadIdx.x < offset) {
            shared_mem[threadIdx.x] += shared_mem[threadIdx.x + offset];
        }
        __syncthreads();
    }

    cluster.sync();

    if (threadIdx.x == 0) {
        atomicAdd(cluster.map_shared_rank(&cluster_sum, 0), shared_mem[0]);
        printf("blockIdx: %d  cluster_sum: %f\n", blockIdx.x,  (float)*cluster.map_shared_rank(&cluster_sum, 0));
    }

    cluster.sync();

    if (threadIdx.x == 0 && cluster_block_rank == 0) {
        atomicAdd(sum, cluster_sum);
    }

    cluster.sync();
}

int main(int argc, char* argv[]) {
    int n = 128;

    if (argc > 1) {
        n = atoi(argv[1]);
    }

    float *h_arr, *h_sum, sum;
    h_arr = (float*) malloc(n * sizeof(float));
    h_sum = (float*) malloc(sizeof(float));

    int upper = 1024, lower = -1024;

    sum = 0.0f;
    for(int i = 0; i < n; i++)
    {
        h_arr[i] = 1;
        sum += h_arr[i];
    }

    float *d_arr, *d_sum;
    cudaMalloc(&d_arr, n * sizeof(float));
    cudaMalloc(&d_sum, sizeof(float));

    cudaMemcpy(d_arr, h_arr, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(d_sum, 0, sizeof(float));

    int num_clusters = (n-1) / (CLUSTER_SIZE * BLOCK_SIZE) + 1;
    
    cluster_reduce_sum <<< CLUSTER_SIZE * num_clusters, BLOCK_SIZE >>> (n, d_arr, d_sum);

    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        return -1;
    }

    cudaMemcpy(h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost);

    if (*h_sum != sum) {
        printf("Kernel incorrect: %f vs %f\n", sum, *h_sum);
    }

    return 0;
}

The outputs of the kernel and the CPU do not match, and as far as I can tell there are no bugs in my code. Please help me find the issue with this code. Thanks!

Edit: adding some debug info to the question. I added a print statement just before the per-block sum (shared_mem[0]) is added to cluster_sum. The per-block sums all appear correct (32). However, when I print the final cluster_sum just before adding it to the total sum, it is not correct: the expected value of cluster_sum is 128, but I see 64. It seems the block sums are not being properly accumulated into the cluster sum.

Another interesting point I observed is that if I just change the input datatype from float to int, the code works fine and passes the output check.

Edit: I ran the code, and it looks like atomicAdd does not work correctly on float. I added several printf statements to the original code.
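
A minimal sketch that isolates just that operation (the kernel name and launch configuration below are illustrative, not part of the original program): every block of a single 4-block cluster performs one atomicAdd of 1.0f into block rank 0's shared variable, so the program should print 4.000000 if the distributed shared memory float atomic behaves correctly.

// Isolation sketch for the distributed shared memory float atomicAdd:
// each block of the cluster adds 1.0f into block rank 0's shared variable.
#include <stdio.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

__global__ void __cluster_dims__(4, 1, 1) dsmem_float_atomic_check()
{
    __shared__ float acc;
    cg::cluster_group cluster = cg::this_cluster();

    if (cluster.block_rank() == 0 && threadIdx.x == 0) {
        acc = 0.0f;                                        // only rank 0's copy is used
    }
    cluster.sync();

    if (threadIdx.x == 0) {
        atomicAdd(cluster.map_shared_rank(&acc, 0), 1.0f); // one add per block
    }
    cluster.sync();

    if (cluster.block_rank() == 0 && threadIdx.x == 0) {
        printf("expected 4.000000, got %f\n", acc);
    }
}

int main() {
    dsmem_float_atomic_check<<<4, 32>>>();                 // one cluster of 4 blocks
    cudaDeviceSynchronize();
    return 0;
}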


Solution

  • The issue reported here appears to be a defect in CUDA versions prior to 12.3, related to distributed shared memory atomics on float or double types. Updating to CUDA 12.3 should resolve it.

    As an alternative "verification" or "workaround", the data type can be changed from float to e.g. int in the provided code, and the behavior should then be correct even on CUDA 12.0. (A float-preserving sketch that avoids the distributed shared memory atomic follows at the end of this answer.)

    Example of that case on CUDA 12.0:

    $ cat t21.cu
    #include <stdio.h>
    #include <cooperative_groups.h>
    
    #define CLUSTER_SIZE 4
    #define BLOCK_SIZE 32
    
    namespace cg = cooperative_groups;
    using mt=int;
    __global__ void __cluster_dims__(CLUSTER_SIZE, 1, 1)
    cluster_reduce_sum(int n, mt *arr, mt *sum)
    {
        __shared__ mt shared_mem[BLOCK_SIZE];
        __shared__ mt cluster_sum;
    
        cluster_sum = 0;
    
        cg::cluster_group cluster = cg::this_cluster();
        unsigned int cluster_block_rank = cluster.block_rank();
        unsigned int cluster_size = cluster.dim_blocks().x;
    
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    
        shared_mem[threadIdx.x] = 0;
        if (idx < n) {
            shared_mem[threadIdx.x] = arr[idx];
        }
    
        __syncthreads();
    
        for (int offset = BLOCK_SIZE / 2; offset; offset /= 2) {
            if (threadIdx.x < offset) {
                shared_mem[threadIdx.x] += shared_mem[threadIdx.x + offset];
            }
            __syncthreads();
        }
    
        cluster.sync();
    
        if (threadIdx.x == 0) {
            atomicAdd(cluster.map_shared_rank(&cluster_sum, 0), shared_mem[0]);
        }
    
        cluster.sync();
    
        if (threadIdx.x == 0 && cluster_block_rank == 0) {
            atomicAdd(sum, cluster_sum);
        }
    
        cluster.sync();
    }
    
    int main(int argc, char* argv[]) {
        int n = 128;
    
        if (argc > 1) {
            n = atoi(argv[1]);
        }
    
        mt *h_arr, *h_sum, sum;
        h_arr = (mt*) malloc(n * sizeof(mt));
        h_sum = (mt*) malloc(sizeof(mt));
    
        //int upper = 1024, lower = -1024;
    
        sum = 0;
        for(int i = 0; i < n; i++)
        {
            h_arr[i] = 1;
            sum += h_arr[i];
        }
    
        mt *d_arr, *d_sum;
        cudaMalloc(&d_arr, n * sizeof(mt));
        cudaMalloc(&d_sum, sizeof(mt));
    
        cudaMemcpy(d_arr, h_arr, n * sizeof(mt), cudaMemcpyHostToDevice);
    
        //int num_clusters = ceil ((float)n / (CLUSTER_SIZE * BLOCK_SIZE)) + 1;
        int num_clusters = 1;
        cluster_reduce_sum <<< CLUSTER_SIZE * num_clusters, BLOCK_SIZE >>> (n, d_arr, d_sum);
    
        cudaError_t error = cudaGetLastError();
        if(error != cudaSuccess)
        {
            printf("CUDA error: %s\n", cudaGetErrorString(error));
            return -1;
        }
    
        cudaMemcpy(h_sum, d_sum, sizeof(mt), cudaMemcpyDeviceToHost);
    
        if (*h_sum != sum) {
            printf("Kernel incorrect: %f vs %f\n", (float)sum, (float)(*h_sum));
        }
    
        return 0;
    }
    $ nvcc --version
    nvcc: NVIDIA (R) Cuda compiler driver
    Copyright (c) 2005-2023 NVIDIA Corporation
    Built on Fri_Jan__6_16:45:21_PST_2023
    Cuda compilation tools, release 12.0, V12.0.140
    Build cuda_12.0.r12.0/compiler.32267302_0
    $ nvcc -o t21 t21.cu -arch=sm_90
    $ compute-sanitizer ./t21
    ========= COMPUTE-SANITIZER
    ========= ERROR SUMMARY: 0 errors
    $
    

    (The relevant bug ID is 4112521.)

    (The num_clusters calculation was also incorrect, but that did not materially affect the observation.)
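
    If the data needs to stay in float on a pre-12.3 toolkit, another possible workaround is to avoid the distributed shared memory float atomic entirely: each block publishes its partial sum in its own shared memory, and after a cluster.sync() block rank 0 reads the remote partials through map_shared_rank() and accumulates them locally. The kernel below is a minimal sketch of that idea (the block_sum variable is introduced here for illustration; the kernel drops into the question's main() unchanged and has not been verified against the affected toolkit):

    __global__ void __cluster_dims__(CLUSTER_SIZE, 1, 1)
    cluster_reduce_sum(int n, float *arr, float *sum)
    {
        __shared__ float shared_mem[BLOCK_SIZE];
        __shared__ float block_sum;            // this block's partial result

        cg::cluster_group cluster = cg::this_cluster();
        unsigned int cluster_block_rank = cluster.block_rank();
        unsigned int cluster_size = cluster.dim_blocks().x;

        const int idx = blockIdx.x * blockDim.x + threadIdx.x;

        shared_mem[threadIdx.x] = (idx < n) ? arr[idx] : 0.0f;
        __syncthreads();

        // standard in-block tree reduction
        for (int offset = BLOCK_SIZE / 2; offset; offset /= 2) {
            if (threadIdx.x < offset) {
                shared_mem[threadIdx.x] += shared_mem[threadIdx.x + offset];
            }
            __syncthreads();
        }

        // publish this block's partial sum in its own shared memory
        if (threadIdx.x == 0) {
            block_sum = shared_mem[0];
        }

        // make every block's block_sum visible across the cluster
        cluster.sync();

        // block rank 0 reads the remote partials directly; no shared memory
        // atomics are involved, only one ordinary global memory atomicAdd
        if (cluster_block_rank == 0 && threadIdx.x == 0) {
            float total = 0.0f;
            for (unsigned int r = 0; r < cluster_size; r++) {
                total += *cluster.map_shared_rank(&block_sum, r);
            }
            atomicAdd(sum, total);
        }

        // keep all blocks resident until the remote reads have completed
        cluster.sync();
    }

    This keeps the arithmetic in float while replacing the per-block float atomicAdd on distributed shared memory with plain remote reads, which sidesteps the affected code path.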