Search code examples
c++cudacublas

cublasGemmEx result is always zero


I tried matrix multiplication using cublasGemmEx. a and b are 1x1 half matrices. The result is always zero if I set the compute type and output data type to CUDA_R_16F, but the result is correct if I set the compute type and output data type to CUDA_R_32F.

Does anyone know why the result is zero if I set the types to CUDA_R_16F? Thanks in advance for your answers.

My CUDA version is 10.2 and my GPU is a T4. I build the code below with the command 'nvcc -arch=sm_75 test_cublas.cu -o test_cublas -lcublas'

#include "cublas_v2.h"
#include "library_types.h"
#include <stdio.h>

// Seeds the single-element device buffers used by the GEMM test:
//   *a = 1.0 and *b = 1.5 (stored as half), *c_half and *c_float = 0.
// There is no thread indexing: every thread that runs this kernel writes the
// same values to the same addresses (main launches it with <<<1, 1>>>).
__global__ void init_kernel(half *a, half *b, half *c_half, float *c_float)
{
    const half one          = __float2half_rn(1.0f);
    const half one_and_half = __float2half_rn(1.5f);
    const half zero_h       = __float2half_rn(0.0f);

    // Input operands.
    *a = one;
    *b = one_and_half;

    // Output accumulators, cleared so that beta * C contributes nothing.
    *c_half  = zero_h;
    *c_float = 0.0f;
}

// Debug helper: prints the four single-element device values from device code.
// The half values are widened to float first because printf has no half format.
__global__ void print_gpu_values(half *a, half *b, half *c_half, float *c_float)
{
    const float a_val      = __half2float(*a);
    const float b_val      = __half2float(*b);
    const float c_half_val = __half2float(*c_half);
    const float c_float_val = *c_float;

    printf("a %f, b %f, c_half %f, c_float %f\n", a_val, b_val, c_half_val, c_float_val);
}

// Reproduction case: multiplies two 1x1 half matrices with cublasGemmEx twice,
// once with a half output / CUDA_R_16F compute type and once with a float
// output / CUDA_R_32F compute type, then prints all values from the device.
// argc/argv are unused. Returns 0 on success; exits with -1 when one of the
// checked calls fails.
int main(int argc, char **argv)
{
    // All GPU work below is issued on this explicitly created non-blocking stream.
    cudaStream_t cudaStream;
    if (cudaSuccess != cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking))
    {
        printf("create cuda stream failed\n");
        exit(-1);
    }

    cublasHandle_t handle;
    // NOTE(review): the status returned by cublasCreate is not checked.
    cublasCreate(&handle);
    // Bind the handle to cudaStream so the GEMM calls are issued on the same
    // stream as the kernels launched below.
    if (CUBLAS_STATUS_SUCCESS != cublasSetStream(handle, cudaStream))
    {
        printf("cublas set stream failed\n");
        exit(-1);
    }

    // One-element device buffers: two half inputs, one half output, one float output.
    // NOTE(review): none of the cudaMalloc return values are checked, and the
    // buffers are never freed before main returns.
    half *a;
    half *b;
    half *c_half;
    float *c_float;
    cudaMalloc(&a, sizeof(half));
    cudaMalloc(&b, sizeof(half));
    cudaMalloc(&c_half, sizeof(half));
    cudaMalloc(&c_float,sizeof(float));
    // Host-side scaling factors, stored as float. These same two variables are
    // passed to BOTH cublasGemmEx calls below.
    float alpha = 1.0;
    float beta = 1.0;

    // Single-thread launch that writes the initial values of all four buffers.
    init_kernel<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);

    // GEMM 1: half inputs, half output, computeType CUDA_R_16F (m = n = k = 1,
    // all leading dimensions 1; b is passed as the A operand and a as the B operand).
    // NOTE(review): alpha and beta are float here, while the cuBLAS documentation
    // requires them to be "of same type as computeType" (i.e. half for
    // CUDA_R_16F). This mismatch is the defect this reproduction demonstrates.
    if (CUBLAS_STATUS_SUCCESS != cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
        &alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1, &beta, c_half, CUDA_R_16F, 1, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP))
    {
        printf("cublasGemmEx failed\n");
        exit(-1);
    }

    // GEMM 2: same half inputs, float output, computeType CUDA_R_32F. Here the
    // float alpha/beta match the compute type.
    if (CUBLAS_STATUS_SUCCESS != cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
        &alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1, &beta, c_float, CUDA_R_32F, 1, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP))
    {
        printf("cublasGemmEx failed\n");
        exit(-1);
    }

    // Print the results from device code, then wait for the stream to finish
    // before main returns.
    // NOTE(review): neither the kernel launches nor cudaStreamSynchronize are
    // checked for errors.
    print_gpu_values<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
    cudaStreamSynchronize(cudaStream);

    return 0;

}

Solution

  • According to the documentation for cublasGemmEx, specifically for alpha and beta parameters, both say:

    of same type as computeType

    However your code does not satisfy that requirement. For the (working) CUDA_R_32F case, your alpha and beta arguments of type float are matching. For the (non-working) CUDA_R_16F case, they do not match.

    When I modify your code with that change, I get a correct result on CUDA 11.0:

    # cat t3.cu
    #include "cublas_v2.h"
    #include "library_types.h"
    #include <stdio.h>
    
    // Writes the test operands into the one-element device buffers:
    //   *a = 1.0 and *b = 1.5 (as half), *c_half and *c_float = 0.
    // No thread index is used, so every launched thread performs the same
    // stores; main launches this kernel with <<<1, 1>>>.
    __global__ void init_kernel(half *a, half *b, half *c_half, float *c_float)
    {
        const half a_init = __float2half_rn(1.0f);
        const half b_init = __float2half_rn(1.5f);
        const half c_init = __float2half_rn(0.0f);

        // Inputs of the 1x1 product.
        *a = a_init;
        *b = b_init;

        // Both result slots start at zero.
        *c_half  = c_init;
        *c_float = 0.0f;
    }
    
    // Prints the current contents of the four one-element device buffers using
    // device-side printf. Half values are converted to float for the %f format.
    __global__ void print_gpu_values(half *a, half *b, half *c_half, float *c_float)
    {
        const float lhs        = __half2float(*a);
        const float rhs        = __half2float(*b);
        const float half_prod  = __half2float(*c_half);
        const float float_prod = *c_float;

        printf("a %f, b %f, c_half %f, c_float %f\n", lhs, rhs, half_prod, float_prod);
    }
    
    // Aborts the program with a diagnostic when a CUDA runtime call reports an
    // error. `what` names the operation for the message.
    static void check_cuda(cudaError_t status, const char *what)
    {
        if (cudaSuccess != status)
        {
            printf("%s failed: %s\n", what, cudaGetErrorString(status));
            exit(-1);
        }
    }

    // Multiplies two 1x1 half matrices with cublasGemmEx twice -- once with a
    // half output and CUDA_R_16F compute type, once with a float output and
    // CUDA_R_32F compute type -- and prints all values from the device.
    //
    // alpha/beta must be "of same type as computeType", so the half GEMM gets
    // half scalars (halpha/hbeta) and the float GEMM gets float scalars.
    //
    // argc/argv are unused. Returns 0 on success; exits with -1 as soon as any
    // CUDA or cuBLAS call fails.
    int main(int argc, char **argv)
    {
        (void)argc;
        (void)argv;

        // All GPU work is issued on one explicit non-blocking stream, so the
        // kernels and the GEMM calls execute in issue order.
        cudaStream_t cudaStream;
        if (cudaSuccess != cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking))
        {
            printf("create cuda stream failed\n");
            exit(-1);
        }

        cublasHandle_t handle;
        if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle))
        {
            printf("cublas create failed\n");
            exit(-1);
        }
        if (CUBLAS_STATUS_SUCCESS != cublasSetStream(handle, cudaStream))
        {
            printf("cublas set stream failed\n");
            exit(-1);
        }

        // One-element device buffers: two half inputs, one half and one float output.
        half *a = NULL;
        half *b = NULL;
        half *c_half = NULL;
        float *c_float = NULL;
        check_cuda(cudaMalloc(&a, sizeof(half)), "cudaMalloc(a)");
        check_cuda(cudaMalloc(&b, sizeof(half)), "cudaMalloc(b)");
        check_cuda(cudaMalloc(&c_half, sizeof(half)), "cudaMalloc(c_half)");
        check_cuda(cudaMalloc(&c_float, sizeof(float)), "cudaMalloc(c_float)");

        // Scaling factors in both precisions; each GEMM call receives the pair
        // whose type matches its compute type.
        float alpha = 1.0f;
        float beta = 1.0f;
        half halpha = __float2half_rn(alpha);
        half hbeta =  __float2half_rn(beta);

        init_kernel<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
        // A kernel launch returns nothing; launch errors surface here.
        check_cuda(cudaGetLastError(), "init_kernel launch");

        // GEMM 1: half inputs, half output, CUDA_R_16F compute type -> half scalars.
        if (CUBLAS_STATUS_SUCCESS != cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
            &halpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1, &hbeta, c_half, CUDA_R_16F, 1, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP))
        {
            printf("cublasGemmEx failed\n");
            exit(-1);
        }

        // GEMM 2: half inputs, float output, CUDA_R_32F compute type -> float scalars.
        if (CUBLAS_STATUS_SUCCESS != cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 1,
            &alpha, b, CUDA_R_16F, 1, a, CUDA_R_16F, 1, &beta, c_float, CUDA_R_32F, 1, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP))
        {
            printf("cublasGemmEx failed\n");
            exit(-1);
        }

        print_gpu_values<<<1, 1, 0, cudaStream>>>(a, b, c_half, c_float);
        check_cuda(cudaGetLastError(), "print_gpu_values launch");
        // Errors raised while the queued work executes are reported by the sync.
        check_cuda(cudaStreamSynchronize(cudaStream), "cudaStreamSynchronize");

        // Release everything acquired above.
        check_cuda(cudaFree(a), "cudaFree(a)");
        check_cuda(cudaFree(b), "cudaFree(b)");
        check_cuda(cudaFree(c_half), "cudaFree(c_half)");
        check_cuda(cudaFree(c_float), "cudaFree(c_float)");
        if (CUBLAS_STATUS_SUCCESS != cublasDestroy(handle))
        {
            printf("cublas destroy failed\n");
            exit(-1);
        }
        check_cuda(cudaStreamDestroy(cudaStream), "cudaStreamDestroy");

        return 0;
    }
    # nvcc t3.cu -o t3 -lcublas
    # cuda-memcheck ./t3
    ========= CUDA-MEMCHECK
    a 1.000000, b 1.500000, c_half 1.500000, c_float 1.500000
    ========= ERROR SUMMARY: 0 errors
    # nvcc --version
    nvcc: NVIDIA (R) Cuda compiler driver
    Copyright (c) 2005-2020 NVIDIA Corporation
    Built on Wed_Jul_22_19:09:09_PDT_2020
    Cuda compilation tools, release 11.0, V11.0.221
    Build cuda_11.0_bu.TC445_37.28845127_0
    #