I really hate to dump a lot of code here, but I wanted it to be compilable. The following code demonstrates a possible bug (most likely a misunderstanding on my part) in CuDNN.
#include <vector>
#include <iostream>
#include <sstream>
#include <stdexcept>   // std::runtime_error
#include <type_traits> // std::is_same
#include <cuda.h>
#include <cudnn.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudnnStatus_t code, const char *file, int line, bool abort=true)
{
if (code != CUDNN_STATUS_SUCCESS)
{
std::stringstream ss;
ss << "CuDNNassert: (" << code << ") " << cudnnGetErrorString(code) << " " << file << " " << line;
std::cerr << ss.str() << std::endl;
if (abort)
{
throw std::runtime_error(ss.str());
}
}
}
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
std::stringstream ss;
ss << "CUDAassert: (" << code << ") " << cudaGetErrorString(code) << " " << file << " " << line;
std::cerr << ss.str() << std::endl;
if (abort)
{
throw std::runtime_error(ss.str());
}
}
}
template<typename T>
cudnnDataType_t getCudnnType()
{
if(std::is_same<T, float>::value)
return CUDNN_DATA_FLOAT;
else if(std::is_same<T, double>::value)
return CUDNN_DATA_DOUBLE;
else if(std::is_same<T, int>::value)
return CUDNN_DATA_INT32;
else if(std::is_same<T, char>::value)
return CUDNN_DATA_INT8;
else
throw std::runtime_error("Cannot use any other type of");
}
template<typename T>
void _reduce(cudnnHandle_t& cudnn, T* gpuA, T** gpuB,
int n, int h, int w, int c,
int outN, int outH, int outW, int outC,
cudnnReduceTensorOp_t reduceType, cudnnTensorFormat_t format)
{
gpuErrchk( cudaMalloc(gpuB, outN*outH*outW*outC*sizeof(T)) );
gpuErrchk( cudaMemset(*gpuB, 0, outN*outH*outW*outC*sizeof(T)) );
cudnnDataType_t dType = getCudnnType<T>();
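// Note: cudnnSetTensor4dDescriptor always takes the dimensions in N, C, H, W order;
// the format argument only describes how those elements are laid out in memory.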
cudnnTensorDescriptor_t inputDescriptor;
gpuErrchk( cudnnCreateTensorDescriptor(&inputDescriptor) );
gpuErrchk( cudnnSetTensor4dDescriptor(inputDescriptor,
format,
dType,
n, c, h, w) );
cudnnTensorDescriptor_t outputDescriptor;
gpuErrchk( cudnnCreateTensorDescriptor(&outputDescriptor) );
gpuErrchk( cudnnSetTensor4dDescriptor(outputDescriptor,
format,
dType,
outN, outC, outH, outW) );
cudnnReduceTensorDescriptor_t reduceTensorDesc;
gpuErrchk( cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) );
gpuErrchk( cudnnSetReduceTensorDescriptor(reduceTensorDesc,
reduceType,
dType,
CUDNN_NOT_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_8BIT_INDICES) );
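// Ask cuDNN how much workspace and indices memory this reduction needs.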
size_t workspaceSize;
gpuErrchk( cudnnGetReductionWorkspaceSize(cudnn,
reduceTensorDesc,
inputDescriptor,
outputDescriptor,
&workspaceSize) );
size_t indicesSize;
gpuErrchk( cudnnGetReductionIndicesSize(cudnn,
reduceTensorDesc,
inputDescriptor,
outputDescriptor,
&indicesSize) );
float alpha = 1;
float beta = 0;
void* gpuWorkspace;
gpuErrchk( cudaMalloc(&gpuWorkspace, workspaceSize) );
void* gpuIndices;
gpuErrchk( cudaMalloc(&gpuIndices, indicesSize) );
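// Perform the reduction: C = alpha * reduce_op(A) + beta * C (see the cudnnReduceTensor docs).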
gpuErrchk( cudnnReduceTensor(cudnn,
reduceTensorDesc,
gpuIndices, indicesSize,
gpuWorkspace, workspaceSize,
&alpha,
inputDescriptor, gpuA,
&beta,
outputDescriptor, *gpuB) );
gpuErrchk( cudaDeviceSynchronize() );
gpuErrchk( cudnnDestroyReduceTensorDescriptor(reduceTensorDesc) );
gpuErrchk( cudnnDestroyTensorDescriptor(inputDescriptor) );
gpuErrchk( cudnnDestroyTensorDescriptor(outputDescriptor) );
gpuErrchk( cudaFree(gpuIndices) );
gpuErrchk( cudaFree(gpuWorkspace) );
}
int main(int argc, char **argv) {
std::cout << "cudnn ver: " << CUDNN_MAJOR << "." << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL << std::endl;
cudnnHandle_t cudnn;
gpuErrchk( cudnnCreate(&cudnn) );
std::vector<float> in = {3,5,7,11,13,17,19,23,29,31};
//NHWC: 3, 7, 13, 19, 29
// 5, 11, 17, 23, 31
//NCHW: 3, 5, 7, 11, 13
// 17, 19, 23, 29, 31
float* data_d;
int n = 1, h = 1, w = 5, c = 2;
size_t numElem = n*h*w*c;
size_t arrSize = numElem*sizeof(float);
//buffer to print results
std::vector<float> cpuRes(5);
gpuErrchk( cudaMalloc((void**) &data_d, arrSize) );
gpuErrchk( cudaMemcpy(data_d, &in[0], arrSize, cudaMemcpyHostToDevice) );
float* res_d;
_reduce(cudnn, data_d, &res_d,
n, h, w, c,
1, 1, 5, 1, //reduce along channels
CUDNN_REDUCE_TENSOR_ADD, CUDNN_TENSOR_NHWC); //use intended format
gpuErrchk( cudaMemcpy(&cpuRes[0], res_d, 5*sizeof(float), cudaMemcpyDeviceToHost) );
std::cout << "[";
for(auto& v : cpuRes)
std::cout << v << ",";
std::cout << "]" << std::endl;
//expected: [8,18,30,42,60,]
//result: [20,24,30,40,44,]
gpuErrchk( cudaFree(res_d) ); //next call will alloc again
_reduce(cudnn, data_d, &res_d,
n, h, w, c,
1, 1, 5, 1, //reduce along channels
CUDNN_REDUCE_TENSOR_ADD, CUDNN_TENSOR_NCHW); //use other format
gpuErrchk( cudaMemcpy(&cpuRes[0], res_d, 5*sizeof(float), cudaMemcpyDeviceToHost) );
std::cout << "[";
for(auto& v : cpuRes)
std::cout << v << ",";
std::cout << "]" << std::endl;
//expected: [20,24,30,40,44,]
//result: [20,24,30,40,44,]
gpuErrchk( cudaFree(res_d) );
gpuErrchk( cudaFree(data_d) );
gpuErrchk( cudnnDestroy(cudnn) );
return 0;
}
If you want to test this yourself, here is the CMake file I use to compile it:
cmake_minimum_required(VERSION 3.0)
project(Main)
find_package(OpenCV REQUIRED)
find_package(CUDA REQUIRED)
#find_package(CUDNN REQUIRED)
set(CMAKE_CXX_FLAGS "--std=c++11 -Wall -fPIC -D_GLIBCXX_USE_CXX11_ABI=0 -D GOOGLE_CUDA=1")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --default-stream per-thread" )
set(CMAKE_BUILD_TYPE Debug)
#pass flags to c++ compiler
set(CUDA_PROPAGATE_HOST_FLAGS ON)
set(MAIN_SRC
"main.cu"
)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
cuda_add_executable(Main ${MAIN_SRC})
target_link_libraries(Main ${OpenCV_LIBS} ${CUDA_LIBRARIES} cudnn stdc++fs)
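If you don't use CMake, compiling directly with nvcc along these lines should also work (assuming CUDA and CuDNN are on the default paths; adjust to your install):
nvcc -std=c++11 main.cu -lcudnn -o Main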
The output to the console is:
cudnn ver: 7.3.1
[20,24,30,40,44,]
[20,24,30,40,44,]
This is obviously the wrong output. Changing the dimension order should result in different values when reducing along the same dimension (i.e. the NHWC case should give [8,18,30,42,60,]).
Even using cudnnSetTensor4dDescriptorEx to set the strides explicitly doesn't seem to work, using this calculation for each stride:
int ns = c*w*h;
int cs = 1;
int hs = c*w;
int ws = c;
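That is, something like this, using the names from _reduce above (a sketch of what I tried; the Ex variant takes no format argument, so the layout comes entirely from the strides):
gpuErrchk( cudnnSetTensor4dDescriptorEx(inputDescriptor,
dType,
n, c, h, w,
ns, cs, hs, ws) );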
Looking at the examples that ship with the CuDNN library download, they use cudnnSetTensorNdDescriptor rather than cudnnSetTensor4dDescriptor. However, the documentation of cudnnSetTensorNdDescriptor states that:
When working with lower dimensional data, it is recommended that the user create a 4D tensor, and set the size along unused dimensions to 1.
And given that you need to compute the strides yourself for cudnnSetTensorNdDescriptor, it is preferable to use cudnnSetTensor4dDescriptor.
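For completeness, the Nd call would look roughly like this for the same NHWC input (dimensions are still given in N, C, H, W order and the layout is expressed purely through the strides; this is a sketch, not code from the repro above):
int dims[4] = {n, c, h, w}; // {1, 2, 1, 5}
int strides[4] = {c*h*w, 1, c*w, c}; // NHWC strides: {10, 1, 10, 2}
gpuErrchk( cudnnSetTensorNdDescriptor(inputDescriptor,
dType,
4,
dims,
strides) );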
Is this a bug in CuDNN or is there something wrong with my code that I don't see?
The problem with the above code turns out to be a very silly bug on my part. From the documentation:
C = alpha * reduce op ( A ) + beta * C
And
The data types of the tensors A and C must match if of type double. In this case, alpha and beta and the computation enum of reduceTensorDesc are all assumed to be of type double.
The error is in the two lines of code:
float alpha = 1;
float beta = 0;
Which should be:
T alpha = 1;
T beta = 0;
The two float values get read as if they were doubles, which yields garbage values, and the result of the reduce operation is then multiplied by that garbage.
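To see why the value is garbage, here is a small standalone sketch (not part of the repro) that reads the bytes of a float 1.0f, plus whatever happens to sit next to it, back as a double. In the real code the neighbouring bytes are whatever the compiler put there; here they are pinned down with a second float:
#include <cstring>
#include <cstdio>
int main()
{
    float vals[2] = {1.0f, 0.0f}; // stand-ins for alpha and beta
    double asDouble;
    std::memcpy(&asDouble, vals, sizeof(double)); // read 8 bytes as one double
    std::printf("%g\n", asDouble); // ~5.3e-315 on a little-endian machine, nowhere near 1.0
    return 0;
}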