Search code examples
templatescudanvcc

nvcc handling of __restrict__ and template function default arguments


The code below is legal C++ (compiles clean with g++ -Wall):

#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif

#include <stdio.h>

template <class T>
struct Array
{
int width, height;
T *ptr;
};

#ifdef HAVE_CUDA
template<typename T, int KernelSize>
     static __global__ void genConvolve_kernel(const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth )
{
    if ((threadIdx.x == 4) && (threadIdx.y == 2))
       printf("Hello world from CUDA!\n");
}
#endif

#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
     void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif

template <typename T, int KernelSize>
     void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
{
#ifdef HAVE_CUDA
    dim3 block(blockWidth,blockHeight);
    dim3 grid(1,1);
    genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
    printf("Hello, world!\n");
#endif
}

template <typename T, int KernelSize>
     void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
{
    genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}

int main(int argc, char *argv[])
{
    Array<float> a;

    genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
    cudaDeviceSynchronize();
#endif

    return 0;
}

However, when I try to compile this with nvcc I get the errors:

nvcc t.cu

t.cu(39): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard

t.cu(39): warning: redefinition of default argument

t.cu(51): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard

t.cu(51): warning: redefinition of default argument

t.cu(53): error: template instantiation resulted in unexpected function type of "void (const float *, float *, int, int, int, int, int)" (the meaning of a name may have changed since the template declaration -- the type of the template is "void (const __restrict__ T *, __restrict__ T *, int, int, int, int, int)") detected during: instantiation of "genConvolve_cuda_deviceptrs" based on template arguments (53): here instantiation of "void genConvolve_cuda(const Array &, const Array &, Array &, int, int) [with T=float, KernelSize=3]" (60): here

(line numbers lightly offset as I clean-up the example before posting.)

The warnings and errors go away when I define -DMAKE_COMPILE; however, I really would like to specify the forward declarations in a header file, and to use restrict !

So two questions:

  1. How to specify forward declarations of template functions with NVCC when there are default function arguments (in my case blockWidth and blockHeight?)
  2. How to properly use __restrict__ with template arguments?

Solution

  • How to properly use __restrict__ with template arguments?

    After conferring with colleagues, it was pointed out to me that this __restrict__ usage:

    const T __restrict__ * inputImageArray ...
    

    is questionable. In order for __restrict__ to have any effect, it is expected to be placed between the asterisk and the pointer name:

    const T * __restrict__ inputImageArray ...
    

    (gcc reference, and CUDA reference)

    In the non-standard usage you have shown, gcc seems to allow this but silently "drops" the intent; the effect of __restrict__ is not applied in that case. In this respect, it is true that CUDA differs from gcc behavior. However because it is questionable usage as described above, it's unlikely that nvcc would be modified to "fix" this issue.

    You can make the compile error disappear in the code you have shown if you switch to standard __restrict__ usage. This is recommended anyway if your intent is to declare to the compiler that these are in fact restricted pointers:

    #ifdef MAKE_COMPILE
    #define __restrict__ /* empty */
    #define NO_FORWARD_DECLARATIONS
    #endif
    
    #include <stdio.h>
    
    template <class T>
    struct Array
    {
    int width, height;
    T *ptr;
    };
    
    #ifdef HAVE_CUDA
    template<typename T, int KernelSize>
         static __global__ void genConvolve_kernel(const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth )
    {
        if ((threadIdx.x == 4) && (threadIdx.y == 2))
           printf("Hello world from CUDA!\n");
    }
    #endif
    
    #ifndef NO_FORWARD_DECLARATIONS
    template <typename T, int KernelSize>
         void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);
    
    template <typename T, int KernelSize>
         void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
    #endif
    
    template <typename T, int KernelSize>
         void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
    {
    #ifdef HAVE_CUDA
        dim3 block(blockWidth,blockHeight);
        dim3 grid(1,1);
        genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
    #else
        printf("Hello, world!\n");
    #endif
    }
    
    template <typename T, int KernelSize>
         void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
    {
        genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
    }
    
    int main(int argc, char *argv[])
    {
        Array<float> a;
    
        genConvolve_cuda<float,3>(a,a,a);
    #ifdef HAVE_CUDA
        cudaDeviceSynchronize();
    #endif
    
        return 0;
    }
    

    The warnings remain; that appears to be a separate issue:

    t986.cu(33): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard

    t986.cu(33): warning: redefinition of default argument

    t986.cu(45): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard

    t986.cu(45): warning: redefinition of default argument

    Those warnings can be made to disappear if the default (template) function arguments are included on the first declaration but not the subsequent declarations, as follows:

    #ifdef MAKE_COMPILE
    #define __restrict__ /* empty */
    #define NO_FORWARD_DECLARATIONS
    #endif
    
    #include <stdio.h>
    
    template <class T>
    struct Array
    {
    int width, height;
    T *ptr;
    };
    
    #ifdef HAVE_CUDA
    template<typename T, int KernelSize>
         static __global__ void genConvolve_kernel(const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth )
    {
        if ((threadIdx.x == 4) && (threadIdx.y == 2))
           printf("Hello world from CUDA!\n");
    }
    #endif
    
    #ifndef NO_FORWARD_DECLARATIONS
    template <typename T, int KernelSize>
         void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);
    
    template <typename T, int KernelSize>
         void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
    #endif
    
    template <typename T, int KernelSize>
         void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth, int blockHeight)
    {
    #ifdef HAVE_CUDA
        dim3 block(blockWidth,blockHeight);
        dim3 grid(1,1);
        genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
    #else
        printf("Hello, world!\n");
    #endif
    }
    
    template <typename T, int KernelSize>
         void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth, int blockHeight)
    {
        genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
    }
    
    int main(int argc, char *argv[])
    {
        Array<float> a;
    
        genConvolve_cuda<float,3>(a,a,a);
    #ifdef HAVE_CUDA
        cudaDeviceSynchronize();
    #endif
    
        return 0;
    }
    

    although I agree that still differs from g++ behavior. The gnu tools may still be the unusual case here, however. The redefinition of default arguments is still unexpected, and both clang and cl.exe (microsoft) will have issues with it.