The code below is legal C++ (compiles clean with g++ -Wall):
#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif
#include <stdio.h>
template <class T>
struct Array
{
int width, height;
T *ptr;
};
#ifdef HAVE_CUDA
template<typename T, int KernelSize>
static __global__ void genConvolve_kernel(const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth )
{
if ((threadIdx.x == 4) && (threadIdx.y == 2))
printf("Hello world from CUDA!\n");
}
#endif
#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);
template <typename T, int KernelSize>
void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif
template <typename T, int KernelSize>
void genConvolve_cuda_deviceptrs( const T __restrict__ * inputImageArray , T __restrict__ * outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
{
#ifdef HAVE_CUDA
dim3 block(blockWidth,blockHeight);
dim3 grid(1,1);
genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
printf("Hello, world!\n");
#endif
}
template <typename T, int KernelSize>
void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
{
genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}
int main(int argc, char *argv[])
{
Array<float> a;
genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
cudaDeviceSynchronize();
#endif
return 0;
}
However, when I try to compile this with nvcc I get the errors:
nvcc t.cu
t.cu(39): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard
t.cu(39): warning: redefinition of default argument
t.cu(51): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard
t.cu(51): warning: redefinition of default argument
t.cu(53): error: template instantiation resulted in unexpected function type of "void (const float *, float *, int, int, int, int, int)" (the meaning of a name may have changed since the template declaration -- the type of the template is "void (const __restrict__ T *, __restrict__ T *, int, int, int, int, int)") detected during: instantiation of "genConvolve_cuda_deviceptrs" based on template arguments (53): here instantiation of "void genConvolve_cuda(const Array &, const Array &, Array &, int, int) [with T=float, KernelSize=3]" (60): here
(line numbers lightly offset as I clean-up the example before posting.)
The warnings and errors go away when I define -DMAKE_COMPILE; however, I really would like to specify the forward declarations in a header file, and to use restrict !
So two questions:
How to properly use
__restrict__
with template arguments?
After conferring with colleagues, it was pointed out to me that this __restrict__
usage:
const T __restrict__ * inputImageArray ...
is questionable. In order for __restrict__
to have any effect, it is expected to be placed between the asterisk and the pointer name:
const T * __restrict__ inputImageArray ...
(gcc reference, and CUDA reference)
In the non-standard usage you have shown, gcc seems to allow this but silently "drops" the intent; the effect of __restrict__
is not applied in that case. In this respect, it is true that CUDA differs from gcc behavior. However because it is questionable usage as described above, it's unlikely that nvcc
would be modified to "fix" this issue.
You can make the compile error disappear in the code you have shown if you switch to standard __restrict__
usage. This is recommended anyway if your intent is to declare to the compiler that these are in fact restricted pointers:
#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif
#include <stdio.h>
template <class T>
struct Array
{
int width, height;
T *ptr;
};
#ifdef HAVE_CUDA
template<typename T, int KernelSize>
static __global__ void genConvolve_kernel(const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth )
{
if ((threadIdx.x == 4) && (threadIdx.y == 2))
printf("Hello world from CUDA!\n");
}
#endif
#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);
template <typename T, int KernelSize>
void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif
template <typename T, int KernelSize>
void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16)
{
#ifdef HAVE_CUDA
dim3 block(blockWidth,blockHeight);
dim3 grid(1,1);
genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
printf("Hello, world!\n");
#endif
}
template <typename T, int KernelSize>
void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16)
{
genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}
int main(int argc, char *argv[])
{
Array<float> a;
genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
cudaDeviceSynchronize();
#endif
return 0;
}
The warnings remain; that appears to be a separate issue:
t986.cu(33): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard
t986.cu(33): warning: redefinition of default argument
t986.cu(45): warning: specifying a default argument when redeclaring an unreferenced function template is nonstandard
t986.cu(45): warning: redefinition of default argument
Those warnings can be made to disappear if the default (template) function arguments are included on the first declaration but not the subsequent declarations, as follows:
#ifdef MAKE_COMPILE
#define __restrict__ /* empty */
#define NO_FORWARD_DECLARATIONS
#endif
#include <stdio.h>
template <class T>
struct Array
{
int width, height;
T *ptr;
};
#ifdef HAVE_CUDA
template<typename T, int KernelSize>
static __global__ void genConvolve_kernel(const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth )
{
if ((threadIdx.x == 4) && (threadIdx.y == 2))
printf("Hello world from CUDA!\n");
}
#endif
#ifndef NO_FORWARD_DECLARATIONS
template <typename T, int KernelSize>
void genConvolve_cuda(const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth=16, int blockHeight=16);
template <typename T, int KernelSize>
void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth=16, int blockHeight=16);
#endif
template <typename T, int KernelSize>
void genConvolve_cuda_deviceptrs( const T * __restrict__ inputImageArray , T * __restrict__ outputImageArray , int inputWidth , int outputWidth , int outputHeight , int blockWidth, int blockHeight)
{
#ifdef HAVE_CUDA
dim3 block(blockWidth,blockHeight);
dim3 grid(1,1);
genConvolve_kernel<T,KernelSize><<<grid,block>>>(inputImageArray,outputImageArray,inputWidth,outputWidth);
#else
printf("Hello, world!\n");
#endif
}
template <typename T, int KernelSize>
void genConvolve_cuda( const Array<T> & kernelArray , const Array<T> & inputImageArray , Array<T> & outputImageArray , int blockWidth, int blockHeight)
{
genConvolve_cuda_deviceptrs<T,KernelSize>((const T *)inputImageArray.ptr,outputImageArray.ptr, inputImageArray.width, outputImageArray.width, outputImageArray.height, blockWidth, blockHeight);
}
int main(int argc, char *argv[])
{
Array<float> a;
genConvolve_cuda<float,3>(a,a,a);
#ifdef HAVE_CUDA
cudaDeviceSynchronize();
#endif
return 0;
}
although I agree that still differs from g++ behavior. The gnu tools may still be the unusual case here, however. The redefinition of default arguments is still unexpected, and both clang and cl.exe (microsoft) will have issues with it.