How to use make_cudaExtent to define a cudaExtent correctly?

I want to have a 3d float array in CUDA, here is my code:

#define  SIZE_X 128 //numbers in elements
#define  SIZE_Y 128
#define  SIZE_Z 128
typedef float  VolumeType;
cudaExtent volumeSize = make_cudaExtent(SIZE_X, SIZE_Y, SIZE_Z); //The first argument should be SIZE_X*sizeof(VolumeType)??

float *d_volumeMem;
cutilSafeCall(cudaMalloc((void**)&d_volumeMem, SIZE_X*SIZE_Y*SIZE_Z*sizeof(float)));

.....//assign value to d_volumeMem in GPU

cudaArray *d_volumeArray = 0;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
cutilSafeCall( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) ); 
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)d_volumeMem, SIZE_X*sizeof(VolumeType), SIZE_X, SIZE_Y); //
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kin = cudaMemcpyDeviceToDevice;
cutilSafeCall( cudaMemcpy3D(&copyParams) );

Actually, my program runs well. But I'm not sure the result is right. Here is my problem, in the CUDA liberay, it said that the first parameter of make_cudaExtent is "Width in bytes" and the other two is height and depth in elements. So I think in my code above, the fifth line should be

cudaExtent volumeSize = make_cudaExtent(SIZE_X*sizeof(VolumeType), SIZE_Y, SIZE_Z);

But in this way, there would be error "invalid argument" in cutilSafeCall( cudaMemcpy3D(&copyParams) ); Why?

And another puzzle is the strcut cudaExtent, as CUDA library stated,its component width stands for "Width in elements when referring to array memory, in bytes when referring to linear memory". So I think in my code when I refer volumeSize.width it should be number in elements. However, if I use

 cudaExtent volumeSize = make_cudaExtent(SIZE_X*sizeof(VolumeType), SIZE_Y, SIZE_Z);

The volumeSize.width would be SIZE_X*sizeof(VolumeType)(128*4), that is number in bytes instead of number in elements.

In many CUDA SDK, they use char as the VolumeType, so they just use SIZE_X as the first argument in make_cudaExtent. But mine is float, so, anyone could tell me which is the right way to create a cudaExtent if I need to use this to create a 3D array?? Thanks a lot!

Solution

Let's review what the documentation for cudaMemcpy3D says:

The extent field defines the dimensions of the transferred area in elements. If a CUDA array is participating in the copy, the extent is defined in terms of that array's elements. If no CUDA array is participating in the copy then the extents are defined in elements of unsigned char.

and similarly the documentation for cudaMalloc3DArray notes:

All values are specified in elements

So the extent you need to form for both calls needs to have the first dimension in elements (because one of the allocations in the cudaMemcpy3D is an array).

But you potentially have a different problem in your code, because you are allocating the linear memory source d_volumeMem using cudaMalloc. cudaMemcpy3D expects that linear source memory has been allocated with a compatible pitch. Your code is just using a linear allocation of size

SIZE_X*SIZE_Y*SIZE_Z*sizeof(float)

Now it might be that the dimensions you have chosen produces a compatible pitch for the hardware you are using, but it is not guaranteed that it will do so. I would recommend using cudaMalloc3D to allocate the linear source memory as well. An expanded demonstration of this built around your little code snippet might look like this:

#include <cstdio>

typedef float  VolumeType;

const size_t SIZE_X = 8;
const size_t SIZE_Y = 8;
const size_t SIZE_Z = 8;
const size_t width = sizeof(VolumeType) * SIZE_X;

texture<VolumeType, cudaTextureType3D, cudaReadModeElementType> tex; 

__global__ void testKernel(VolumeType * output, int dimx, int dimy, int dimz)
{
    int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    int tidy = threadIdx.y + blockIdx.y * blockDim.y;
    int tidz = threadIdx.z + blockIdx.z * blockDim.z;

    float x = float(tidx)+0.5f;
    float y = float(tidy)+0.5f;
    float z = float(tidz)+0.5f;

    size_t oidx = tidx + tidy*dimx + tidz*dimx*dimy;
    output[oidx] = tex3D(tex, x, y, z);
}

inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

template<typename T>
void init(char * devPtr, size_t pitch, int width, int height, int depth)
{
    size_t slicePitch = pitch * height;
    int v = 0;
    for (int z = 0; z < depth; ++z) {
        char * slice = devPtr + z * slicePitch;
        for (int y = 0; y < height; ++y) {
            T * row = (T *)(slice + y * pitch);
            for (int x = 0; x < width; ++x) {
                row[x] = T(v++);
            }
        }
    }
}

int main(void)
{
    VolumeType *h_volumeMem, *d_output, *h_output;

    cudaExtent volumeSizeBytes = make_cudaExtent(width, SIZE_Y, SIZE_Z);
    cudaPitchedPtr d_volumeMem; 
    gpuErrchk(cudaMalloc3D(&d_volumeMem, volumeSizeBytes));

    size_t size = d_volumeMem.pitch * SIZE_Y * SIZE_Z;
    h_volumeMem = (VolumeType *)malloc(size);
    init<VolumeType>((char *)h_volumeMem, d_volumeMem.pitch, SIZE_X, SIZE_Y, SIZE_Z);
    gpuErrchk(cudaMemcpy(d_volumeMem.ptr, h_volumeMem, size, cudaMemcpyHostToDevice));

    cudaArray * d_volumeArray;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
    cudaExtent volumeSize = make_cudaExtent(SIZE_X, SIZE_Y, SIZE_Z);
    gpuErrchk( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) ); 

    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr = d_volumeMem;
    copyParams.dstArray = d_volumeArray;
    copyParams.extent = volumeSize;
    copyParams.kind = cudaMemcpyDeviceToDevice;
    gpuErrchk( cudaMemcpy3D(&copyParams) ); 

    tex.normalized = false;                      
    tex.filterMode = cudaFilterModeLinear;      
    tex.addressMode[0] = cudaAddressModeWrap;   
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.addressMode[2] = cudaAddressModeWrap;
    gpuErrchk(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));

    size_t osize = 64 * sizeof(VolumeType);
    gpuErrchk(cudaMalloc((void**)&d_output, osize));

    testKernel<<<1,dim3(4,4,4)>>>(d_output,4,4,4);
    gpuErrchk(cudaPeekAtLastError());

    h_output = (VolumeType *)malloc(osize);
    gpuErrchk(cudaMemcpy(h_output, d_output, osize, cudaMemcpyDeviceToHost));

    for(int i=0; i<64; i++)
        fprintf(stdout, "%d %f\n", i, h_output[i]);

    return 0;
}

You can confirm for yourself that the output of the textures reads matches the original source memory on the host.