I want to have a 3d float array in CUDA, here is my code:
#define SIZE_X 128 //numbers in elements
#define SIZE_Y 128
#define SIZE_Z 128
typedef float VolumeType;
cudaExtent volumeSize = make_cudaExtent(SIZE_X, SIZE_Y, SIZE_Z); //The first argument should be SIZE_X*sizeof(VolumeType)??
float *d_volumeMem;
cutilSafeCall(cudaMalloc((void**)&d_volumeMem, SIZE_X*SIZE_Y*SIZE_Z*sizeof(float)));
.....//assign value to d_volumeMem in GPU
cudaArray *d_volumeArray = 0;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
cutilSafeCall( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) );
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)d_volumeMem, SIZE_X*sizeof(VolumeType), SIZE_X, SIZE_Y); //
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kin = cudaMemcpyDeviceToDevice;
cutilSafeCall( cudaMemcpy3D(©Params) );
Actually, my program runs well. But I'm not sure the result is right. Here is my problem, in the CUDA liberay, it said that the first parameter of make_cudaExtent is "Width in bytes" and the other two is height and depth in elements. So I think in my code above, the fifth line should be
cudaExtent volumeSize = make_cudaExtent(SIZE_X*sizeof(VolumeType), SIZE_Y, SIZE_Z);
But in this way, there would be error "invalid argument" in cutilSafeCall( cudaMemcpy3D(©Params) ); Why?
And another puzzle is the strcut cudaExtent, as CUDA library stated,its component width stands for "Width in elements when referring to array memory, in bytes when referring to linear memory". So I think in my code when I refer volumeSize.width it should be number in elements. However, if I use
cudaExtent volumeSize = make_cudaExtent(SIZE_X*sizeof(VolumeType), SIZE_Y, SIZE_Z);
The volumeSize.width would be SIZE_X*sizeof(VolumeType)(128*4), that is number in bytes instead of number in elements.
In many CUDA SDK, they use char as the VolumeType, so they just use SIZE_X as the first argument in make_cudaExtent. But mine is float, so, anyone could tell me which is the right way to create a cudaExtent if I need to use this to create a 3D array?? Thanks a lot!
Let's review what the documentation for cudaMemcpy3D
says:
The extent field defines the dimensions of the transferred area in elements. If a CUDA array is participating in the copy, the extent is defined in terms of that array's elements. If no CUDA array is participating in the copy then the extents are defined in elements of unsigned char.
and similarly the documentation for cudaMalloc3DArray
notes:
All values are specified in elements
So the extent you need to form for both calls needs to have the first dimension in elements (because one of the allocations in the cudaMemcpy3D
is an array).
But you potentially have a different problem in your code, because you are allocating the linear memory source d_volumeMem
using cudaMalloc
. cudaMemcpy3D
expects that linear source memory has been allocated with a compatible pitch. Your code is just using a linear allocation of size
SIZE_X*SIZE_Y*SIZE_Z*sizeof(float)
Now it might be that the dimensions you have chosen produces a compatible pitch for the hardware you are using, but it is not guaranteed that it will do so. I would recommend using cudaMalloc3D
to allocate the linear source memory as well. An expanded demonstration of this built around your little code snippet might look like this:
#include <cstdio>
typedef float VolumeType;
const size_t SIZE_X = 8;
const size_t SIZE_Y = 8;
const size_t SIZE_Z = 8;
const size_t width = sizeof(VolumeType) * SIZE_X;
texture<VolumeType, cudaTextureType3D, cudaReadModeElementType> tex;
__global__ void testKernel(VolumeType * output, int dimx, int dimy, int dimz)
{
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
int tidy = threadIdx.y + blockIdx.y * blockDim.y;
int tidz = threadIdx.z + blockIdx.z * blockDim.z;
float x = float(tidx)+0.5f;
float y = float(tidy)+0.5f;
float z = float(tidz)+0.5f;
size_t oidx = tidx + tidy*dimx + tidz*dimx*dimy;
output[oidx] = tex3D(tex, x, y, z);
}
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
template<typename T>
void init(char * devPtr, size_t pitch, int width, int height, int depth)
{
size_t slicePitch = pitch * height;
int v = 0;
for (int z = 0; z < depth; ++z) {
char * slice = devPtr + z * slicePitch;
for (int y = 0; y < height; ++y) {
T * row = (T *)(slice + y * pitch);
for (int x = 0; x < width; ++x) {
row[x] = T(v++);
}
}
}
}
int main(void)
{
VolumeType *h_volumeMem, *d_output, *h_output;
cudaExtent volumeSizeBytes = make_cudaExtent(width, SIZE_Y, SIZE_Z);
cudaPitchedPtr d_volumeMem;
gpuErrchk(cudaMalloc3D(&d_volumeMem, volumeSizeBytes));
size_t size = d_volumeMem.pitch * SIZE_Y * SIZE_Z;
h_volumeMem = (VolumeType *)malloc(size);
init<VolumeType>((char *)h_volumeMem, d_volumeMem.pitch, SIZE_X, SIZE_Y, SIZE_Z);
gpuErrchk(cudaMemcpy(d_volumeMem.ptr, h_volumeMem, size, cudaMemcpyHostToDevice));
cudaArray * d_volumeArray;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
cudaExtent volumeSize = make_cudaExtent(SIZE_X, SIZE_Y, SIZE_Z);
gpuErrchk( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) );
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = d_volumeMem;
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyDeviceToDevice;
gpuErrchk( cudaMemcpy3D(©Params) );
tex.normalized = false;
tex.filterMode = cudaFilterModeLinear;
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.addressMode[2] = cudaAddressModeWrap;
gpuErrchk(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
size_t osize = 64 * sizeof(VolumeType);
gpuErrchk(cudaMalloc((void**)&d_output, osize));
testKernel<<<1,dim3(4,4,4)>>>(d_output,4,4,4);
gpuErrchk(cudaPeekAtLastError());
h_output = (VolumeType *)malloc(osize);
gpuErrchk(cudaMemcpy(h_output, d_output, osize, cudaMemcpyDeviceToHost));
for(int i=0; i<64; i++)
fprintf(stdout, "%d %f\n", i, h_output[i]);
return 0;
}
You can confirm for yourself that the output of the textures reads matches the original source memory on the host.