im trying to create a texture 3d from a part of a device array.
To do this, these are my steps:
The way im doing it it creates no compiler errors, but when i run cuda-memcheck it's failing when im trying to fetch data from the Texture.
Invalid global read of size 8 .. Address 0x10dfaf3a0 is out of bounds
Thats why i'm guessing i declared the texture Array wrong. here is how i access the texture:
tex3D(NoiseTextures[i],x,y,z)
The way im doing the steps mentioned above:
1.Malloc Device Array
cudaMalloc((void **)&d_Noise, sqrSizeNoise*nNoise*sizeof(float));
2.Write Device Array
curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen,Seed);
curandGenerateUniform(gen, d_Noise, sqrSizeNoise*nNoise);
curandDestroyGenerator(gen);
3+4.Creating the Cuda Array and binding it to the texture (Im guessing the mistake is here)
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();//cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_cuArr;
cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoise,SizeNoise,SizeNoise), 0);
cudaMemcpy3DParms copyParams = {0};
//Loop for every separated Noise Texture (nNoise = 4)
for(int i = 0; i < nNoise; i++){
//initialize the textures
NoiseTextures[i] = texture<float, 3, cudaReadModeElementType>(1,cudaFilterModeLinear,cudaAddressModeWrap,channelDesc);
//Array creation
//+(sqrSizeNoise*i) is to separate the created Noise Array into smaller parts with the size of SizeNoise^3
copyParams.srcPtr = make_cudaPitchedPtr(d_Noise+(sqrSizeNoise*i), SizeNoise*sizeof(float), SizeNoise, SizeNoise);
copyParams.dstArray = d_cuArr;
copyParams.extent = make_cudaExtent(SizeNoise,SizeNoise,SizeNoise);
copyParams.kind = cudaMemcpyDeviceToDevice;
checkCudaErrors(cudaMemcpy3D(©Params));
//Array creation End
//new Bind
// set texture parameters
NoiseTextures[i].normalized = true; // access with normalized texture coordinates
NoiseTextures[i].filterMode = cudaFilterModeLinear; // linear interpolation
NoiseTextures[i].addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
NoiseTextures[i].addressMode[1] = cudaAddressModeWrap;
NoiseTextures[i].addressMode[2] = cudaAddressModeWrap;
// bind array to 3D texture
checkCudaErrors(cudaBindTextureToArray(NoiseTextures[i], d_cuArr, channelDesc));
//end Bind
}
cudaFreeArray(d_cuArr);
I've Pasted this code snippet to Pastebin so its easier to look at with colors etc. http://pastebin.com/SM3dYd38
I hope I clearly described my problem. If not pls comment!
Can you help me with this? Thanks for reading,
Cery
Edit: Here is a complete code so you can try it on your own machine:
#include <helper_cuda.h>
#include <helper_functions.h>
#include <helper_cuda_gl.h>
#include <texture_types.h>
#include <cuda_runtime.h>
#include <curand.h>
static texture<float, 3, cudaReadModeElementType> NoiseTextures[4];//texture Array
float *d_NoiseTest;//Device Array with random floats
int SizeNoiseTest = 32;
int sqrSizeNoiseTest = 32768;
void CreateTexture();
__global__ void AccesTexture(texture<float, 3, cudaReadModeElementType>* NoiseTextures)
{
int test = tex3D(NoiseTextures[0],threadIdx.x,threadIdx.y,threadIdx.z);//by using this the error occurs
}
int
main(int argc, char **argv)
{
CreateTexture();
}
void CreateTexture()
{
//curand Random Generator (needs compiler link -lcurand)
curandGenerator_t gen;
cudaMalloc((void **)&d_NoiseTest, sqrSizeNoiseTest*4*sizeof(float));//Allocation of device Array
curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen,1234ULL);
curandGenerateUniform(gen, d_NoiseTest, sqrSizeNoiseTest*4);//writing data to d_NoiseTest
curandDestroyGenerator(gen);
//cudaArray Descriptor
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
//cuda Array
cudaArray *d_cuArr;
cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest), 0);
cudaMemcpy3DParms copyParams = {0};
//Loop for every separated Noise Texture (4 = 4)
for(int i = 0; i < 4; i++){
//initialize the textures
NoiseTextures[i] = texture<float, 3, cudaReadModeElementType>(1,cudaFilterModeLinear,cudaAddressModeWrap,channelDesc);
//Array creation
//+(sqrSizeNoise*i) is to separate the created Noise Array into smaller parts with the size of SizeNoise^3
copyParams.srcPtr = make_cudaPitchedPtr(d_NoiseTest+(sqrSizeNoiseTest*i), SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
copyParams.dstArray = d_cuArr;
copyParams.extent = make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest);
copyParams.kind = cudaMemcpyDeviceToDevice;
checkCudaErrors(cudaMemcpy3D(©Params));
//Array creation End
//new Bind
// set texture parameters
NoiseTextures[i].normalized = true; // access with normalized texture coordinates
NoiseTextures[i].filterMode = cudaFilterModeLinear; // linear interpolation
NoiseTextures[i].addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
NoiseTextures[i].addressMode[1] = cudaAddressModeWrap;
NoiseTextures[i].addressMode[2] = cudaAddressModeWrap;
// bind array to 3D texture
checkCudaErrors(cudaBindTextureToArray(NoiseTextures[i], d_cuArr, channelDesc));
//end Bind
}
cudaFreeArray(d_cuArr);
AccesTexture<<<1,dim3(4,4,4)>>>(NoiseTextures);
}
You need to link -lcurand though. And include CUDA-6.0/samples/common/inc
Im now getting a different error in this code
code=11(cudaErrorInvalidValue) "cudaMemcpy3D(©Params)"
Even though it's the exact same code then my original. - Im starting to get completely confused. Thank you for your help
Here's a worked example showing the creation of an array of texture objects, roughly following the path of the code you provided. You can see, by comparing to the texture reference code I placed here, that the first set of texture reads from the first texture object (i.e. the first kernel call) are the same numerical values as the set of reads from the texture reference example (you may need to adjust the grid size of the two example codes to match).
Texture object usage requires compute capability 3.0 or higher.
example:
$ cat t507.cu
#include <helper_cuda.h>
#include <curand.h>
#define NUM_TEX 4
const int SizeNoiseTest = 32;
const int cubeSizeNoiseTest = SizeNoiseTest*SizeNoiseTest*SizeNoiseTest;
static cudaTextureObject_t texNoise[NUM_TEX];
__global__ void AccesTexture(cudaTextureObject_t my_tex)
{
float test = tex3D<float>(my_tex,(float)threadIdx.x,(float)threadIdx.y,(float)threadIdx.z);//by using this the error occurs
printf("thread: %d,%d,%d, value: %f\n", threadIdx.x, threadIdx.y, threadIdx.z, test);
}
void CreateTexture()
{
float *d_NoiseTest;//Device Array with random floats
cudaMalloc((void **)&d_NoiseTest, cubeSizeNoiseTest*sizeof(float));//Allocation of device Array
for (int i = 0; i < NUM_TEX; i++){
//curand Random Generator (needs compiler link -lcurand)
curandGenerator_t gen;
curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen,1235ULL+i);
curandGenerateUniform(gen, d_NoiseTest, cubeSizeNoiseTest);//writing data to d_NoiseTest
curandDestroyGenerator(gen);
//cudaArray Descriptor
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
//cuda Array
cudaArray *d_cuArr;
checkCudaErrors(cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest), 0));
cudaMemcpy3DParms copyParams = {0};
//Array creation
copyParams.srcPtr = make_cudaPitchedPtr(d_NoiseTest, SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
copyParams.dstArray = d_cuArr;
copyParams.extent = make_cudaExtent(SizeNoiseTest,SizeNoiseTest,SizeNoiseTest);
copyParams.kind = cudaMemcpyDeviceToDevice;
checkCudaErrors(cudaMemcpy3D(©Params));
//Array creation End
cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = d_cuArr;
cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = false;
texDescr.filterMode = cudaFilterModeLinear;
texDescr.addressMode[0] = cudaAddressModeClamp; // clamp
texDescr.addressMode[1] = cudaAddressModeClamp;
texDescr.addressMode[2] = cudaAddressModeClamp;
texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&texNoise[i], &texRes, &texDescr, NULL));}
}
int main(int argc, char **argv)
{
CreateTexture();
AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[0]);
AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[1]);
AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[2]);
checkCudaErrors(cudaPeekAtLastError());
checkCudaErrors(cudaDeviceSynchronize());
return 0;
}
compile with:
$ nvcc -arch=sm_30 -I/shared/apps/cuda/CUDA-v6.0.37/samples/common/inc -lcurand -o t507 t507.cu
output:
$ cuda-memcheck ./t507
========= CUDA-MEMCHECK
thread: 0,0,0, value: 0.310691
thread: 1,0,0, value: 0.627906
thread: 0,1,0, value: 0.638900
thread: 1,1,0, value: 0.665186
thread: 0,0,1, value: 0.167465
thread: 1,0,1, value: 0.565227
thread: 0,1,1, value: 0.397606
thread: 1,1,1, value: 0.503013
thread: 0,0,0, value: 0.809163
thread: 1,0,0, value: 0.795669
thread: 0,1,0, value: 0.808565
thread: 1,1,0, value: 0.847564
thread: 0,0,1, value: 0.853998
thread: 1,0,1, value: 0.688446
thread: 0,1,1, value: 0.733255
thread: 1,1,1, value: 0.649379
thread: 0,0,0, value: 0.040824
thread: 1,0,0, value: 0.087417
thread: 0,1,0, value: 0.301392
thread: 1,1,0, value: 0.298669
thread: 0,0,1, value: 0.161962
thread: 1,0,1, value: 0.316443
thread: 0,1,1, value: 0.452077
thread: 1,1,1, value: 0.477722
========= ERROR SUMMARY: 0 errors
In this case I'm using the same kernel, called multiple times, to read from the individual texture objects. It should be possible to pass multiple objects to the same kernel, however it is not advisable to have a single warp read from multiple textures, if that can be avoided in your code. The actual issue resides at the quad level, which I'd prefer not to get into. It's best if you can arrange your code so that a warp is reading from the same texture object, on any given cycle.
Note that for simplicity of presentation, this CreateTexture()
function overwrites previously allocated device pointers such as d_cuArr
, during the processing of the loop. This isn't illegal or a functional issue, but it raises the possibility of memory leaks.
I assume you can modify the code to handle deallocation of those if this is a concern. The purpose of this code is to demonstrate the method to get things working.