Search code examples
c++carrayscudatextures

Cuda Create 3d texture and cudaArray(3d) from device memory


im trying to create a texture 3d from a part of a device array.

To do this, these are my steps:

  1. malloc Device Array
  2. Write Device Array
  3. Create CudaArray (3D)
  4. Bind Texture to CudaArray

The way im doing it it creates no compiler errors, but when i run cuda-memcheck it's failing when im trying to fetch data from the Texture.

Invalid global read of size 8 .. Address 0x10dfaf3a0 is out of bounds

Thats why i'm guessing i declared the texture Array wrong. here is how i access the texture:

tex3D(NoiseTextures[i],x,y,z)

The way im doing the steps mentioned above:

1.Malloc Device Array

cudaMalloc((void **)&d_Noise, sqrSizeNoise*nNoise*sizeof(float));

2.Write Device Array

curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen,Seed);
curandGenerateUniform(gen, d_Noise, sqrSizeNoise*nNoise);
curandDestroyGenerator(gen);

3+4.Creating the Cuda Array and binding it to the texture (Im guessing the mistake is here)

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();//cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *d_cuArr;
cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoise,SizeNoise,SizeNoise), 0);
cudaMemcpy3DParms copyParams = {0};

//Loop for every separated Noise Texture (nNoise = 4)
for(int i = 0; i < nNoise; i++){

    //initialize the textures
    NoiseTextures[i] = texture<float, 3, cudaReadModeElementType>(1,cudaFilterModeLinear,cudaAddressModeWrap,channelDesc);

    //Array creation
    //+(sqrSizeNoise*i) is to separate the created Noise Array into smaller parts with the size of SizeNoise^3
    copyParams.srcPtr   = make_cudaPitchedPtr(d_Noise+(sqrSizeNoise*i), SizeNoise*sizeof(float), SizeNoise, SizeNoise);
    copyParams.dstArray = d_cuArr;
    copyParams.extent   = make_cudaExtent(SizeNoise,SizeNoise,SizeNoise);
    copyParams.kind     = cudaMemcpyDeviceToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));
    //Array creation End

    //new Bind
    // set texture parameters
    NoiseTextures[i].normalized = true;                      // access with normalized texture coordinates
    NoiseTextures[i].filterMode = cudaFilterModeLinear;      // linear interpolation
    NoiseTextures[i].addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
    NoiseTextures[i].addressMode[1] = cudaAddressModeWrap;
    NoiseTextures[i].addressMode[2] = cudaAddressModeWrap;

    // bind array to 3D texture
    checkCudaErrors(cudaBindTextureToArray(NoiseTextures[i], d_cuArr, channelDesc));
    //end Bind
}
cudaFreeArray(d_cuArr);

I've Pasted this code snippet to Pastebin so its easier to look at with colors etc. http://pastebin.com/SM3dYd38

I hope I clearly described my problem. If not pls comment!

Can you help me with this? Thanks for reading,

Cery

Edit: Here is a complete code so you can try it on your own machine:

#include <helper_cuda.h>  
#include <helper_functions.h>
#include <helper_cuda_gl.h>
#include <texture_types.h>
#include <cuda_runtime.h>
#include <curand.h>

static texture<float, 3, cudaReadModeElementType> NoiseTextures[4];//texture Array
float *d_NoiseTest;//Device Array with random floats
int SizeNoiseTest = 32;
int sqrSizeNoiseTest = 32768;

void CreateTexture();

__global__ void AccesTexture(texture<float, 3, cudaReadModeElementType>* NoiseTextures)
{
        int test = tex3D(NoiseTextures[0],threadIdx.x,threadIdx.y,threadIdx.z);//by using this the error occurs
}

int
main(int argc, char **argv)
{
        CreateTexture();
}
void CreateTexture()
{
        //curand Random Generator (needs compiler link -lcurand)
        curandGenerator_t gen;
        cudaMalloc((void **)&d_NoiseTest, sqrSizeNoiseTest*4*sizeof(float));//Allocation of device Array
        curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(gen,1234ULL);
        curandGenerateUniform(gen, d_NoiseTest, sqrSizeNoiseTest*4);//writing data to d_NoiseTest
        curandDestroyGenerator(gen);

        //cudaArray Descriptor
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaArray *d_cuArr;
        cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest), 0);
        cudaMemcpy3DParms copyParams = {0};

        //Loop for every separated Noise Texture (4 = 4)
        for(int i = 0; i < 4; i++){

                //initialize the textures
                NoiseTextures[i] = texture<float, 3, cudaReadModeElementType>(1,cudaFilterModeLinear,cudaAddressModeWrap,channelDesc);

                //Array creation
                //+(sqrSizeNoise*i) is to separate the created Noise Array into smaller parts with the size of SizeNoise^3
                copyParams.srcPtr   = make_cudaPitchedPtr(d_NoiseTest+(sqrSizeNoiseTest*i), SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
                copyParams.dstArray = d_cuArr;
                copyParams.extent   = make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest);
                copyParams.kind     = cudaMemcpyDeviceToDevice;
                checkCudaErrors(cudaMemcpy3D(&copyParams));
                //Array creation End

                //new Bind
                // set texture parameters
                NoiseTextures[i].normalized = true;                      // access with normalized texture coordinates
                NoiseTextures[i].filterMode = cudaFilterModeLinear;      // linear interpolation
                NoiseTextures[i].addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
                NoiseTextures[i].addressMode[1] = cudaAddressModeWrap;
                NoiseTextures[i].addressMode[2] = cudaAddressModeWrap;

                // bind array to 3D texture
                checkCudaErrors(cudaBindTextureToArray(NoiseTextures[i], d_cuArr, channelDesc));
                //end Bind
        }
        cudaFreeArray(d_cuArr);

        AccesTexture<<<1,dim3(4,4,4)>>>(NoiseTextures);
}

You need to link -lcurand though. And include CUDA-6.0/samples/common/inc

Im now getting a different error in this code

code=11(cudaErrorInvalidValue) "cudaMemcpy3D(&copyParams)"

Even though it's the exact same code then my original. - Im starting to get completely confused. Thank you for your help


Solution

  • Here's a worked example showing the creation of an array of texture objects, roughly following the path of the code you provided. You can see, by comparing to the texture reference code I placed here, that the first set of texture reads from the first texture object (i.e. the first kernel call) are the same numerical values as the set of reads from the texture reference example (you may need to adjust the grid size of the two example codes to match).

    Texture object usage requires compute capability 3.0 or higher.

    example:

    $ cat t507.cu
    #include <helper_cuda.h>
    #include <curand.h>
    #define NUM_TEX 4
    
    const int SizeNoiseTest = 32;
    const int cubeSizeNoiseTest = SizeNoiseTest*SizeNoiseTest*SizeNoiseTest;
    static cudaTextureObject_t texNoise[NUM_TEX];
    
    __global__ void AccesTexture(cudaTextureObject_t my_tex)
    {
            float test = tex3D<float>(my_tex,(float)threadIdx.x,(float)threadIdx.y,(float)threadIdx.z);//by using this the error occurs
            printf("thread: %d,%d,%d, value: %f\n", threadIdx.x, threadIdx.y, threadIdx.z, test);
    }
    
    void CreateTexture()
    {
    
        float *d_NoiseTest;//Device Array with random floats
        cudaMalloc((void **)&d_NoiseTest, cubeSizeNoiseTest*sizeof(float));//Allocation of device Array
        for (int i = 0; i < NUM_TEX; i++){
            //curand Random Generator (needs compiler link -lcurand)
            curandGenerator_t gen;
            curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
            curandSetPseudoRandomGeneratorSeed(gen,1235ULL+i);
            curandGenerateUniform(gen, d_NoiseTest, cubeSizeNoiseTest);//writing data to d_NoiseTest
            curandDestroyGenerator(gen);
    
            //cudaArray Descriptor
            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
            //cuda Array
            cudaArray *d_cuArr;
            checkCudaErrors(cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest), 0));
            cudaMemcpy3DParms copyParams = {0};
    
    
            //Array creation
            copyParams.srcPtr   = make_cudaPitchedPtr(d_NoiseTest, SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
            copyParams.dstArray = d_cuArr;
            copyParams.extent   = make_cudaExtent(SizeNoiseTest,SizeNoiseTest,SizeNoiseTest);
            copyParams.kind     = cudaMemcpyDeviceToDevice;
            checkCudaErrors(cudaMemcpy3D(&copyParams));
            //Array creation End
    
            cudaResourceDesc    texRes;
            memset(&texRes, 0, sizeof(cudaResourceDesc));
            texRes.resType = cudaResourceTypeArray;
            texRes.res.array.array  = d_cuArr;
            cudaTextureDesc     texDescr;
            memset(&texDescr, 0, sizeof(cudaTextureDesc));
            texDescr.normalizedCoords = false;
            texDescr.filterMode = cudaFilterModeLinear;
            texDescr.addressMode[0] = cudaAddressModeClamp;   // clamp
            texDescr.addressMode[1] = cudaAddressModeClamp;
            texDescr.addressMode[2] = cudaAddressModeClamp;
            texDescr.readMode = cudaReadModeElementType;
            checkCudaErrors(cudaCreateTextureObject(&texNoise[i], &texRes, &texDescr, NULL));}
    }
    
    int main(int argc, char **argv)
    {
            CreateTexture();
            AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[0]);
            AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[1]);
            AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[2]);
            checkCudaErrors(cudaPeekAtLastError());
            checkCudaErrors(cudaDeviceSynchronize());
            return 0;
    }
    

    compile with:

    $ nvcc -arch=sm_30 -I/shared/apps/cuda/CUDA-v6.0.37/samples/common/inc -lcurand -o t507 t507.cu
    

    output:

    $ cuda-memcheck ./t507
    ========= CUDA-MEMCHECK
    thread: 0,0,0, value: 0.310691
    thread: 1,0,0, value: 0.627906
    thread: 0,1,0, value: 0.638900
    thread: 1,1,0, value: 0.665186
    thread: 0,0,1, value: 0.167465
    thread: 1,0,1, value: 0.565227
    thread: 0,1,1, value: 0.397606
    thread: 1,1,1, value: 0.503013
    thread: 0,0,0, value: 0.809163
    thread: 1,0,0, value: 0.795669
    thread: 0,1,0, value: 0.808565
    thread: 1,1,0, value: 0.847564
    thread: 0,0,1, value: 0.853998
    thread: 1,0,1, value: 0.688446
    thread: 0,1,1, value: 0.733255
    thread: 1,1,1, value: 0.649379
    thread: 0,0,0, value: 0.040824
    thread: 1,0,0, value: 0.087417
    thread: 0,1,0, value: 0.301392
    thread: 1,1,0, value: 0.298669
    thread: 0,0,1, value: 0.161962
    thread: 1,0,1, value: 0.316443
    thread: 0,1,1, value: 0.452077
    thread: 1,1,1, value: 0.477722
    ========= ERROR SUMMARY: 0 errors
    

    In this case I'm using the same kernel, called multiple times, to read from the individual texture objects. It should be possible to pass multiple objects to the same kernel, however it is not advisable to have a single warp read from multiple textures, if that can be avoided in your code. The actual issue resides at the quad level, which I'd prefer not to get into. It's best if you can arrange your code so that a warp is reading from the same texture object, on any given cycle.

    Note that for simplicity of presentation, this CreateTexture() function overwrites previously allocated device pointers such as d_cuArr, during the processing of the loop. This isn't illegal or a functional issue, but it raises the possibility of memory leaks.

    I assume you can modify the code to handle deallocation of those if this is a concern. The purpose of this code is to demonstrate the method to get things working.