Search code examples
c++cudatextures

CUDA: how to create 2D texture object?


I'm trying to create 2D texture object, 4x4 uint8_t. Here is the code:

__global__ void kernel(cudaTextureObject_t tex)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    uint8_t val = tex2D<uint8_t>(tex, x, y);
    printf("%d, ", val);
    return;
}

int main(int argc, char **argv)
{
    cudaTextureObject_t tex;
    uint8_t dataIn[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    uint8_t* dataDev = 0;
    cudaMalloc((void**)&dataDev, 16);
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = dataDev;
    resDesc.res.pitch2D.desc.x = 8;
    resDesc.res.pitch2D.desc.y = 8;
    resDesc.res.pitch2D.desc.f = cudaChannelFormatKindUnsigned;
    resDesc.res.pitch2D.width = 4;
    resDesc.res.pitch2D.height = 4;
    resDesc.res.pitch2D.pitchInBytes = 4;
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    cudaMemcpy(dataDev, &dataIn[0], 16, cudaMemcpyHostToDevice);
    dim3 threads(4, 4);
    kernel<<<1, threads>>>(tex);
    cudaDeviceSynchronize();
    return 0;
}

I expect that the result will be something like this:

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,

i.e. all values of the texture object (order doesn't matter).

But the actual result is:

0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6,     

What am I doing wrong?


Solution

  • When you use the pitch2D variant for the texture operation, the underlying allocation is supposed to be a proper pitched allocation. I think typically people would create this with cudaMallocPitch. However the requirement stated is:

    cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to cudaDeviceProp::texturePitchAlignment.

    On my GPU, that last property is 32. I don't know about your GPU, but I bet that property is not 4 for your GPU. However you are specifying 4 here:

    resDesc.res.pitch2D.pitchInBytes = 4;
    

    Again, I think people would typically use a pitched allocation created with cudaMallocPitch for this. However it does appear to be possible to me to pass an ordinary linear allocation if the row-to-row dimension (in bytes) is divisible by texturePitchAlignment (32 in my case).

    Another change I made is to use cudaCreateChannelDesc<>() instead of manually setting the parameters like you did. This creates a different set of desc parameters and seems to affect the result also. It should not be difficult to study the differences.

    When I adjust your code to address those issues, I get results that seem sensible to me:

    $ cat t30.cu
    #include <stdio.h>
    #include <stdint.h>
    
    typedef uint8_t mt;  // use an integer type
    
    __global__ void kernel(cudaTextureObject_t tex)
    {
        int x = threadIdx.x;
        int y = threadIdx.y;
        mt val = tex2D<mt>(tex, x, y);
        printf("%d, ", val);
    }
    
    int main(int argc, char **argv)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        printf("texturePitchAlignment: %lu\n", prop.texturePitchAlignment);
        cudaTextureObject_t tex;
        const int num_rows = 4;
        const int num_cols = prop.texturePitchAlignment*1; // should be able to use a different multiplier here
        const int ts = num_cols*num_rows;
        const int ds = ts*sizeof(mt);
        mt dataIn[ts];
        for (int i = 0; i < ts; i++) dataIn[i] = i;
        mt* dataDev = 0;
        cudaMalloc((void**)&dataDev, ds);
        cudaMemcpy(dataDev, dataIn, ds, cudaMemcpyHostToDevice);
        struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypePitch2D;
        resDesc.res.pitch2D.devPtr = dataDev;
        resDesc.res.pitch2D.width = num_cols;
        resDesc.res.pitch2D.height = num_rows;
        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
        resDesc.res.pitch2D.pitchInBytes = num_cols*sizeof(mt);
        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
        dim3 threads(4, 4);
        kernel<<<1, threads>>>(tex);
        cudaDeviceSynchronize();
        printf("\n");
        return 0;
    }
    $ nvcc -o t30 t30.cu
    $ cuda-memcheck ./t30
    ========= CUDA-MEMCHECK
    texturePitchAlignment: 32
    0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
    ========= ERROR SUMMARY: 0 errors
    $
    

    As asked in the comments, if I were going to do something similar to this but using cudaMallocPitch and cudaMemcpy2D, it could look something like this:

    $ cat t1421.cu
    #include <stdio.h>
    #include <stdint.h>
    
    typedef uint8_t mt;  // use an integer type
    
    __global__ void kernel(cudaTextureObject_t tex)
    {
        int x = threadIdx.x;
        int y = threadIdx.y;
        mt val = tex2D<mt>(tex, x, y);
        printf("%d, ", val);
    }
    
    int main(int argc, char **argv)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        printf("texturePitchAlignment: %lu\n", prop.texturePitchAlignment);
        cudaTextureObject_t tex;
        const int num_rows = 4;
        const int num_cols = prop.texturePitchAlignment*1; // should be able to use a different multiplier here
        const int ts = num_cols*num_rows;
        const int ds = ts*sizeof(mt);
        mt dataIn[ts];
        for (int i = 0; i < ts; i++) dataIn[i] = i;
        mt* dataDev = 0;
        size_t pitch;
        cudaMallocPitch((void**)&dataDev, &pitch,  num_cols*sizeof(mt), num_rows);
        cudaMemcpy2D(dataDev, pitch, dataIn, num_cols*sizeof(mt), num_cols*sizeof(mt), num_rows, cudaMemcpyHostToDevice);
        struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypePitch2D;
        resDesc.res.pitch2D.devPtr = dataDev;
        resDesc.res.pitch2D.width = num_cols;
        resDesc.res.pitch2D.height = num_rows;
        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
        resDesc.res.pitch2D.pitchInBytes = pitch;
        struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
        dim3 threads(4, 4);
        kernel<<<1, threads>>>(tex);
        cudaDeviceSynchronize();
        printf("\n");
        return 0;
    }
    $ nvcc -o t1421 t1421.cu
    $ cuda-memcheck ./t1421
    ========= CUDA-MEMCHECK
    texturePitchAlignment: 32
    0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
    ========= ERROR SUMMARY: 0 errors
    $
    

    Although what we have here are texture objects, its easy enough to demonstrate that similar issues occur with texture references. You cannot create an arbitrarily small 2D texture reference just as you cannot create an arbitrarily small 2D texture object. I'm not going to provide a demonstration of that also, as it would largely duplicate the above, and folks shouldn't be using texture references anymore for new development work - texture objects are the better approach.