Search code examples
cudatextures

Pass a jpeg image contents as a CUDA Texture


Would like to pass the content from jpeg file (3 byte RGB) as a texture to a CUDA kernel but getting compilation error

a pointer to a bound function may only be used to call the function

on value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f; and the rest of the tex2D() calls.

What may be the reason(s) for the error?

Host side code where the texture is created:

cudaArray* cudaArray;
    cudaTextureObject_t textureObject{};
    {
        const static uint32_t bytesPerPixel{ 3u };
        uint8_t* pHostData;
        int32_t textureWidth, textureHeight;
        uint32_t bytesPerScanline;
        cudaChannelFormatDesc channelFormatDesc;
        cudaResourceDesc  resourceDesc{};
        cudaTextureDesc textureDesc{};
        int32_t componentsPerPixel = bytesPerPixel;
        pHostData = stbi_load(textureFilename.c_str(), &textureWidth, &textureHeight, &componentsPerPixel, componentsPerPixel);
        if (nullptr == pHostData) {
            std::cerr << "ERROR: Could not load texture image file '" << textureFilename << std::endl;
            return;
        }
        bytesPerScanline = bytesPerPixel * textureWidth;

        channelFormatDesc = cudaCreateChannelDesc<uint8_t>();
        checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
        checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
        
        resourceDesc.resType = cudaResourceTypeArray;
        resourceDesc.res.array.array = cudaArray;

        textureDesc.normalizedCoords = true;
        textureDesc.filterMode = cudaFilterModePoint;
        textureDesc.addressMode[0] = cudaAddressModeWrap;
        textureDesc.addressMode[1] = cudaAddressModeWrap;
        textureDesc.readMode = cudaReadModeElementType;
        checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
        STBI_FREE(pHostData);
    }

Device side code:

class imageTexture {
public:
    __device__ imageTexture(cudaTextureObject_t tex) :_texture(tex) {}
    __device__ virtual vec3 value(float u, float v, const vec3& p) const {
        vec3 value;
        u *= 3;
        value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        u++;
        value.y = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        u++;
        value.z = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        return value;
    }
private:
    cudaTextureObject_t _texture;
};

Changed device side function, but the error persists:

class imageTexture :public textureX {
public:
    __device__ imageTexture(cudaTextureObject_t tex) :_text(tex) {}
    __device__ virtual vec3 value(float u, float v, const vec3& p) const override {
        vec3 val;
        u *= 3;
        val.x = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        u++;
        val.y = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        u++;
        val.z = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        return val;
    }
private:
    cudaTextureObject_t _text;
};

I have written a new test program and planning to build up from there. The idea is have each thread read 3 values from texture and write it back to a buffer. Only the first triplet is correct. Is there anything inconsistent in my texture lookups with this:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>

#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>

#include <stdio.h>

#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
    if (result) {
        std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
        std::cerr << cudaGetErrorString(result) << std::endl;
        // Make sure we call CUDA Device Reset before exiting
        cudaDeviceReset();
        exit(99);
    }
}

__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
    uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    if ((x < width) && (y < height)) {
        float u = (float)x / (float)width;
        float v = (float)y / (float)height;
        pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, 3*u, v);
        pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, 3*u + 1, v);
        pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, 3*u + 2, v);
    }
}

void cudaTex() {
    const uint32_t bytesPerPixel{ 3u };
    const uint32_t textureWidth = 1024u;
    const uint32_t textureHeight = 512u;
    uint32_t bytesPerScanline;

    bytesPerScanline = bytesPerPixel * textureWidth;

    cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaArray* cudaArray;
    checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));

    uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
    std::srand(std::time(nullptr));
    for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
        pHostData[idx] = std::rand();
    checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));

    cudaResourceDesc resourceDesc{};
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = cudaArray;

    cudaTextureDesc textureDesc{};
    textureDesc.normalizedCoords = false;
    textureDesc.filterMode = cudaFilterModePoint;
    textureDesc.addressMode[0] = cudaAddressModeWrap;
    textureDesc.addressMode[1] = cudaAddressModeWrap;
    textureDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t textureObject{};
    checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));

    dim3 dimBlock(8u, 8u, 1u);
    dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
    uint8_t* dOutput{ nullptr };
    checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
    texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));

    for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
        for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
            if (hOutput[jdx] != pHostData[jdx])
                std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
        }
        hOutput += bytesPerScanline;
        pHostData += bytesPerScanline;
    }

    checkCudaErrors(cudaDestroyTextureObject(textureObject));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeArray(cudaArray));

    delete[] hOutput;
    delete[] pHostData;
}

int main() {
    cudaTex();
    return 0;
}

Switching to integer coordinated in the kernel solved the problem

Resolution of the Original Problem

It turned out that the

a pointer to a bound function may only be used to call the function

error was caused by vec3 class having a getter function x() and not a member variable named x. So the code was trying use the getter function as an l-value!!!


Solution

  • There are several problems with the code you have now posted:

    1. after the discussion in the comments, hopefully you can figure out what is wrong with this line of code:

      cudaArray* cudaArray;
      
    2. Your kernel code appears to be trying pass normalized float coordinates but doing it incorrectly. There are several issues here: your x normalization is considering textureWidth but it should be done over 3*textureWidth (i.e. bytesPerScanline). Although you are calling the width of your texture textureWidth, really it is 3*textureWidth. Also, texturing in this fashion is typically offset by 0.5. Finally, you are doing this:

      textureDesc.normalizedCoords = false;
      

      but if you want to use float coordinates (seems to be what you want) you shoudl do:

      textureDesc.normalizedCoords = true;
      
    3. After you fix all that, you'll run into a non-CUDA issue. You're modifying these pointers:

       hOutput += bytesPerScanline;
       pHostData += bytesPerScanline;
      

      then trying to delete them after modification:

      delete[] hOutput;
      delete[] pHostData;
      

      that won't work correctly.

    Here's a modified code that has the above issues addressed, it seems to run correctly for me:

    $ cat t7.cu
    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>
    #include <curand_kernel.h>
    
    #include <iostream>
    #include <string>
    #include <chrono>
    #include <cmath>
    #include <ctime>
    #include <cstdint>
    
    #include <stdio.h>
    
    #define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
    
    void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
        if (result) {
            std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
            std::cerr << cudaGetErrorString(result) << std::endl;
            // Make sure we call CUDA Device Reset before exiting
            cudaDeviceReset();
            exit(99);
        }
    }
    
    __global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
        uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
        uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
        const float pix_offset = 0.5;
        if ((x < width) && (y < height)) {
            float u = (float)(3*x+pix_offset) / (float)(3*width);
            float v = (float)y / (float)height;
            pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, u, v);
            u = (float)(3*x+1+pix_offset) / (float)(3*width);
            pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, u, v);
            u = (float)(3*x+2+pix_offset) / (float)(3*width);
            pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, u, v);
        }
    }
    
    void cudaTex() {
        const uint32_t bytesPerPixel{ 3u };
        const uint32_t textureWidth = 1024u;
        const uint32_t textureHeight = 512u;
        uint32_t bytesPerScanline;
    
        bytesPerScanline = bytesPerPixel * textureWidth;
    
        cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
        cudaArray* cArray;
        checkCudaErrors(cudaMallocArray(&cArray, &channelFormatDesc, bytesPerScanline, textureHeight));
        uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
        std::srand(std::time(nullptr));
        for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
            pHostData[idx] = std::rand();
        checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
    
        cudaResourceDesc resourceDesc{};
        resourceDesc.resType = cudaResourceTypeArray;
        resourceDesc.res.array.array = cArray;
    
        cudaTextureDesc textureDesc{};
        textureDesc.normalizedCoords = true;
        textureDesc.filterMode = cudaFilterModePoint;
        textureDesc.addressMode[0] = cudaAddressModeWrap;
        textureDesc.addressMode[1] = cudaAddressModeWrap;
        textureDesc.readMode = cudaReadModeElementType;
    
        cudaTextureObject_t textureObject{};
        checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
    
        dim3 dimBlock(8u, 8u, 1u);
        dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
        uint8_t* dOutput{ nullptr };
        checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
        texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaDeviceSynchronize());
        uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
        checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
        uint8_t *my_hOutput = hOutput;
        uint8_t *my_pHostData = pHostData;
        for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
            for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
                if (hOutput[jdx] != pHostData[jdx]){
                    std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
                return;}
            }
            hOutput += bytesPerScanline;
            pHostData += bytesPerScanline;
        }
        checkCudaErrors(cudaDestroyTextureObject(textureObject));
        checkCudaErrors(cudaFree(dOutput));
        checkCudaErrors(cudaFreeArray(cArray));
    
        delete[] my_hOutput;
        delete[] my_pHostData;
    }
    
    int main() {
        cudaTex();
        return 0;
    }
    $ nvcc -o t7 t7.cu -std=c++11
    t7.cu: In function ‘void cudaTex()’:
    t7.cu:56:12: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
         checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
                ^
    t7.cu:56:131: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
         checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
                                                                                                                                       ^
    $ cuda-memcheck ./t7
    ========= CUDA-MEMCHECK
    ========= ERROR SUMMARY: 0 errors
    $
    

    I'm not suggesting the above code is defect-free. It's mostly your code. I'm just pointing out some things I found. You can read about how to address the deprecation warning here.