Pass a jpeg image contents as a CUDA Texture

Would like to pass the content from jpeg file (3 byte RGB) as a texture to a CUDA kernel but getting compilation error

a pointer to a bound function may only be used to call the function

on value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f; and the rest of the tex2D() calls.

What may be the reason(s) for the error?

Host side code where the texture is created:

cudaArray* cudaArray;
    cudaTextureObject_t textureObject{};
    {
        const static uint32_t bytesPerPixel{ 3u };
        uint8_t* pHostData;
        int32_t textureWidth, textureHeight;
        uint32_t bytesPerScanline;
        cudaChannelFormatDesc channelFormatDesc;
        cudaResourceDesc  resourceDesc{};
        cudaTextureDesc textureDesc{};
        int32_t componentsPerPixel = bytesPerPixel;
        pHostData = stbi_load(textureFilename.c_str(), &textureWidth, &textureHeight, &componentsPerPixel, componentsPerPixel);
        if (nullptr == pHostData) {
            std::cerr << "ERROR: Could not load texture image file '" << textureFilename << std::endl;
            return;
        }
        bytesPerScanline = bytesPerPixel * textureWidth;

        channelFormatDesc = cudaCreateChannelDesc<uint8_t>();
        checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
        checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
        
        resourceDesc.resType = cudaResourceTypeArray;
        resourceDesc.res.array.array = cudaArray;

        textureDesc.normalizedCoords = true;
        textureDesc.filterMode = cudaFilterModePoint;
        textureDesc.addressMode[0] = cudaAddressModeWrap;
        textureDesc.addressMode[1] = cudaAddressModeWrap;
        textureDesc.readMode = cudaReadModeElementType;
        checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
        STBI_FREE(pHostData);
    }

Device side code:

class imageTexture {
public:
    __device__ imageTexture(cudaTextureObject_t tex) :_texture(tex) {}
    __device__ virtual vec3 value(float u, float v, const vec3& p) const {
        vec3 value;
        u *= 3;
        value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        u++;
        value.y = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        u++;
        value.z = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        return value;
    }
private:
    cudaTextureObject_t _texture;
};

Changed device side function, but the error persists:

class imageTexture :public textureX {
public:
    __device__ imageTexture(cudaTextureObject_t tex) :_text(tex) {}
    __device__ virtual vec3 value(float u, float v, const vec3& p) const override {
        vec3 val;
        u *= 3;
        val.x = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        u++;
        val.y = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        u++;
        val.z = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        return val;
    }
private:
    cudaTextureObject_t _text;
};

I have written a new test program and planning to build up from there. The idea is have each thread read 3 values from texture and write it back to a buffer. Only the first triplet is correct. Is there anything inconsistent in my texture lookups with this:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>

#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>

#include <stdio.h>

#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
    if (result) {
        std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
        std::cerr << cudaGetErrorString(result) << std::endl;
        // Make sure we call CUDA Device Reset before exiting
        cudaDeviceReset();
        exit(99);
    }
}

__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
    uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    if ((x < width) && (y < height)) {
        float u = (float)x / (float)width;
        float v = (float)y / (float)height;
        pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, 3*u, v);
        pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, 3*u + 1, v);
        pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, 3*u + 2, v);
    }
}

void cudaTex() {
    const uint32_t bytesPerPixel{ 3u };
    const uint32_t textureWidth = 1024u;
    const uint32_t textureHeight = 512u;
    uint32_t bytesPerScanline;

    bytesPerScanline = bytesPerPixel * textureWidth;

    cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaArray* cudaArray;
    checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));

    uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
    std::srand(std::time(nullptr));
    for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
        pHostData[idx] = std::rand();
    checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));

    cudaResourceDesc resourceDesc{};
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = cudaArray;

    cudaTextureDesc textureDesc{};
    textureDesc.normalizedCoords = false;
    textureDesc.filterMode = cudaFilterModePoint;
    textureDesc.addressMode[0] = cudaAddressModeWrap;
    textureDesc.addressMode[1] = cudaAddressModeWrap;
    textureDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t textureObject{};
    checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));

    dim3 dimBlock(8u, 8u, 1u);
    dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
    uint8_t* dOutput{ nullptr };
    checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
    texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));

    for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
        for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
            if (hOutput[jdx] != pHostData[jdx])
                std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
        }
        hOutput += bytesPerScanline;
        pHostData += bytesPerScanline;
    }

    checkCudaErrors(cudaDestroyTextureObject(textureObject));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeArray(cudaArray));

    delete[] hOutput;
    delete[] pHostData;
}

int main() {
    cudaTex();
    return 0;
}

Switching to integer coordinated in the kernel solved the problem

Resolution of the Original Problem

It turned out that the

a pointer to a bound function may only be used to call the function

error was caused by vec3 class having a getter function x() and not a member variable named x. So the code was trying use the getter function as an l-value!!!

Solution

There are several problems with the code you have now posted:

after the discussion in the comments, hopefully you can figure out what is wrong with this line of code:
```
cudaArray* cudaArray;
```
Your kernel code appears to be trying pass normalized float coordinates but doing it incorrectly. There are several issues here: your x normalization is considering textureWidth but it should be done over 3*textureWidth (i.e. bytesPerScanline). Although you are calling the width of your texture textureWidth, really it is 3*textureWidth. Also, texturing in this fashion is typically offset by 0.5. Finally, you are doing this:
```
textureDesc.normalizedCoords = false;
```
but if you want to use float coordinates (seems to be what you want) you shoudl do:
```
textureDesc.normalizedCoords = true;
```
After you fix all that, you'll run into a non-CUDA issue. You're modifying these pointers:
```
 hOutput += bytesPerScanline;
 pHostData += bytesPerScanline;
```
then trying to delete them after modification:
```
delete[] hOutput;
delete[] pHostData;
```
that won't work correctly.

Here's a modified code that has the above issues addressed, it seems to run correctly for me:

$ cat t7.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>

#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>

#include <stdio.h>

#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )

void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
    if (result) {
        std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
        std::cerr << cudaGetErrorString(result) << std::endl;
        // Make sure we call CUDA Device Reset before exiting
        cudaDeviceReset();
        exit(99);
    }
}

__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
    uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    const float pix_offset = 0.5;
    if ((x < width) && (y < height)) {
        float u = (float)(3*x+pix_offset) / (float)(3*width);
        float v = (float)y / (float)height;
        pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, u, v);
        u = (float)(3*x+1+pix_offset) / (float)(3*width);
        pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, u, v);
        u = (float)(3*x+2+pix_offset) / (float)(3*width);
        pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, u, v);
    }
}

void cudaTex() {
    const uint32_t bytesPerPixel{ 3u };
    const uint32_t textureWidth = 1024u;
    const uint32_t textureHeight = 512u;
    uint32_t bytesPerScanline;

    bytesPerScanline = bytesPerPixel * textureWidth;

    cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaArray* cArray;
    checkCudaErrors(cudaMallocArray(&cArray, &channelFormatDesc, bytesPerScanline, textureHeight));
    uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
    std::srand(std::time(nullptr));
    for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
        pHostData[idx] = std::rand();
    checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));

    cudaResourceDesc resourceDesc{};
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = cArray;

    cudaTextureDesc textureDesc{};
    textureDesc.normalizedCoords = true;
    textureDesc.filterMode = cudaFilterModePoint;
    textureDesc.addressMode[0] = cudaAddressModeWrap;
    textureDesc.addressMode[1] = cudaAddressModeWrap;
    textureDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t textureObject{};
    checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));

    dim3 dimBlock(8u, 8u, 1u);
    dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
    uint8_t* dOutput{ nullptr };
    checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
    texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
    uint8_t *my_hOutput = hOutput;
    uint8_t *my_pHostData = pHostData;
    for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
        for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
            if (hOutput[jdx] != pHostData[jdx]){
                std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
            return;}
        }
        hOutput += bytesPerScanline;
        pHostData += bytesPerScanline;
    }
    checkCudaErrors(cudaDestroyTextureObject(textureObject));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeArray(cArray));

    delete[] my_hOutput;
    delete[] my_pHostData;
}

int main() {
    cudaTex();
    return 0;
}
$ nvcc -o t7 t7.cu -std=c++11
t7.cu: In function ‘void cudaTex()’:
t7.cu:56:12: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
     checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
            ^
t7.cu:56:131: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
     checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
                                                                                                                                   ^
$ cuda-memcheck ./t7
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

I'm not suggesting the above code is defect-free. It's mostly your code. I'm just pointing out some things I found. You can read about how to address the deprecation warning here.