Would like to pass the content from jpeg file (3 byte RGB) as a texture to a CUDA kernel but getting compilation error
a pointer to a bound function may only be used to call the function
on value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
and the rest of the tex2D()
calls.
What may be the reason(s) for the error?
Host side code where the texture is created:
cudaArray* cudaArray;
cudaTextureObject_t textureObject{};
{
const static uint32_t bytesPerPixel{ 3u };
uint8_t* pHostData;
int32_t textureWidth, textureHeight;
uint32_t bytesPerScanline;
cudaChannelFormatDesc channelFormatDesc;
cudaResourceDesc resourceDesc{};
cudaTextureDesc textureDesc{};
int32_t componentsPerPixel = bytesPerPixel;
pHostData = stbi_load(textureFilename.c_str(), &textureWidth, &textureHeight, &componentsPerPixel, componentsPerPixel);
if (nullptr == pHostData) {
std::cerr << "ERROR: Could not load texture image file '" << textureFilename << std::endl;
return;
}
bytesPerScanline = bytesPerPixel * textureWidth;
channelFormatDesc = cudaCreateChannelDesc<uint8_t>();
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
STBI_FREE(pHostData);
}
Device side code:
class imageTexture {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_texture(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const {
vec3 value;
u *= 3;
value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.y = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.z = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
return value;
}
private:
cudaTextureObject_t _texture;
};
Changed device side function, but the error persists:
class imageTexture :public textureX {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_text(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const override {
vec3 val;
u *= 3;
val.x = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.y = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.z = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
return val;
}
private:
cudaTextureObject_t _text;
};
I have written a new test program and planning to build up from there. The idea is have each thread read 3 values from texture and write it back to a buffer. Only the first triplet is correct. Is there anything inconsistent in my texture lookups with this:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < width) && (y < height)) {
float u = (float)x / (float)width;
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, 3*u, v);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, 3*u + 1, v);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, 3*u + 2, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cudaArray;
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = false;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx])
std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cudaArray));
delete[] hOutput;
delete[] pHostData;
}
int main() {
cudaTex();
return 0;
}
Switching to integer coordinated in the kernel solved the problem
Resolution of the Original Problem
It turned out that the
a pointer to a bound function may only be used to call the function
error was caused by vec3
class having a getter function x()
and not a member variable named x
. So the code was trying use the getter function as an l-value!!!
There are several problems with the code you have now posted:
after the discussion in the comments, hopefully you can figure out what is wrong with this line of code:
cudaArray* cudaArray;
Your kernel code appears to be trying pass normalized float
coordinates but doing it incorrectly. There are several issues here: your x normalization is considering textureWidth
but it should be done over 3*textureWidth
(i.e. bytesPerScanline
). Although you are calling the width of your texture textureWidth
, really it is 3*textureWidth
. Also, texturing in this fashion is typically offset by 0.5. Finally, you are doing this:
textureDesc.normalizedCoords = false;
but if you want to use float
coordinates (seems to be what you want) you shoudl do:
textureDesc.normalizedCoords = true;
After you fix all that, you'll run into a non-CUDA issue. You're modifying these pointers:
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
then trying to delete them after modification:
delete[] hOutput;
delete[] pHostData;
that won't work correctly.
Here's a modified code that has the above issues addressed, it seems to run correctly for me:
$ cat t7.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
const float pix_offset = 0.5;
if ((x < width) && (y < height)) {
float u = (float)(3*x+pix_offset) / (float)(3*width);
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+1+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+2+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, u, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cArray;
checkCudaErrors(cudaMallocArray(&cArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
uint8_t *my_hOutput = hOutput;
uint8_t *my_pHostData = pHostData;
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx]){
std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
return;}
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cArray));
delete[] my_hOutput;
delete[] my_pHostData;
}
int main() {
cudaTex();
return 0;
}
$ nvcc -o t7 t7.cu -std=c++11
t7.cu: In function ‘void cudaTex()’:
t7.cu:56:12: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
t7.cu:56:131: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
$ cuda-memcheck ./t7
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
I'm not suggesting the above code is defect-free. It's mostly your code. I'm just pointing out some things I found. You can read about how to address the deprecation warning here.