I have a 4 channel texture / surface that are allocated with the following descriptors:
cudaChannelFormatDesc cuda_map_desc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
CUDA_CHECK(cudaMallocArray(&cuMapLeftArray, &cuda_map_desc, map_width, map_height));
cudaResourceDesc map_resource_desc{};
map_resource_desc.resType = cudaResourceTypeArray;
map_resource_desc.res.array.array = cuMapLeftArray;
cudaTextureDesc map_texture_desc{};
map_texture_desc.normalizedCoords = false;
map_texture_desc.addressMode[0] = cudaAddressModeClamp;
map_texture_desc.addressMode[1] = cudaAddressModeClamp;
map_texture_desc.filterMode = cudaFilterModeLinear;
map_texture_desc.readMode = cudaReadModeElementType;
CUDA_CHECK(cudaCreateTextureObject(&MapLeftTexObject, &map_resource_desc, &map_texture_desc, NULL));
CUDA_CHECK(cudaCreateSurfaceObject(&cuMapLeftSurf, &map_resource_desc));
Here is how I write to it in a cuda kernel:
float4 addr_val = make_float4(pixel_left.x, pixel_left.y, pixel_right.x, pixel_right.y);
surf2Dwrite<float4>(addr_val, g_mapL, ix * sizeof(addr_val), iy);
Here is how I read the data in a different kernel:
float4 coords = tex2D<float4>(map_left, map_x, map_y); // returns image coords (not normalized)
I wish to try utilizing fp16 support in cuda and convert the code to use half
instead of float.
How can I convert the provided code samples to utilize half / fp16? Is this even possible / efficient?
I tried simply changing the channel size from 32 to 16 but I could not find good docs / examples on how to read write to this kind of objects.
This is adapted from the surface write/texture read cuda sample code ("simpleSurfaceWrite") and also taking note of the relevant programming guide section, excerpting:
The 16-bit floating-point or half format supported by CUDA arrays is the same as the IEEE 754-2008 binary2 format.
CUDA C++ does not support a matching data type, but provides intrinsic functions to convert to and from the 32-bit floating-point format via the unsigned short type...
16-bit floating-point components are promoted to 32 bit float during texture fetching before any filtering is performed.
A channel description for the 16-bit floating-point format can be created by calling one of the cudaCreateChannelDescHalf*() functions.
There are at least 2 ways to tackle this:
We could use purely 16-bit traffic: writing ushort4
quantities in the surface write kernel and reading ushort4
quantities in the texture read kernel. Conversion from ushort
to half
(or float
) is demonstrated in the host code of the first example below; it can be used similarly in device code as needed. For this we need a channel descriptor specifying ushort4
.
We could use a mostly 16-bit definition, but take advantage of the note above: "16-bit floating-point components are promoted to 32 bit float during texture fetching". Using this methodology, our general setup isn't much different, our surface write kernel is unchanged, but we will specify a "half4" channel descriptor (effectively switching from integer to "floating point mode" in the texture unit) and our texture read kernel will now read float4
quantities instead of ushort4
.
Examples of both follow:
# cat t65.cu
/*
* demonstrate "half4" texture/surface using ushort4 texture fetch
*/
#include <cstdlib>
#include <cuda_fp16.h>
#include <iostream>
#define checkCudaErrors(x) {cudaError_t err=x; if (err != cudaSuccess) std::cout << "CUDA Error at line: " << __LINE__ << " " << cudaGetErrorString(err) << std::endl;}
using mt = ushort4;
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(mt *gIData, int width, int height,
cudaSurfaceObject_t outputSurface) {
// calculate surface coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
// read from global memory and write to cuarray (via surface reference)
surf2Dwrite(gIData[y * width + x], outputSurface, x * sizeof(mt), y);
}
////////////////////////////////////////////////////////////////////////////////
//! read data using texture lookups
//! @param gOData output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void texReadKernel(mt *gOData, int width, int height,
cudaTextureObject_t tex) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = x;
float v = y;
// read from texture and write to global memory
gOData[y * width + x] = tex2D<mt>(tex, u, v);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
// Get number of SMs on this GPU
cudaDeviceProp deviceProps;
const int devID = 0;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
std::cout << "CUDA device [" << deviceProps.name << "] has " << deviceProps.multiProcessorCount << " Multi-Processors, SM " << deviceProps.major << "." << deviceProps.minor << std::endl;
// create data
unsigned int width = 256, height=256;
mt *hData = (mt *)malloc(width*height*sizeof(mt));
unsigned int size = width * height * sizeof(mt);
// populate
for (int row = 0; row < height; row++)
for (int col = 0; col < width; col++) {
unsigned short x = __half_as_ushort(__float2half(row));
unsigned short y = __half_as_ushort(__float2half(1.0f));
unsigned short z = __half_as_ushort(__float2half(col));
unsigned short w = __half_as_ushort(__float2half(2.0f));
ushort4 val = {x, y, z, w};
hData[row*width+col] = val;}
// Allocate device memory for result
mt *dData = NULL;
checkCudaErrors(cudaMalloc((void **)&dData, size));
checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
// Allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<ushort4>();
cudaArray *cuArray;
checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height,
cudaArraySurfaceLoadStore));
dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
cudaSurfaceObject_t outputSurface;
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = cuArray;
checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height,
outputSurface);
checkCudaErrors(cudaMemset(dData, 0, size));
cudaTextureObject_t tex;
cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cuArray;
cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = false;
texDescr.filterMode = cudaFilterModePoint;
texDescr.addressMode[0] = cudaAddressModeClamp;
texDescr.addressMode[1] = cudaAddressModeClamp;
texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
texReadKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, tex);
// Allocate mem for the result on host side
mt *hOData = (mt *)malloc(size);
// copy result from device to host
checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost));
// check
for (int row = 0; row < height; row++)
for (int col = 0; col < width; col++) {
ushort4 val = hOData[row*width+col];
if (val.x != __half_as_ushort(__float2half(row))) {std::cout << "mismatch at: " << row << "," << col << std::endl; return 0;}
if (val.y != __half_as_ushort(__float2half(1.0f))) {std::cout << "mismatch at: " << row << "," << col << std::endl; return 0;}
if (val.z != __half_as_ushort(__float2half(col))) {std::cout << "mismatch at: " << row << "," << col << std::endl; return 0;}
if (val.w != __half_as_ushort(__float2half(2.0f))) {std::cout << "mismatch at: " << row << "," << col << std::endl; return 0;}
}
std::cout << "Success" << std::endl;
checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
checkCudaErrors(cudaDestroyTextureObject(tex));
checkCudaErrors(cudaFree(dData));
checkCudaErrors(cudaFreeArray(cuArray));
}
# nvcc -o t65 t65.cu
# compute-sanitizer ./t65
========= COMPUTE-SANITIZER
CUDA device [NVIDIA L4] has 58 Multi-Processors, SM 8.9
Success
========= ERROR SUMMARY: 0 errors
# cat t65f.cu
/*
* demonstrate "half4" texture/surface using float4 texture fetch
*/
#include <cstdlib>
#include <cuda_fp16.h>
#include <iostream>
#include <cstdio>
#define checkCudaErrors(x) {cudaError_t err=x; if (err != cudaSuccess) std::cout << "CUDA Error at line: " << __LINE__ << " " << cudaGetErrorString(err) << std::endl;}
using mt = ushort4;
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(mt *gIData, int width, int height,
cudaSurfaceObject_t outputSurface) {
// calculate surface coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
// read from global memory and write to cuarray (via surface reference)
surf2Dwrite(gIData[y * width + x], outputSurface, x * sizeof(mt), y);
}
////////////////////////////////////////////////////////////////////////////////
//! read data using texture lookups
////////////////////////////////////////////////////////////////////////////////
__global__ void texReadKernel(int width, int height,
cudaTextureObject_t tex) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = x;
float v = y;
// read from texture and write to global memory
float4 val = tex2D<float4>(tex, u, v);
if (val.x != v) printf("mismatchx at %d,%d, was: %f, should be: %f\n", x,y,val.x, v);
if (val.y != 1.0f) printf("mismatchy at %d,%d, was: %f, should be: %f\n", x,y,val.y, 1.0f);
if (val.z != u) printf("mismatchz at %d,%d, was: %f, should be: %f\n", x,y,val.z, u);
if (val.w != 2.0f) printf("mismatchw at %d,%d, was: %f, should be: %f\n", x,y,val.w, 2.0f);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
// Get number of SMs on this GPU
cudaDeviceProp deviceProps;
const int devID = 0;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
std::cout << "CUDA device [" << deviceProps.name << "] has " << deviceProps.multiProcessorCount << " Multi-Processors, SM " << deviceProps.major << "." << deviceProps.minor << std::endl;
// create data
unsigned int width = 256, height=256;
mt *hData = (mt *)malloc(width*height*sizeof(mt));
unsigned int size = width * height * sizeof(mt);
// populate
for (int row = 0; row < height; row++)
for (int col = 0; col < width; col++) {
unsigned short x = __half_as_ushort(__float2half(row));
unsigned short y = __half_as_ushort(__float2half(1.0f));
unsigned short z = __half_as_ushort(__float2half(col));
unsigned short w = __half_as_ushort(__float2half(2.0f));
ushort4 val = {x, y, z, w};
hData[row*width+col] = val;}
// Allocate device memory for result
mt *dData = NULL;
checkCudaErrors(cudaMalloc((void **)&dData, size));
checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
// Allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaCreateChannelDescHalf4();
cudaArray *cuArray;
checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height,
cudaArraySurfaceLoadStore));
dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
cudaSurfaceObject_t outputSurface;
cudaResourceDesc surfRes;
memset(&surfRes, 0, sizeof(cudaResourceDesc));
surfRes.resType = cudaResourceTypeArray;
surfRes.res.array.array = cuArray;
checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height,
outputSurface);
checkCudaErrors(cudaMemset(dData, 0, size));
cudaTextureObject_t tex;
cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cuArray;
cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = false;
texDescr.filterMode = cudaFilterModePoint;
texDescr.addressMode[0] = cudaAddressModeClamp;
texDescr.addressMode[1] = cudaAddressModeClamp;
texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
texReadKernel<<<dimGrid, dimBlock, 0>>>(width, height, tex);
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
checkCudaErrors(cudaDestroyTextureObject(tex));
checkCudaErrors(cudaFree(dData));
checkCudaErrors(cudaFreeArray(cuArray));
}
# nvcc -o t65f t65f.cu
# compute-sanitizer ./t65f
========= COMPUTE-SANITIZER
CUDA device [NVIDIA L4] has 58 Multi-Processors, SM 8.9
========= ERROR SUMMARY: 0 errors
#