How can I write to an fp16 surface?

I have a 4 channel texture / surface that are allocated with the following descriptors:

cudaChannelFormatDesc cuda_map_desc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
CUDA_CHECK(cudaMallocArray(&cuMapLeftArray, &cuda_map_desc, map_width, map_height));
cudaResourceDesc map_resource_desc{};
map_resource_desc.resType = cudaResourceTypeArray;
map_resource_desc.res.array.array = cuMapLeftArray;
cudaTextureDesc map_texture_desc{};
map_texture_desc.normalizedCoords = false;
map_texture_desc.addressMode[0] = cudaAddressModeClamp;
map_texture_desc.addressMode[1] = cudaAddressModeClamp;
map_texture_desc.filterMode = cudaFilterModeLinear;
map_texture_desc.readMode = cudaReadModeElementType;
CUDA_CHECK(cudaCreateTextureObject(&MapLeftTexObject, &map_resource_desc, &map_texture_desc, NULL));
CUDA_CHECK(cudaCreateSurfaceObject(&cuMapLeftSurf, &map_resource_desc));

Here is how I write to it in a cuda kernel:

float4 addr_val = make_float4(pixel_left.x, pixel_left.y, pixel_right.x, pixel_right.y);
surf2Dwrite<float4>(addr_val, g_mapL, ix * sizeof(addr_val), iy);

Here is how I read the data in a different kernel:

float4 coords = tex2D<float4>(map_left, map_x, map_y); // returns image coords (not normalized)

I wish to try utilizing fp16 support in cuda and convert the code to use half instead of float.

How can I convert the provided code samples to utilize half / fp16? Is this even possible / efficient?

I tried simply changing the channel size from 32 to 16 but I could not find good docs / examples on how to read write to this kind of objects.

Solution

This is adapted from the surface write/texture read cuda sample code ("simpleSurfaceWrite") and also taking note of the relevant programming guide section, excerpting:

The 16-bit floating-point or half format supported by CUDA arrays is the same as the IEEE 754-2008 binary2 format.

CUDA C++ does not support a matching data type, but provides intrinsic functions to convert to and from the 32-bit floating-point format via the unsigned short type...

16-bit floating-point components are promoted to 32 bit float during texture fetching before any filtering is performed.

A channel description for the 16-bit floating-point format can be created by calling one of the cudaCreateChannelDescHalf*() functions.

There are at least 2 ways to tackle this:

We could use purely 16-bit traffic: writing ushort4 quantities in the surface write kernel and reading ushort4 quantities in the texture read kernel. Conversion from ushort to half (or float) is demonstrated in the host code of the first example below; it can be used similarly in device code as needed. For this we need a channel descriptor specifying ushort4.
We could use a mostly 16-bit definition, but take advantage of the note above: "16-bit floating-point components are promoted to 32 bit float during texture fetching". Using this methodology, our general setup isn't much different, our surface write kernel is unchanged, but we will specify a "half4" channel descriptor (effectively switching from integer to "floating point mode" in the texture unit) and our texture read kernel will now read float4 quantities instead of ushort4.

Examples of both follow:

# cat t65.cu

/*
 * demonstrate "half4" texture/surface using ushort4 texture fetch
 */

#include <cstdlib>
#include <cuda_fp16.h>
#include <iostream>
#define checkCudaErrors(x) {cudaError_t err=x; if (err != cudaSuccess) std::cout << "CUDA Error at line: " << __LINE__ << " " << cudaGetErrorString(err) << std::endl;}
using mt = ushort4;
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(mt *gIData, int width, int height,
                                   cudaSurfaceObject_t outputSurface) {
  // calculate surface coordinates
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  // read from global memory and write to cuarray (via surface reference)
  surf2Dwrite(gIData[y * width + x], outputSurface, x * sizeof(mt), y);
}

////////////////////////////////////////////////////////////////////////////////
//! read data using texture lookups
//! @param gOData  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void texReadKernel(mt *gOData, int width, int height,
                                cudaTextureObject_t tex) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  float u = x;
  float v = y;

  // read from texture and write to global memory
  gOData[y * width + x] = tex2D<mt>(tex, u, v);
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {

  // Get number of SMs on this GPU
  cudaDeviceProp deviceProps;
  const int devID = 0;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  std::cout << "CUDA device [" << deviceProps.name << "] has " << deviceProps.multiProcessorCount << " Multi-Processors, SM " << deviceProps.major << "." << deviceProps.minor << std::endl;

  // create data
  unsigned int width = 256, height=256;
  mt *hData = (mt *)malloc(width*height*sizeof(mt));

  unsigned int size = width * height * sizeof(mt);
  // populate
  for (int row = 0; row < height; row++)
    for (int col = 0; col < width; col++) {
      unsigned short x = __half_as_ushort(__float2half(row));
      unsigned short y = __half_as_ushort(__float2half(1.0f));
      unsigned short z = __half_as_ushort(__float2half(col));
      unsigned short w = __half_as_ushort(__float2half(2.0f));
      ushort4 val = {x, y, z, w};
      hData[row*width+col] = val;}

  // Allocate device memory for result
  mt *dData = NULL;
  checkCudaErrors(cudaMalloc((void **)&dData, size));
  checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
  // Allocate array and copy image data
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<ushort4>();
  cudaArray *cuArray;
  checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height,
                                  cudaArraySurfaceLoadStore));

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

  cudaSurfaceObject_t outputSurface;
  cudaResourceDesc surfRes;
  memset(&surfRes, 0, sizeof(cudaResourceDesc));
  surfRes.resType = cudaResourceTypeArray;
  surfRes.res.array.array = cuArray;

  checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
  checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
  surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height,
                                            outputSurface);
  checkCudaErrors(cudaMemset(dData, 0, size));
  cudaTextureObject_t tex;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = cuArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = false;
  texDescr.filterMode = cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeClamp;
  texDescr.addressMode[1] = cudaAddressModeClamp;
  texDescr.readMode = cudaReadModeElementType;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

  texReadKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, tex);


  // Allocate mem for the result on host side
  mt *hOData = (mt *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost));
  // check
  for (int row = 0; row < height; row++)
    for (int col = 0; col < width; col++) {
      ushort4 val = hOData[row*width+col];
      if (val.x != __half_as_ushort(__float2half(row)))  {std::cout << "mismatch at: " << row << "," << col << std::endl; return 0;}
      if (val.y != __half_as_ushort(__float2half(1.0f))) {std::cout << "mismatch at: " << row << "," << col << std::endl; return 0;}
      if (val.z != __half_as_ushort(__float2half(col)))  {std::cout << "mismatch at: " << row << "," << col << std::endl; return 0;}
      if (val.w != __half_as_ushort(__float2half(2.0f))) {std::cout << "mismatch at: " << row << "," << col << std::endl; return 0;}
      }
  std::cout << "Success" << std::endl;

  checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
  checkCudaErrors(cudaDestroyTextureObject(tex));
  checkCudaErrors(cudaFree(dData));
  checkCudaErrors(cudaFreeArray(cuArray));
}
# nvcc -o t65 t65.cu
# compute-sanitizer ./t65
========= COMPUTE-SANITIZER
CUDA device [NVIDIA L4] has 58 Multi-Processors, SM 8.9
Success
========= ERROR SUMMARY: 0 errors
# cat t65f.cu

/*
 * demonstrate "half4" texture/surface using float4 texture fetch
 */

#include <cstdlib>
#include <cuda_fp16.h>
#include <iostream>
#include <cstdio>
#define checkCudaErrors(x) {cudaError_t err=x; if (err != cudaSuccess) std::cout << "CUDA Error at line: " << __LINE__ << " " << cudaGetErrorString(err) << std::endl;}
using mt = ushort4;
////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////
//! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(mt *gIData, int width, int height,
                                   cudaSurfaceObject_t outputSurface) {
  // calculate surface coordinates
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  // read from global memory and write to cuarray (via surface reference)
  surf2Dwrite(gIData[y * width + x], outputSurface, x * sizeof(mt), y);
}

////////////////////////////////////////////////////////////////////////////////
//! read data using texture lookups
////////////////////////////////////////////////////////////////////////////////
__global__ void texReadKernel(int width, int height,
                                cudaTextureObject_t tex) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  float u = x;
  float v = y;

  // read from texture and write to global memory
  float4 val = tex2D<float4>(tex, u, v);
  if (val.x != v)    printf("mismatchx at %d,%d, was: %f, should be: %f\n", x,y,val.x, v);
  if (val.y != 1.0f) printf("mismatchy at %d,%d, was: %f, should be: %f\n", x,y,val.y, 1.0f);
  if (val.z != u)    printf("mismatchz at %d,%d, was: %f, should be: %f\n", x,y,val.z, u);
  if (val.w != 2.0f) printf("mismatchw at %d,%d, was: %f, should be: %f\n", x,y,val.w, 2.0f);
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {

  // Get number of SMs on this GPU
  cudaDeviceProp deviceProps;
  const int devID = 0;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  std::cout << "CUDA device [" << deviceProps.name << "] has " << deviceProps.multiProcessorCount << " Multi-Processors, SM " << deviceProps.major << "." << deviceProps.minor << std::endl;

  // create data
  unsigned int width = 256, height=256;
  mt *hData = (mt *)malloc(width*height*sizeof(mt));

  unsigned int size = width * height * sizeof(mt);
  // populate
  for (int row = 0; row < height; row++)
    for (int col = 0; col < width; col++) {
      unsigned short x = __half_as_ushort(__float2half(row));
      unsigned short y = __half_as_ushort(__float2half(1.0f));
      unsigned short z = __half_as_ushort(__float2half(col));
      unsigned short w = __half_as_ushort(__float2half(2.0f));
      ushort4 val = {x, y, z, w};
      hData[row*width+col] = val;}

  // Allocate device memory for result
  mt *dData = NULL;
  checkCudaErrors(cudaMalloc((void **)&dData, size));
  checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
  // Allocate array and copy image data
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDescHalf4();
  cudaArray *cuArray;
  checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height,
                                  cudaArraySurfaceLoadStore));

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

  cudaSurfaceObject_t outputSurface;
  cudaResourceDesc surfRes;
  memset(&surfRes, 0, sizeof(cudaResourceDesc));
  surfRes.resType = cudaResourceTypeArray;
  surfRes.res.array.array = cuArray;

  checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
  checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
  surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height,
                                            outputSurface);
  checkCudaErrors(cudaMemset(dData, 0, size));
  cudaTextureObject_t tex;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = cuArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = false;
  texDescr.filterMode = cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeClamp;
  texDescr.addressMode[1] = cudaAddressModeClamp;
  texDescr.readMode = cudaReadModeElementType;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

  texReadKernel<<<dimGrid, dimBlock, 0>>>(width, height, tex);

  checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
  checkCudaErrors(cudaDestroyTextureObject(tex));
  checkCudaErrors(cudaFree(dData));
  checkCudaErrors(cudaFreeArray(cuArray));
}
# nvcc -o t65f t65f.cu
# compute-sanitizer ./t65f
========= COMPUTE-SANITIZER
CUDA device [NVIDIA L4] has 58 Multi-Processors, SM 8.9
========= ERROR SUMMARY: 0 errors
#