CUDA has a nice set of SIMD instructions for integers that allow efficient SIMD computations. Among those, there are some that compute addition and subtraction per byte or per half-word (like __vadd2 and __vadd4), however, I couldn't find a similar function that computes per-byte multiplication for a 32bit register. I would appreciate it if someone can help me find a proper solution.
however, I couldn't find a similar function that computes per-byte multiplication for a 32bit register.
There isn't one that returns the 4 individual products.
The closest is the __dp4a()
intrinsic which returns the sum of the 4 products, in a 32-bit integer.
You could write an 8-bit packed unsigned multiply with saturation like this:
$ cat t2048.cu
#include <cstdio>
#include <cstdint>
__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
const unsigned sv = 255;
uchar4 result;
unsigned t;
t = a.x*b.x;
if (t > sv) t = sv;
result.x = t;
t = a.y*b.y;
if (t > sv) t = sv;
result.y = t;
t = a.z*b.z;
if (t > sv) t = sv;
result.z = t;
t = a.w*b.w;
if (t > sv) t = sv;
result.w = t;
return result;
}
__global__ void k(uchar4 a, uchar4 b, uchar4 *c){
*c = u8mulsat(a, b);
}
int main(){
uchar4 a,b,c, *d_c;
cudaMalloc(&d_c, sizeof(uchar4));
a.x = 1;
a.y = 2;
a.z = 4;
a.w = 8;
b.x = 64;
b.y = 64;
b.z = 64;
b.w = 1;
k<<<1,1>>>(a, b, d_c);
cudaMemcpy(&c, d_c, sizeof(uchar4), cudaMemcpyDeviceToHost);
printf("c.x = %u\n", (unsigned)c.x);
printf("c.y = %u\n", (unsigned)c.y);
printf("c.z = %u\n", (unsigned)c.z);
printf("c.w = %u\n", (unsigned)c.w);
}
$ nvcc -o t2048 t2048.cu
$ compute-sanitizer ./t2048
========= COMPUTE-SANITIZER
c.x = 64
c.y = 128
c.z = 255
c.w = 8
========= ERROR SUMMARY: 0 errors
$ cuobjdump -sass ./t2048
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Function : _Z1k6uchar4S_PS_
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c4400e22007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ LDC.U8 R0, c[0x0][0x140] ; /* 0xef9000001407ff00 */
/*0018*/ LDC.U8 R2, c[0x0][0x144] ; /* 0xef9000001447ff02 */
/* 0x001d4400e6200731 */
/*0028*/ LDC.U8 R3, c[0x0][0x141] ; /* 0xef9000001417ff03 */
/*0030*/ LDC.U8 R4, c[0x0][0x145] ; /* 0xef9000001457ff04 */
/*0038*/ LDC.U8 R5, c[0x0][0x142] ; /* 0xef9000001427ff05 */
/* 0x001dfc00ee200751 */
/*0048*/ LDC.U8 R6, c[0x0][0x146] ; /* 0xef9000001467ff06 */
/*0050*/ LDC.U8 R7, c[0x0][0x143] ; /* 0xef9000001437ff07 */
/*0058*/ LDC.U8 R8, c[0x0][0x147] ; /* 0xef9000001477ff08 */
/* 0x009fd002fe200fe1 */
/*0068*/ XMAD R0, R2, R0, RZ ; /* 0x5b007f8000070200 */
/*0070*/ XMAD R2, R4, R3, RZ ; /* 0x5b007f8000370402 */
/*0078*/ XMAD R3, R6, R5, RZ ; /* 0x5b007f8000570603 */
/* 0x001fc408fe2007f1 */
/*0088*/ IMNMX.U32 R0, R0, 0xff, PT ; /* 0x382003800ff70000 */
/*0090*/ XMAD R4, R8, R7, RZ ; /* 0x5b007f8000770804 */
/*0098*/ IMNMX.U32 R2, R2, 0xff, PT ; /* 0x382003800ff70202 */
/* 0x001fc400fe2007e4 */
/*00a8*/ IMNMX.U32 R3, R3, 0xff, PT ; /* 0x382003800ff70303 */
/*00b0*/ IMNMX.U32 R4, R4, 0xff, PT ; /* 0x382003800ff70404 */
/*00b8*/ BFI R0, R2, 0x808, R0 ; /* 0x36f0000080870200 */
/* 0x001fd400fe2007f5 */
/*00c8*/ MOV R2, c[0x0][0x148] ; /* 0x4c98078005270002 */
/*00d0*/ BFI R5, R3, 0x810, R0 ; /* 0x36f0000081070305 */
/*00d8*/ MOV R3, c[0x0][0x14c] ; /* 0x4c98078005370003 */
/* 0x001ffc00fe2007e2 */
/*00e8*/ BFI R4, R4, 0x818, R5 ; /* 0x36f0028081870404 */
/*00f0*/ STG.E [R2], R4 ; /* 0xeedc200000070204 */
/*00f8*/ EXIT ; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0108*/ BRA 0x100 ; /* 0xe2400fffff07000f */
/*0110*/ NOP; /* 0x50b0000000070f00 */
/*0118*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*0128*/ NOP; /* 0x50b0000000070f00 */
/*0130*/ NOP; /* 0x50b0000000070f00 */
/*0138*/ NOP; /* 0x50b0000000070f00 */
..........
Fatbin ptx code:
================
arch = sm_52
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$
The SASS code appears to be about as I would expect, roughly the same length as the C++ code, ignoring the LDC
and STG
instructions.
FWIW, on Tesla V100, CUDA 11.4, the implementation by njuffa and mine are pretty close in terms of register usage (njuffa: 16, mine: 17) and performance (njuffa about 1% faster):
$ cat t2048.cu
#include <iostream>
#include <cstdint>
__device__ unsigned int vmulus4 (unsigned int a, unsigned int b)
{
unsigned int plo, phi, res;
// compute products
plo = ((a & 0x000000ff) * (b & 0x000000ff) +
(a & 0x0000ff00) * (b & 0x0000ff00));
phi = (__umulhi (a & 0x00ff0000, b & 0x00ff0000) +
__umulhi (a & 0xff000000, b & 0xff000000));
// clamp products to 255
plo |= __vcmpne2 (plo & 0xff00ff00, 0x00000000);
phi |= __vcmpne2 (phi & 0xff00ff00, 0x00000000);
// extract least significant eight bits of each product
res = __byte_perm (plo, phi, 0x6420);
return res;
}
__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
const unsigned sv = 255;
uchar4 result;
unsigned t;
t = a.x*b.x;
if (t > sv) t = sv;
result.x = t;
t = a.y*b.y;
if (t > sv) t = sv;
result.y = t;
t = a.z*b.z;
if (t > sv) t = sv;
result.z = t;
t = a.w*b.w;
if (t > sv) t = sv;
result.w = t;
return result;
}
__global__ void k(const uchar4 * __restrict__ a, const uchar4 * __restrict__ b, uchar4 * __restrict__ c, unsigned N){
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < N)
c[idx] = u8mulsat(a[idx], b[idx]);
}
__global__ void k1(const unsigned * __restrict__ a, const unsigned * __restrict__ b, unsigned * __restrict__ c, unsigned N){
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < N)
c[idx] = vmulus4(a[idx], b[idx]);
}
int main(){
unsigned N = 256U*80U*8U*400U;
uchar4 *d_a,*d_b, *d_c;
cudaMalloc(&d_c, sizeof(uchar4)*N);
cudaMalloc(&d_a, sizeof(uchar4)*N);
cudaMalloc(&d_b, sizeof(uchar4)*N);
for (int i = 0; i < 100; i++) {
k<<<N/256,256>>>(d_a, d_b, d_c, N);
k1<<<N/256,256>>>((unsigned *)d_a, (unsigned *)d_b, (unsigned *)d_c, N);}
cudaDeviceSynchronize();
}
$ nvcc -o t2048 t2048.cu -arch=sm_70 -Xptxas -v
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z2k1PKjS0_Pjj' for 'sm_70'
ptxas info : Function properties for _Z2k1PKjS0_Pjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 16 registers, 380 bytes cmem[0]
ptxas info : Compiling entry function '_Z1kPK6uchar4S1_PS_j' for 'sm_70'
ptxas info : Function properties for _Z1kPK6uchar4S1_PS_j
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 380 bytes cmem[0]
$ nvprof ./t2048
==2696== NVPROF is profiling process 2696, command: ./t2048
==2696== Profiling application: ./t2048
==2696== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 50.21% 100.24ms 100 1.0024ms 998.26us 1.0084ms k(uchar4 const *, uchar4 const *, uchar4*, unsigned int)
49.79% 99.412ms 100 994.12us 990.33us 1.0015ms k1(unsigned int const *, unsigned int const *, unsigned int*, unsigned int)
API calls: 57.39% 279.76ms 3 93.254ms 557.75us 278.64ms cudaMalloc
40.69% 198.31ms 1 198.31ms 198.31ms 198.31ms cudaDeviceSynchronize
1.03% 5.0147ms 4 1.2537ms 589.80us 3.2328ms cuDeviceTotalMem
0.51% 2.4799ms 404 6.1380us 333ns 272.34us cuDeviceGetAttribute
0.30% 1.4715ms 200 7.3570us 6.5220us 68.684us cudaLaunchKernel
0.07% 354.69us 4 88.672us 61.927us 166.60us cuDeviceGetName
0.00% 20.956us 4 5.2390us 3.1200us 7.8000us cuDeviceGetPCIBusId
0.00% 10.445us 8 1.3050us 522ns 4.9100us cuDeviceGet
0.00% 3.7970us 4 949ns 780ns 1.2230us cuDeviceGetUuid
0.00% 3.2030us 3 1.0670us 751ns 1.5050us cuDeviceGetCount
$
Later:
Here is a slightly faster routine (a few percent, on sm_70
) compared to my previous:
__device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
uchar4 result;
const half sv = 255;
const short svi = 255;
__half2 ah2, bh2, rh2;
ah2 = __floats2half2_rn(a.x, a.y);
bh2 = __floats2half2_rn(b.x, b.y);
rh2 = __hmul2(ah2, bh2);
result.x = (rh2.x > sv) ? (svi):((short)rh2.x);
result.y = (rh2.y > sv) ? (svi):((short)rh2.y);
ah2 = __floats2half2_rn(a.z, a.w);
bh2 = __floats2half2_rn(b.z, b.w);
rh2 = __hmul2(ah2, bh2);
result.z = (rh2.x > sv) ? (svi):((short)rh2.x);
result.w = (rh2.y > sv) ? (svi):((short)rh2.y);
return result;
}
It has the disadvantage that it uses CUDA half-precision intrinsics, so it is "less portable" than the previous, and likewise cannot be decorated with __host__
.