I finally got a CPU with FP16 support and wanted to learn AVX512 programming in VS2022. I wrote a simple FIR filtering loop on fp16 values. If I compile in debug, everything works fine and I get similar results to fp32 minus some quantization error. If I compile in release with optimization enabled, the output of all samples becomes Inf. Even stranger, if I print any values inside the FIR loop (which I guess restricts the optimizer), the FIR output is correct even though logically this has no effect on the calculation.
Here is the FIR loop:
__m256h x1_accumh = _mm256_setzero_ph();
for (int t = 0; t < taps_samp[ii] + 1; t++)
{
if (!usefp16)
{
\\ommitted fp32 case that works normally
}
else //fp16 code path
{
//load into an XMM register and then broadcast
__m128i b1 = _mm_loadu_si16((barrayh + count++));
//take lowest 16 bit value and broadcast
__m256h bh = _mm256_broadcastw_epi16(b1);
//load all 16 channels of int16 RF data
__m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
//convert all 16 channels to fp16
__m256h x1hp = _mm256_cvtepi16_ph(x1);
//multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh
x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
if (ii == 0 && i == 0 && t == 0)
{
print256_f16(x1_accumh); //uncommenting this fixes decoding???????
}
}
}
With the print uncommented, this returns the expected filtered data in x1_accumh. With it commented out, all values are always Inf regardless of input. I checked the disassembly, and it looks like the arguments to _mm256_fmadd_ph are wrong in the optimized case.
Print included (which works):
for (int t = 0; t < taps_samp[ii] + 1; t++)
00007FF6AA2E1B1E xor edi,edi
00007FF6AA2E1B20 cdq
00007FF6AA2E1B21 sub eax,edx
00007FF6AA2E1B23 sar eax,1
00007FF6AA2E1B25 sub ecx,eax
00007FF6AA2E1B27 lea eax,[r8+1]
00007FF6AA2E1B2B shl ecx,5
00007FF6AA2E1B2E movsxd r12,ecx
00007FF6AA2E1B31 add r12,r10
00007FF6AA2E1B34 vxorps xmm3,xmm3,xmm3
00007FF6AA2E1B38 vmovdqu ymmword ptr [x1_accumh],ymm3
00007FF6AA2E1B3D test eax,eax
00007FF6AA2E1B3F jle main+0BEAh (07FF6AA2E1BEAh)
00007FF6AA2E1B45 mov r14,qword ptr [barrayh]
00007FF6AA2E1B49 nop dword ptr [rax]
{
if (!usefp16)
{
}
else //fp16 code path
{
//load into an XMM register and then broadcast
__m128i b1 = _mm_loadu_si16((barrayh + count++));
00007FF6AA2E1B50 movzx eax,word ptr [r14+r15*2]
00007FF6AA2E1B55 inc r15
00007FF6AA2E1B58 vmovd xmm0,eax
//take lowest 16 bit value and broadcast
__m256h bh = _mm256_broadcastw_epi16(b1);
00007FF6AA2E1B5C vpbroadcastw ymm2,xmm0
//load all 16 channels of int16 RF data
__m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
00007FF6AA2E1B61 movsxd rax,edi
00007FF6AA2E1B64 shl rax,5
00007FF6AA2E1B68 vlddqu ymm0,ymmword ptr [rax+r12]
//convert all 16 channels to fp16
__m256h x1hp = _mm256_cvtepi16_ph(x1);
00007FF6AA2E1B6E vcvtw2ph ymm1,ymm0
//multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh
x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
00007FF6AA2E1B74 vfmadd213ph ymm1,ymm2,ymm3 //ymm1 = rf samples, ymm2 = bh coefficient, ymm3 = accumulator, so this does rf samples * bh coefficients + accumulator
00007FF6AA2E1B7A vmovdqu ymm3,ymm1
00007FF6AA2E1B7E vmovdqu ymmword ptr [x1_accumh],ymm1
if (ii == 0 && i == 0 && t == 0)
00007FF6AA2E1B83 test rsi,rsi
00007FF6AA2E1B86 jne main+0BD3h (07FF6AA2E1BD3h)
00007FF6AA2E1B88 test r9d,r9d
00007FF6AA2E1B8B jne main+0BD3h (07FF6AA2E1BD3h)
00007FF6AA2E1B8D test edi,edi
00007FF6AA2E1B8F jne main+0BD3h (07FF6AA2E1BD3h)
{
print256_f16(x1_accumh); //uncommenting this fixes decoding???????
00007FF6AA2E1B91 vcvtph2ps zmm0,ymm1
00007FF6AA2E1B97 vmovups zmmword ptr [frac_scalar],zmm0
00007FF6AA2E1B9E xor ebx,ebx
00007FF6AA2E1BA0 vmovss xmm1,dword ptr [rbp+rbx*4+80h]
00007FF6AA2E1BA9 vcvtss2sd xmm1,xmm1,xmm1
00007FF6AA2E1BAD vmovq rdx,xmm1
00007FF6AA2E1BB2 lea rcx,[string "%f\n" (07FF6AA2E42C0h)]
00007FF6AA2E1BB9 vzeroupper
00007FF6AA2E1BBC call printf (07FF6AA2E2C60h)
00007FF6AA2E1BC1 inc rbx
00007FF6AA2E1BC4 cmp rbx,10h
00007FF6AA2E1BC8 jl main+0BA0h (07FF6AA2E1BA0h)
00007FF6AA2E1BCA vmovdqu ymm3,ymmword ptr [x1_accumh]
00007FF6AA2E1BCF mov r9d,dword ptr [rbp]
__m256 x1_accum = _mm256_setzero_ps();
__m256 x2_accum = _mm256_setzero_ps();
__m256h x1_accumh = _mm256_setzero_ph();
for (int t = 0; t < taps_samp[ii] + 1; t++)
00007FF6AA2E1BD3 mov eax,dword ptr taps_samp[rsi*4]
00007FF6AA2E1BDA inc edi
00007FF6AA2E1BDC inc eax
00007FF6AA2E1BDE cmp edi,eax
00007FF6AA2E1BE0 jl main+0B50h (07FF6AA2E1B50h)
00007FF6AA2E1BE6 mov r14,qword ptr [filtered_data]
}
}
}
Here is the optimized version without print that doesn't work:
{
__m128i* startPtr = (__m128i* ) (linedata+ (ii - taps_samp[ii] / 2) * sampleSize);
00007FF64FE11B2B mov ecx,r8d
00007FF64FE11B2E nop
{
if (!usefp16)
{
}
else //fp16 code path
{
//load into an XMM register and then broadcast
__m128i b1 = _mm_loadu_si16((barrayh + count++));
00007FF64FE11B30 movzx eax,word ptr [r13+rdi*2]
00007FF64FE11B36 inc rdi
//take lowest 16 bit value and broadcast
__m256h bh = _mm256_broadcastw_epi16(b1);
//load all 16 channels of int16 RF data
__m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
00007FF64FE11B39 vlddqu ymm1,ymmword ptr [rdx]
00007FF64FE11B3D lea rdx,[rdx+20h]
00007FF64FE11B41 vmovd xmm0,eax
00007FF64FE11B45 vpbroadcastw ymm2,xmm0
//convert all 16 channels to fp16
__m256h x1hp = _mm256_cvtepi16_ph(x1);
00007FF64FE11B4A vcvtw2ph ymm0,ymm1
//multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh
x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
00007FF64FE11B50 vfmadd132ph ymm3,ymm2,ymm0 \\ymm3 =accumulator, ymm2 = b coefficients, ymm0 = sample data, so this does accumulator *sample data + b coefficient
00007FF64FE11B56 sub rcx,1
00007FF64FE11B5A jne main+0B30h (07FF64FE11B30h)
00007FF64FE11B5C vmovdqu ymmword ptr [x1_accumh],ymm3
/*if (ii == 0 && i == 0 && t == 0)
{
print256_f16(x1_accumh); //uncommenting this fixes decoding???????
}*/
}
I'm very new to x86 assembly, but the key difference appears to be the change of
vfmadd213ph ymm1,ymm2,ymm3 //ymm1 = rf samples, ymm2 = b coefficient, ymm3 = accumulator, so this does rf samples * b coefficients + accumulator
to
vfmadd132ph ymm3,ymm2,ymm0 \\ymm3 =accumulator, ymm2 = b coefficients, ymm0 = sample data, so this does accumulator *sample data + b coefficient
It looks like the compiler changes vfmadd213ph to vfmadd132ph but doesn't reorder the arguments (b stays in ymm2), so instead of multiplying the FIR tap coefficient times the sample and adding to the accumulator, it multiplies the accumulator times the sample data and then adds it to the FIR tap coefficient. The result is Inf because that value becomes enormous very quickly.
Question: Am I completely misunderstanding how AVX programming works and making some trivial code error? Is there anything I can do to fix this? This is my first AVX512 program ever, so I may be doing something very foolish.
The bug was confirmed by Microsoft and then corrected in VS 17.7.0, which can compile the above code correctly.