AVX512-FP16 intrinsics fails in release mode, works in debug

I finally got a CPU with FP16 support and wanted to learn AVX512 programming in VS2022. I wrote a simple FIR filtering loop on fp16 values. If I compile in debug, everything works fine and I get similar results to fp32 minus some quantization error. If I compile in release with optimization enabled, the output of all samples becomes Inf. Even stranger, if I print any values inside the FIR loop (which I guess restricts the optimizer), the FIR output is correct even though logically this has no effect on the calculation.

Here is the FIR loop:

                __m256h x1_accumh = _mm256_setzero_ph();

                for (int t = 0; t < taps_samp[ii] + 1; t++)
                {
                    if (!usefp16)
                    {

                         \\ommitted fp32 case that works normally
                    }
                    else //fp16 code path
                    {

                        //load into an XMM register and then broadcast 
                        __m128i b1 = _mm_loadu_si16((barrayh + count++));

                        //take lowest 16 bit value and broadcast
                        __m256h bh = _mm256_broadcastw_epi16(b1);

                        //load all 16 channels of int16 RF data
                        __m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);

                        //convert all 16 channels to fp16
                        __m256h x1hp = _mm256_cvtepi16_ph(x1);

                        //multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh                   
                        x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);

                        if (ii == 0 && i == 0 && t == 0)
                        {
                            print256_f16(x1_accumh);        //uncommenting this fixes decoding???????
                        }

                    }
                }

With the print uncommented, this returns the expected filtered data in x1_accumh. With it commented out, all values are always Inf regardless of input. I checked the disassembly, and it looks like the arguments to _mm256_fmadd_ph are wrong in the optimized case.

Print included (which works):

                for (int t = 0; t < taps_samp[ii] + 1; t++)
00007FF6AA2E1B1E  xor         edi,edi  
00007FF6AA2E1B20  cdq  
00007FF6AA2E1B21  sub         eax,edx  
00007FF6AA2E1B23  sar         eax,1  
00007FF6AA2E1B25  sub         ecx,eax  
00007FF6AA2E1B27  lea         eax,[r8+1]  
00007FF6AA2E1B2B  shl         ecx,5  
00007FF6AA2E1B2E  movsxd      r12,ecx  
00007FF6AA2E1B31  add         r12,r10  
00007FF6AA2E1B34  vxorps      xmm3,xmm3,xmm3  
00007FF6AA2E1B38  vmovdqu     ymmword ptr [x1_accumh],ymm3  
00007FF6AA2E1B3D  test        eax,eax  
00007FF6AA2E1B3F  jle         main+0BEAh (07FF6AA2E1BEAh)  
00007FF6AA2E1B45  mov         r14,qword ptr [barrayh]  
00007FF6AA2E1B49  nop         dword ptr [rax]  

                {
                    if (!usefp16)
                    {
                    }
                    else //fp16 code path
                    {

                        //load into an XMM register and then broadcast 
                        __m128i b1 = _mm_loadu_si16((barrayh + count++));
00007FF6AA2E1B50  movzx       eax,word ptr [r14+r15*2]  
00007FF6AA2E1B55  inc         r15  
00007FF6AA2E1B58  vmovd       xmm0,eax  

                        //take lowest 16 bit value and broadcast
                        __m256h bh = _mm256_broadcastw_epi16(b1);
00007FF6AA2E1B5C  vpbroadcastw ymm2,xmm0  

                        //load all 16 channels of int16 RF data
                        __m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
00007FF6AA2E1B61  movsxd      rax,edi  
00007FF6AA2E1B64  shl         rax,5  
00007FF6AA2E1B68  vlddqu      ymm0,ymmword ptr [rax+r12]  

                        //convert all 16 channels to fp16
                        __m256h x1hp = _mm256_cvtepi16_ph(x1);
00007FF6AA2E1B6E  vcvtw2ph    ymm1,ymm0  

                        //multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh                   
                        x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
00007FF6AA2E1B74  vfmadd213ph ymm1,ymm2,ymm3    //ymm1 = rf samples, ymm2 = bh coefficient, ymm3 = accumulator, so this does rf samples * bh coefficients + accumulator
00007FF6AA2E1B7A  vmovdqu     ymm3,ymm1  
00007FF6AA2E1B7E  vmovdqu     ymmword ptr [x1_accumh],ymm1  


                        if (ii == 0 && i == 0 && t == 0)
00007FF6AA2E1B83  test        rsi,rsi  
00007FF6AA2E1B86  jne         main+0BD3h (07FF6AA2E1BD3h)  
00007FF6AA2E1B88  test        r9d,r9d  
00007FF6AA2E1B8B  jne         main+0BD3h (07FF6AA2E1BD3h)  
00007FF6AA2E1B8D  test        edi,edi  
00007FF6AA2E1B8F  jne         main+0BD3h (07FF6AA2E1BD3h)  
                        {
                            print256_f16(x1_accumh);        //uncommenting this fixes decoding???????
00007FF6AA2E1B91  vcvtph2ps   zmm0,ymm1  
00007FF6AA2E1B97  vmovups     zmmword ptr [frac_scalar],zmm0  
00007FF6AA2E1B9E  xor         ebx,ebx  
00007FF6AA2E1BA0  vmovss      xmm1,dword ptr [rbp+rbx*4+80h]  
00007FF6AA2E1BA9  vcvtss2sd   xmm1,xmm1,xmm1  
00007FF6AA2E1BAD  vmovq       rdx,xmm1  
00007FF6AA2E1BB2  lea         rcx,[string "%f\n" (07FF6AA2E42C0h)]  
00007FF6AA2E1BB9  vzeroupper  
00007FF6AA2E1BBC  call        printf (07FF6AA2E2C60h)  
00007FF6AA2E1BC1  inc         rbx  
00007FF6AA2E1BC4  cmp         rbx,10h  
00007FF6AA2E1BC8  jl          main+0BA0h (07FF6AA2E1BA0h)  
00007FF6AA2E1BCA  vmovdqu     ymm3,ymmword ptr [x1_accumh]  
00007FF6AA2E1BCF  mov         r9d,dword ptr [rbp]  

                __m256 x1_accum = _mm256_setzero_ps();
                __m256 x2_accum = _mm256_setzero_ps();

                __m256h x1_accumh = _mm256_setzero_ph();

                for (int t = 0; t < taps_samp[ii] + 1; t++)
00007FF6AA2E1BD3  mov         eax,dword ptr taps_samp[rsi*4]  
00007FF6AA2E1BDA  inc         edi  
00007FF6AA2E1BDC  inc         eax  
00007FF6AA2E1BDE  cmp         edi,eax  
00007FF6AA2E1BE0  jl          main+0B50h (07FF6AA2E1B50h)  
00007FF6AA2E1BE6  mov         r14,qword ptr [filtered_data]  
                        }

                    }
                }

Here is the optimized version without print that doesn't work:


            {
                __m128i* startPtr = (__m128i* ) (linedata+ (ii - taps_samp[ii] / 2) * sampleSize);
00007FF64FE11B2B  mov         ecx,r8d  
00007FF64FE11B2E  nop  
                {

                    if (!usefp16)
                    {

                    }
                    else //fp16 code path
                    {

                        //load into an XMM register and then broadcast 
                        __m128i b1 = _mm_loadu_si16((barrayh + count++));
00007FF64FE11B30  movzx       eax,word ptr [r13+rdi*2]  
00007FF64FE11B36  inc         rdi  

                        //take lowest 16 bit value and broadcast
                        __m256h bh = _mm256_broadcastw_epi16(b1);

                        //load all 16 channels of int16 RF data
                        __m256i x1 = _mm256_lddqu_si256(((__m256i*)startPtr) + t);
00007FF64FE11B39  vlddqu      ymm1,ymmword ptr [rdx]  
00007FF64FE11B3D  lea         rdx,[rdx+20h]  
00007FF64FE11B41  vmovd       xmm0,eax  
00007FF64FE11B45  vpbroadcastw ymm2,xmm0  

                        //convert all 16 channels to fp16
                        __m256h x1hp = _mm256_cvtepi16_ph(x1);
00007FF64FE11B4A  vcvtw2ph    ymm0,ymm1  

                        //multiply the fp16 RF data by the fp16 b filter coefficient and store into x1_accumh                   
                        x1_accumh = _mm256_fmadd_ph(x1hp, bh, x1_accumh);
00007FF64FE11B50  vfmadd132ph ymm3,ymm2,ymm0  \\ymm3 =accumulator, ymm2 = b coefficients, ymm0 = sample data, so this does accumulator *sample data + b coefficient

00007FF64FE11B56  sub         rcx,1  
00007FF64FE11B5A  jne         main+0B30h (07FF64FE11B30h)  
00007FF64FE11B5C  vmovdqu     ymmword ptr [x1_accumh],ymm3  

                        /*if (ii == 0 && i == 0 && t == 0)
                        {
                            print256_f16(x1_accumh);        //uncommenting this fixes decoding???????
                        }*/

                    }

I'm very new to x86 assembly, but the key difference appears to be the change of

vfmadd213ph ymm1,ymm2,ymm3      //ymm1 = rf samples, ymm2 = b coefficient, ymm3 = accumulator, so this does rf samples * b coefficients + accumulator

vfmadd132ph ymm3,ymm2,ymm0  \\ymm3 =accumulator, ymm2 = b coefficients, ymm0 = sample data, so this does accumulator *sample data + b coefficient

It looks like the compiler changes vfmadd213ph to vfmadd132ph but doesn't reorder the arguments (b stays in ymm2), so instead of multiplying the FIR tap coefficient times the sample and adding to the accumulator, it multiplies the accumulator times the sample data and then adds it to the FIR tap coefficient. The result is Inf because that value becomes enormous very quickly.

Question: Am I completely misunderstanding how AVX programming works and making some trivial code error? Is there anything I can do to fix this? This is my first AVX512 program ever, so I may be doing something very foolish.

Solution

The bug was confirmed by Microsoft and then corrected in VS 17.7.0, which can compile the above code correctly.