Tags: c++, x86, simd, intrinsics, avx

Fast interleave 2 double arrays into an array of structs with 2 float and 1 int (loop invariant) member, with SIMD double->float conversion?


I have a section of code that is a bottleneck in a C++ application running on x86 processors, where we take double values from two arrays, cast them to float, and store them in an array of structs. It is a bottleneck because it is called either with very large loops or thousands of times.

Is there a faster way to do this copy & cast operation using SIMD intrinsics? I have seen this answer on faster memcpy, but it doesn't address the cast.

The simple C++ loop case looks like this:

        int _iNum;
        const unsigned int _uiDefaultOffset; // a constant 

        double * pInputValues1; // array of double values, count = _iNum;
        double * pInputValues2; 

        MyStruct * pOutput;    // array of outputs defined as
        // struct MyStruct 
        // { 
        //    float O1;
        //    float O2;
        //    unsigned int Offset;
        // };

        for (int i = 0; i < _iNum; ++i)
        {
            pOutput[i].O1 = static_cast<float>(pInputValues1[i]);
            pOutput[i].O2 = static_cast<float>(pInputValues2[i]);
            pOutput[i].Offset = _uiDefaultOffset;
        }

Note: The struct format is [Float, Float, Int] (12 bytes), but we could (if it will help performance) add an extra 4 bytes of padding, making it 16 bytes.
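
For reference, a padded variant would look something like the sketch below (the name of the padding member is just illustrative):

        struct MyStructPadded      // hypothetical padded variant, 16 bytes
        {
            float O1;
            float O2;
            unsigned int Offset;
            unsigned int Padding;  // illustrative name for the extra 4 bytes
        };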


Solution

  • Loosely inspired by Intel's 4x3 transposition example and based on @PeterCordes' solution, here is an AVX1 solution, which should achieve a throughput of 8 structs per 8 cycles (the bottleneck is still p5):

    #include <immintrin.h>
    #include <stddef.h>
    
    struct f2u { 
      float O1, O2;
      unsigned int Offset;
    };
    static const unsigned uiDefaultOffset = 123;
    
    void cvt_interleave_avx(f2u *__restrict dst, double *__restrict pA, double *__restrict pB, ptrdiff_t len)
    {
        __m256 voffset = _mm256_castsi256_ps(_mm256_set1_epi32(uiDefaultOffset));
    
        // 8 structs per iteration
        ptrdiff_t i=0;
        for(; i<len-7; i+=8)
        {
            // destination address for next 8 structs as float*:
            float* dst_f = reinterpret_cast<float*>(dst + i);
    
            // 4*vcvtpd2ps    --->  4*(p1,p5,p23)
            __m128 inA3210 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pA[i]));
            __m128 inB3210 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pB[i]));
            __m128 inA7654 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pA[i+4]));
            __m128 inB7654 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pB[i+4]));
    
            // 2*vinsertf128  --->  2*p5
            __m256 A76543210 = _mm256_set_m128(inA7654,inA3210);
            __m256 B76543210 = _mm256_set_m128(inB7654,inB3210);
    
            // 2*vpermilps    --->  2*p5
            __m256 A56741230 = _mm256_shuffle_ps(A76543210,A76543210,_MM_SHUFFLE(1,2,3,0));
            __m256 B67452301 = _mm256_shuffle_ps(B76543210,B76543210,_MM_SHUFFLE(2,3,0,1));
    
            // 6*vblendps     ---> 6*p015 (does not need to use p5)
            __m256 outA1__B0A0 = _mm256_blend_ps(A56741230,B67452301,2+16*2);
            __m256 outA1ccB0A0 = _mm256_blend_ps(outA1__B0A0,voffset,4+16*4);
    
            __m256 outB2A2__B1 = _mm256_blend_ps(B67452301,A56741230,4+16*4);
            __m256 outB2A2ccB1 = _mm256_blend_ps(outB2A2__B1,voffset,2+16*2);
    
            __m256 outccB3__cc = _mm256_blend_ps(voffset,B67452301,4+16*4);
            __m256 outccB3A3cc = _mm256_blend_ps(outccB3__cc,A56741230,2+16*2);
    
            // 3* vmovups     ---> 3*(p237,p4)
            _mm_storeu_ps(dst_f+ 0,_mm256_castps256_ps128(outA1ccB0A0));
            _mm_storeu_ps(dst_f+ 4,_mm256_castps256_ps128(outB2A2ccB1));
            _mm_storeu_ps(dst_f+ 8,_mm256_castps256_ps128(outccB3A3cc));
            // 3*vextractf128 ---> 3*(p23,p4)
            _mm_storeu_ps(dst_f+12,_mm256_extractf128_ps(outA1ccB0A0,1));
            _mm_storeu_ps(dst_f+16,_mm256_extractf128_ps(outB2A2ccB1,1));
            _mm_storeu_ps(dst_f+20,_mm256_extractf128_ps(outccB3A3cc,1));
        }
    
        // scalar cleanup if len is not a multiple of 8
        for (; i < len; i++)
        {
            dst[i].O1 = static_cast<float>(pA[i]);
            dst[i].O2 = static_cast<float>(pB[i]);
            dst[i].Offset = uiDefaultOffset;
        }
    }
    

    Godbolt link, with minimal test-code at the end: https://godbolt.org/z/0kTO2b

    For some reason, gcc does not like to generate vcvtpd2ps with a memory source operand (converting directly from memory into a register). This might work better with aligned loads (and having input and output aligned is likely beneficial anyway). And clang apparently wants to outsmart me with one of the vextractf128 instructions at the end.
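
    As a side note: if the extra 4 bytes of padding mentioned in the question are acceptable (16 bytes per struct), each struct occupies exactly one 128-bit lane and the interleave reduces to in-lane unpacks and blends, at the cost of writing 16 instead of 12 bytes per struct. Below is an untested sketch of that idea, handling 4 structs per iteration with SSE4.1 blendps; the f2u_pad struct and its pad member are names I made up for illustration, and I have not benchmarked it against the 12-byte version above.

    // reuses <immintrin.h>, <stddef.h> and uiDefaultOffset from the snippet above
    struct f2u_pad {                 // hypothetical padded layout, sizeof == 16
        float O1, O2;
        unsigned int Offset;
        unsigned int pad;            // extra 4 bytes so one struct == one XMM store
    };

    void cvt_interleave_padded(f2u_pad *__restrict dst, double *__restrict pA,
                               double *__restrict pB, ptrdiff_t len)
    {
        __m128 voffset = _mm_castsi128_ps(_mm_set1_epi32(uiDefaultOffset));

        // 4 structs (64 bytes) per iteration
        ptrdiff_t i = 0;
        for (; i < len - 3; i += 4)
        {
            // destination address for the next 4 structs as float*:
            float* dst_f = reinterpret_cast<float*>(dst + i);

            // 2*vcvtpd2ps: a3 a2 a1 a0 and b3 b2 b1 b0
            __m128 a3210 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pA[i]));
            __m128 b3210 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pB[i]));

            // interleave pairs: lo = b1 a1 b0 a0, hi = b3 a3 b2 a2
            __m128 lo = _mm_unpacklo_ps(a3210, b3210);
            __m128 hi = _mm_unpackhi_ps(a3210, b3210);

            // blend the offset into the top two elements of each pair
            // and store one 16-byte struct per vmovups
            _mm_storeu_ps(dst_f +  0, _mm_blend_ps(lo, voffset, 0b1100));
            _mm_storeu_ps(dst_f +  4, _mm_blend_ps(_mm_movehl_ps(lo, lo), voffset, 0b1100));
            _mm_storeu_ps(dst_f +  8, _mm_blend_ps(hi, voffset, 0b1100));
            _mm_storeu_ps(dst_f + 12, _mm_blend_ps(_mm_movehl_ps(hi, hi), voffset, 0b1100));
        }

        // scalar cleanup if len is not a multiple of 4
        for (; i < len; i++)
        {
            dst[i].O1 = static_cast<float>(pA[i]);
            dst[i].O2 = static_cast<float>(pB[i]);
            dst[i].Offset = uiDefaultOffset;
        }
    }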