Tags: c++, x86, simd, intrinsics, avx

Fast interleave 2 double arrays into an array of structs with 2 float and 1 int (loop invariant) member, with SIMD double->float conversion?


I have a section of code that is a bottleneck in a C++ application running on x86 processors, where we take double values from two arrays, cast them to float, and store them in an array of structs. It is a bottleneck because it is called either with very large loops or thousands of times.

Is there a faster way to do this copy & cast operation using SIMD intrinsics? I have seen this answer on faster memcpy, but it doesn't address the cast.

The simple C++ loop case looks like this:

        int _iNum;
        const unsigned int _uiDefaultOffset; // a constant 

        double * pInputValues1; // array of double values, count = _iNum;
        double * pInputValues2; 

        MyStruct * pOutput;    // array of outputs defined as
        // struct MyStruct 
        // { 
        //    float O1;
        //    float O2;
        //    unsigned int Offset;
        // };

        for (int i = 0; i < _iNum; ++i)
        {
            pOutput[i].O1 = static_cast<float>(pInputValues1[i]);
            pOutput[i].O2 = static_cast<float>(pInputValues2[i]);
            pOutput[i].Offset = _uiDefaultOffset;
        }

Note: The struct format is [Float, Float, Int] (12 bytes), but we could (if it will help performance) add an extra 4 bytes of padding, making it 16 bytes.
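
For reference, a padded variant would look something like the sketch below (the name of the padding member is just illustrative):

        struct MyStructPadded      // hypothetical padded variant, 16 bytes
        {
            float O1;
            float O2;
            unsigned int Offset;
            unsigned int Padding;  // illustrative name for the extra 4 bytes
        };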


Solution

  • Loosely inspired by Intel's 4x3 transposition example and based on @PeterCordes' solution, here is an AVX1 solution, which should achieve a throughput of 8 structs per 8 cycles (the bottleneck is still p5):

    #include <immintrin.h>
    #include <stddef.h>
    
    struct f2u { 
      float O1, O2;
      unsigned int Offset;
    };
    static const unsigned uiDefaultOffset = 123;
    
    void cvt_interleave_avx(f2u *__restrict dst, double *__restrict pA, double *__restrict pB, ptrdiff_t len)
    {
        __m256 voffset = _mm256_castsi256_ps(_mm256_set1_epi32(uiDefaultOffset));
    
        // 8 structs per iteration
        ptrdiff_t i=0;
        for(; i<len-7; i+=8)
        {
            // destination address for next 8 structs as float*:
            float* dst_f = reinterpret_cast<float*>(dst + i);
    
            // 4*vcvtpd2ps    --->  4*(p1,p5,p23)
            __m128 inA3210 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pA[i]));
            __m128 inB3210 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pB[i]));
            __m128 inA7654 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pA[i+4]));
            __m128 inB7654 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pB[i+4]));
    
            // 2*vinsertf128  --->  2*p5
            __m256 A76543210 = _mm256_set_m128(inA7654,inA3210);
            __m256 B76543210 = _mm256_set_m128(inB7654,inB3210);
    
            // 2*vpermilps    --->  2*p5
            __m256 A56741230 = _mm256_shuffle_ps(A76543210,A76543210,_MM_SHUFFLE(1,2,3,0));
            __m256 B67452301 = _mm256_shuffle_ps(B76543210,B76543210,_MM_SHUFFLE(2,3,0,1));
    
            // 6*vblendps     ---> 6*p015 (does not need to use p5)
            __m256 outA1__B0A0 = _mm256_blend_ps(A56741230,B67452301,2+16*2);
            __m256 outA1ccB0A0 = _mm256_blend_ps(outA1__B0A0,voffset,4+16*4);
    
            __m256 outB2A2__B1 = _mm256_blend_ps(B67452301,A56741230,4+16*4);
            __m256 outB2A2ccB1 = _mm256_blend_ps(outB2A2__B1,voffset,2+16*2);
    
            __m256 outccB3__cc = _mm256_blend_ps(voffset,B67452301,4+16*4);
            __m256 outccB3A3cc = _mm256_blend_ps(outccB3__cc,A56741230,2+16*2);
    
            // 3* vmovups     ---> 3*(p237,p4)
            _mm_storeu_ps(dst_f+ 0,_mm256_castps256_ps128(outA1ccB0A0));
            _mm_storeu_ps(dst_f+ 4,_mm256_castps256_ps128(outB2A2ccB1));
            _mm_storeu_ps(dst_f+ 8,_mm256_castps256_ps128(outccB3A3cc));
            // 3*vextractf128 ---> 3*(p23,p4)
            _mm_storeu_ps(dst_f+12,_mm256_extractf128_ps(outA1ccB0A0,1));
            _mm_storeu_ps(dst_f+16,_mm256_extractf128_ps(outB2A2ccB1,1));
            _mm_storeu_ps(dst_f+20,_mm256_extractf128_ps(outccB3A3cc,1));
        }
    
        // scalar cleanup if len is not a multiple of 8
        for (; i < len; i++)
        {
            dst[i].O1 = static_cast<float>(pA[i]);
            dst[i].O2 = static_cast<float>(pB[i]);
            dst[i].Offset = uiDefaultOffset;
        }
    }
    

    Godbolt link, with minimal test-code at the end: https://godbolt.org/z/0kTO2b

    For some reason, gcc does not like to generate vcvtpd2ps with a memory source operand (converting directly from memory into a register). This might work better with aligned loads (and having input and output aligned is likely beneficial anyway). And clang apparently wants to outsmart me with one of the vextractf128 instructions at the end.
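
    As a side note: if the extra 4 bytes of padding mentioned in the question are acceptable (16 bytes per struct), each struct occupies exactly one 128-bit lane and the interleave reduces to in-lane unpacks and blends, at the cost of writing 16 instead of 12 bytes per struct. Below is an untested sketch of that idea, handling 4 structs per iteration with SSE4.1 blendps; the f2u_pad struct and its pad member are names I made up for illustration, and I have not benchmarked it against the 12-byte version above.

    // reuses <immintrin.h>, <stddef.h> and uiDefaultOffset from the snippet above
    struct f2u_pad {                 // hypothetical padded layout, sizeof == 16
        float O1, O2;
        unsigned int Offset;
        unsigned int pad;            // extra 4 bytes so one struct == one XMM store
    };

    void cvt_interleave_padded(f2u_pad *__restrict dst, double *__restrict pA,
                               double *__restrict pB, ptrdiff_t len)
    {
        __m128 voffset = _mm_castsi128_ps(_mm_set1_epi32(uiDefaultOffset));

        // 4 structs (64 bytes) per iteration
        ptrdiff_t i = 0;
        for (; i < len - 3; i += 4)
        {
            // destination address for the next 4 structs as float*:
            float* dst_f = reinterpret_cast<float*>(dst + i);

            // 2*vcvtpd2ps: a3 a2 a1 a0 and b3 b2 b1 b0
            __m128 a3210 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pA[i]));
            __m128 b3210 = _mm256_cvtpd_ps(_mm256_loadu_pd(&pB[i]));

            // interleave pairs: lo = b1 a1 b0 a0, hi = b3 a3 b2 a2
            __m128 lo = _mm_unpacklo_ps(a3210, b3210);
            __m128 hi = _mm_unpackhi_ps(a3210, b3210);

            // blend the offset into the top two elements of each pair
            // and store one 16-byte struct per vmovups
            _mm_storeu_ps(dst_f +  0, _mm_blend_ps(lo, voffset, 0b1100));
            _mm_storeu_ps(dst_f +  4, _mm_blend_ps(_mm_movehl_ps(lo, lo), voffset, 0b1100));
            _mm_storeu_ps(dst_f +  8, _mm_blend_ps(hi, voffset, 0b1100));
            _mm_storeu_ps(dst_f + 12, _mm_blend_ps(_mm_movehl_ps(hi, hi), voffset, 0b1100));
        }

        // scalar cleanup if len is not a multiple of 4
        for (; i < len; i++)
        {
            dst[i].O1 = static_cast<float>(pA[i]);
            dst[i].O2 = static_cast<float>(pB[i]);
            dst[i].Offset = uiDefaultOffset;
        }
    }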