Tags: optimization, arm, simd, neon

NEON: How do I get my SoA 4x quaternion-to-matrix result out to an array of non-interleaved 4x4 matrices?


I am still learning the best ways to work with NEON, and here is my problem. I have a quaternion-to-matrix operation that needs to run over an array of quaternions and then add translation to produce a 4x4 matrix. The data is arranged as an SoA, and I have written the following using intrinsics; it operates on 4 quaternions at a time:

// two constants
float32x4_t one = { 1.f, 1.f, 1.f, 1.f };
float32x4_t two = { 2.f, 2.f, 2.f, 2.f };

// load the data 4 quaternions wide
float32x4_t rot_x = vld1q_f32(data->rotation_x); // load 4 quaternions' worth of x's
float32x4_t rot_y = vld1q_f32(data->rotation_y); // load 4 quaternions' worth of y's
float32x4_t rot_z = vld1q_f32(data->rotation_z); // load 4 quaternions' worth of z's
float32x4_t rot_w = vld1q_f32(data->rotation_w); // load 4 quaternions' worth of w's


float32x4_t qxx2 = vmulq_f32( vmulq_f32( rot_x, rot_x ), two );
float32x4_t qyy2 = vmulq_f32( vmulq_f32( rot_y, rot_y ), two );
float32x4_t qzz2 = vmulq_f32( vmulq_f32( rot_z, rot_z ), two );
float32x4_t qxy2 = vmulq_f32( vmulq_f32( rot_x, rot_y ), two );
float32x4_t qxz2 = vmulq_f32( vmulq_f32( rot_x, rot_z ), two );
float32x4_t qyz2 = vmulq_f32( vmulq_f32( rot_y, rot_z ), two );
float32x4_t qxw2 = vmulq_f32( vmulq_f32( rot_x, rot_w ), two );
float32x4_t qyw2 = vmulq_f32( vmulq_f32( rot_y, rot_w ), two );
float32x4_t qzw2 = vmulq_f32( vmulq_f32( rot_z, rot_w ), two );


float32x4_t m11 = vsubq_f32( one, vaddq_f32( qyy2, qzz2 ) ); // 1 - 2(y^2 + z^2)
float32x4_t m21 = vsubq_f32( qxy2, qzw2 );
float32x4_t m31 = vaddq_f32( qxz2, qyw2 );

float32x4_t m12 = vaddq_f32( qxy2, qzw2 );
float32x4_t m22 = vsubq_f32( one, vaddq_f32( qxx2, qzz2 ) ); // 1 - 2(x^2 + z^2)
float32x4_t m32 = vsubq_f32( qyz2, qxw2 );

float32x4_t m13 = vsubq_f32( qxz2, qyw2 );
float32x4_t m23 = vaddq_f32( qyz2, qxw2 );
float32x4_t m33 = vsubq_f32( one, vaddq_f32( qxx2, qyy2 ) ); // 1 - 2(x^2 + y^2)

This gives me four 3x3 rotation matrices.

In the end, I need to create four 4x4 matrices with translation, where m14, m24, m34 are 0, the translation is stored in m41, m42, m43, and m44 is 1.f.

struct Matrix
{
  float m11, m12, m13, m14;
  float m21, m22, m23, m24;
  float m31, m32, m33, m34;
  float m41, m42, m43, m44;
};

But I don't know how to efficiently extract the data from the NEON registers. I have tried simply storing the registers to memory and then rearranging the data with scalar code, but the performance is obviously bad. I would also like to include the translation in the most efficient way possible, but loading a vector just to store it again doesn't seem like it helps.

Any insight would be helpful. What am I looking for here?


Solution

  • Not sure NEON has enough vector registers, but you can try something like this, untested:

    inline float32x4_t unpackLow( float32x4_t a, float32x4_t b )
    {
        float32x2_t x = vget_low_f32( a );
        float32x2_t y = vget_low_f32( b );
        return vcombine_f32( x, y );
    }
    
    inline float32x4_t unpackHigh( float32x4_t a, float32x4_t b )
    {
        float32x2_t x = vget_high_f32( a );
        float32x2_t y = vget_high_f32( b );
        return vcombine_f32( x, y );
    }
    
    const float32x4_t zero = vdupq_n_f32( 0 );
    const float32x4_t lastRow = vsetq_lane_f32( 1, zero, 3 );
    
    // Zip them pairwise
    const float32x4x2_t r11 = vzipq_f32( m11, m12 );
    const float32x4x2_t r12 = vzipq_f32( m13, zero );
    const float32x4x2_t r21 = vzipq_f32( m21, m22 );
    const float32x4x2_t r22 = vzipq_f32( m23, zero );
    const float32x4x2_t r31 = vzipq_f32( m31, m32 );
    const float32x4x2_t r32 = vzipq_f32( m33, zero );
    
    // Produce the matrices
    vst1q_f32( pointer, unpackLow( r11.val[ 0 ], r12.val[ 0 ] ) );
    vst1q_f32( pointer + 4, unpackLow( r21.val[ 0 ], r22.val[ 0 ] ) );
    vst1q_f32( pointer + 8, unpackLow( r31.val[ 0 ], r32.val[ 0 ] ) );
    vst1q_f32( pointer + 12, lastRow );
    
    vst1q_f32( pointer + 16, unpackHigh( r11.val[ 0 ], r12.val[ 0 ] ) );
    vst1q_f32( pointer + 20, unpackHigh( r21.val[ 0 ], r22.val[ 0 ] ) );
    vst1q_f32( pointer + 24, unpackHigh( r31.val[ 0 ], r32.val[ 0 ] ) );
    vst1q_f32( pointer + 28, lastRow );
    
    vst1q_f32( pointer + 32, unpackLow( r11.val[ 1 ], r12.val[ 1 ] ) );
    vst1q_f32( pointer + 36, unpackLow( r21.val[ 1 ], r22.val[ 1 ] ) );
    vst1q_f32( pointer + 40, unpackLow( r31.val[ 1 ], r32.val[ 1 ] ) );
    vst1q_f32( pointer + 44, lastRow );
    
    vst1q_f32( pointer + 48, unpackHigh( r11.val[ 1 ], r12.val[ 1 ] ) );
    vst1q_f32( pointer + 52, unpackHigh( r21.val[ 1 ], r22.val[ 1 ] ) );
    vst1q_f32( pointer + 56, unpackHigh( r31.val[ 1 ], r32.val[ 1 ] ) );
    vst1q_f32( pointer + 60, lastRow );
    

These vget_low / vget_high / vcombine intrinsics are relatively cheap; the assembly has names for the 64-bit halves of these registers, so they often compile to nothing or to a single register move.

If that won't work well due to register shortage, a second option is to ZIP just the central elements, vzipq_f32( m11, m12 ), vzipq_f32( m21, m22 ), vzipq_f32( m31, m32 ), then use vget_low / vget_high on the results, 64-bit vector stores (vst1_f32) to write the corresponding matrix elements in pairs, and vst1q_lane_f32 to store the individual scalars extracted from your m13, m23, m33 vectors.