Tags: c++, gcc, neon, arm64

ARM NEON optimization - getting rid of superfluous loads


I'm trying to build an optimized right-hand matrix-vector multiplication using ARM NEON. This is my current attempt:

void transform ( glm::mat4 const & matrix, glm::vec4 const & input, glm::vec4 & output )
{
   float32x4_t &       result_local = reinterpret_cast < float32x4_t & > (*(&output[0]));
   float32x4_t const & input_local  = reinterpret_cast < float32x4_t const & > (*(&input[0] ));

   result_local = vmulq_f32 (               reinterpret_cast < float32x4_t const & > ( matrix[ 0 ] ), input_local );
   result_local = vmlaq_f32 ( result_local, reinterpret_cast < float32x4_t const & > ( matrix[ 1 ] ), input_local );
   result_local = vmlaq_f32 ( result_local, reinterpret_cast < float32x4_t const & > ( matrix[ 2 ] ), input_local );
   result_local = vmlaq_f32 ( result_local, reinterpret_cast < float32x4_t const & > ( matrix[ 3 ] ), input_local );
}

The compiler (gcc) does produce NEON instructions; however, the input vector (whose address is passed in x1) is reloaded into q1 for every fmla, and the running result is stored back to [x2] after every step:

0x0000000000400a78 <+0>:    ldr q1, [x1]
0x0000000000400a7c <+4>:    ldr q0, [x0]
0x0000000000400a80 <+8>:    fmul    v0.4s, v0.4s, v1.4s
0x0000000000400a84 <+12>:   str q0, [x2]
0x0000000000400a88 <+16>:   ldr q2, [x0,#16]
0x0000000000400a8c <+20>:   ldr q1, [x1]
0x0000000000400a90 <+24>:   fmla    v0.4s, v2.4s, v1.4s
0x0000000000400a94 <+28>:   str q0, [x2]
0x0000000000400a98 <+32>:   ldr q2, [x0,#32]
0x0000000000400a9c <+36>:   ldr q1, [x1]
0x0000000000400aa0 <+40>:   fmla    v0.4s, v2.4s, v1.4s
0x0000000000400aa4 <+44>:   str q0, [x2]
0x0000000000400aa8 <+48>:   ldr q2, [x0,#48]
0x0000000000400aac <+52>:   ldr q1, [x1]
0x0000000000400ab0 <+56>:   fmla    v0.4s, v2.4s, v1.4s
0x0000000000400ab4 <+60>:   str q0, [x2]
0x0000000000400ab8 <+64>:   ret

Is it possible to get rid of these superfluous loads?

The compiler is gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu with the -O2 option.

Regards

Edit: Removing the reference from input_local did the trick:
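For illustration, the only change is dropping the & so that input_local becomes a plain local copy instead of a reference into the caller's memory (only the changed line is shown; the rest of the function is as above):

   float32x4_t const input_local = reinterpret_cast < float32x4_t const & > (*(&input[0] ));

The compiler now loads the input vector into q1 only once: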

0x0000000000400af0 <+0>:    ldr q1, [x1]
0x0000000000400af4 <+4>:    ldr q0, [x0]
0x0000000000400af8 <+8>:    fmul    v0.4s, v1.4s, v0.4s
0x0000000000400afc <+12>:   str q0, [x2]
0x0000000000400b00 <+16>:   ldr q2, [x0,#16]
0x0000000000400b04 <+20>:   fmla    v0.4s, v1.4s, v2.4s
0x0000000000400b08 <+24>:   str q0, [x2]
0x0000000000400b0c <+28>:   ldr q2, [x0,#32]
0x0000000000400b10 <+32>:   fmla    v0.4s, v1.4s, v2.4s
0x0000000000400b14 <+36>:   str q0, [x2]
0x0000000000400b18 <+40>:   ldr q2, [x0,#48]
0x0000000000400b1c <+44>:   fmla    v0.4s, v1.4s, v2.4s
0x0000000000400b20 <+48>:   str q0, [x2]
0x0000000000400b24 <+52>:   ret

Edit 2: That's the most I've obtained for now.
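The code now keeps the result in a local value as well and stores it once at the end; a sketch (the exact code may have differed slightly):

void transform ( glm::mat4 const & matrix, glm::vec4 const & input, glm::vec4 & output )
{
   float32x4_t const input_local = reinterpret_cast < float32x4_t const & > ( input );

   // keep the running sum in a local so it lives in a register, not in output's memory
   float32x4_t result = vmulq_f32 ( reinterpret_cast < float32x4_t const & > ( matrix[ 0 ] ), input_local );
   result = vmlaq_f32 ( result, reinterpret_cast < float32x4_t const & > ( matrix[ 1 ] ), input_local );
   result = vmlaq_f32 ( result, reinterpret_cast < float32x4_t const & > ( matrix[ 2 ] ), input_local );
   result = vmlaq_f32 ( result, reinterpret_cast < float32x4_t const & > ( matrix[ 3 ] ), input_local );

   // single store at the end
   reinterpret_cast < float32x4_t & > ( output ) = result;
}

This compiles to: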

0x0000000000400ea0 <+0>:    ldr q1, [x1]
0x0000000000400ea4 <+4>:    ldr q0, [x0,#16]
0x0000000000400ea8 <+8>:    ldr q4, [x0]
0x0000000000400eac <+12>:   ldr q3, [x0,#32]
0x0000000000400eb0 <+16>:   fmul    v0.4s, v0.4s, v1.4s
0x0000000000400eb4 <+20>:   ldr q2, [x0,#48] 
0x0000000000400eb8 <+24>:   fmla    v0.4s, v4.4s, v1.4s
0x0000000000400ebc <+28>:   fmla    v0.4s, v3.4s, v1.4s
0x0000000000400ec0 <+32>:   fmla    v0.4s, v2.4s, v1.4s
0x0000000000400ec4 <+36>:   str q0, [x2]
0x0000000000400ec8 <+40>:   ret

According to perf, there still seems to be considerable overhead in the ldr instructions.


Solution

  • You are operating directly on pointers (the pass-by-reference parameters boil down to pointers). When you operate on pointers, you are completely at the compiler's mercy. And compilers for ARM aren't exactly the best.

    There might be compiler options dealing with this, or even compilers that do the needed optimization out of the box, but your best bet is to do it manually:

    • declare local vectors (without &)
    • load the values from the pointer into corresponding vectors (preferably the whole matrix plus the vector)
    • do the math with the vectors
    • store the vectors to the pointer

    The process above is also valid for non-NEON computations. The compiler almost always gets seriously crippled by the slightest hint of (automatic) memory access.

    Remember, local variables are your best friends. And ALWAYS do the memory load/store manually.


    compiler: Android clang 8.0.2, -O2

    #include <arm_neon.h>

    void transform(const float *matrix, const float *input, float *output)
    {
        // load everything into local vectors first
        const float32x4_t input_local = vld1q_f32(input);
        const float32x4_t row0 = vld1q_f32(&matrix[0*4]);
        const float32x4_t row1 = vld1q_f32(&matrix[1*4]);
        const float32x4_t row2 = vld1q_f32(&matrix[2*4]);
        const float32x4_t row3 = vld1q_f32(&matrix[3*4]);

        // do the math purely on registers
        float32x4_t rslt;
        rslt = vmulq_f32(row0, input_local);
        rslt = vmlaq_f32(rslt, row1, input_local);
        rslt = vmlaq_f32(rslt, row2, input_local);
        rslt = vmlaq_f32(rslt, row3, input_local);

        // single store at the end
        vst1q_f32(output, rslt);
    }
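    A minimal usage sketch (my own example, not part of the original answer), assuming the transform above is in the same file and that the matrix is laid out as 16 consecutive floats:

    #include <stdio.h>

    int main(void)
    {
        /* identity test matrix: with the sum-of-rows scheme above,
           the output should simply equal the input */
        const float matrix[16] = { 1, 0, 0, 0,
                                   0, 1, 0, 0,
                                   0, 0, 1, 0,
                                   0, 0, 0, 1 };
        const float input[4]  = { 1.0f, 2.0f, 3.0f, 4.0f };
        float output[4];

        transform(matrix, input, output);
        printf("%f %f %f %f\n", output[0], output[1], output[2], output[3]);
        return 0;
    }

    The disassembly of transform itself: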
    

    ; void __fastcall transform(const float *matrix, const float *input, float *output)
    EXPORT _Z9transformPKfS0_Pf
    _Z9transformPKfS0_Pf
    matrix = X0             ; const float *
    input = X1              ; const float *
    output = X2             ; float *
    ; __unwind {
    LDR             Q0, [input]
    LDP             Q1, Q2, [matrix]
    LDP             Q3, Q4, [matrix,#0x20]
    FMUL            V1.4S, V0.4S, V1.4S
    FMUL            V2.4S, V0.4S, V2.4S
    FMUL            V3.4S, V0.4S, V3.4S
    FADD            V1.4S, V1.4S, V2.4S
    FADD            V1.4S, V3.4S, V1.4S
    FMUL            V0.4S, V0.4S, V4.4S
    FADD            V0.4S, V0.4S, V1.4S
    STR             Q0, [output]
    RET
    ; } // starts at 4
    

    As you can see, Android clang 8.0.2 is quite an improvement over previous versions when it comes to NEON code. The compiler finally generates code that loads multiple registers at once (LDP). Why it doesn't like FMLA is beyond me, though.
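    One possible explanation (an assumption on my part, not verified against this toolchain): vmlaq_f32 is the chained multiply-accumulate, where the multiply and the add are rounded separately, while AArch64 only offers the fused FMLA, so clang plays it safe and emits a separate FMUL/FADD pair. Switching to the fused intrinsic vfmaq_f32 should map straight to FMLA. A sketch of the changed accumulation only:

    /* vfmaq_f32(a, b, c) computes a + (b * c) with a single rounding step */
    float32x4_t rslt;
    rslt = vmulq_f32(row0, input_local);
    rslt = vfmaq_f32(rslt, row1, input_local);
    rslt = vfmaq_f32(rslt, row2, input_local);
    rslt = vfmaq_f32(rslt, row3, input_local);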