I'm trying to build an optimized right-hand matrix multiplication using ARM NEON. This is my current implementation:
// Transforms `input` by `matrix`, writing the result to `output`, using NEON
// registers viewed through references into the glm storage.
//
// NOTE(review): `result_local` is a reference straight into `output`'s memory,
// so every assignment below is a store; since the compiler cannot prove that
// `output` does not alias `input` or `matrix`, it reloads `input_local` after
// each store (the repeated `ldr q1, [x1]` before every fmla in the listing
// below). reinterpret_cast between glm types and float32x4_t is also a
// strict-aliasing violation -- vld1q_f32/vst1q_f32 would be the safe form.
//
// NOTE(review): each step multiplies a whole matrix column elementwise by the
// whole input vector; a true mat4*vec4 product would broadcast a single input
// lane per column (vmulq_laneq_f32 / vfmaq_laneq_f32) -- confirm the intended
// math.
void transform ( glm::mat4 const & matrix, glm::vec4 const & input, glm::vec4 & output )
{
// Register views over the caller's memory -- reads/writes here are loads/stores.
float32x4_t & result_local = reinterpret_cast < float32x4_t & > (*(&output[0]));
float32x4_t const & input_local = reinterpret_cast < float32x4_t const & > (*(&input[0] ));
result_local = vmulq_f32 ( reinterpret_cast < float32x4_t const & > ( matrix[ 0 ] ), input_local );
result_local = vmlaq_f32 ( result_local, reinterpret_cast < float32x4_t const & > ( matrix[ 1 ] ), input_local );
result_local = vmlaq_f32 ( result_local, reinterpret_cast < float32x4_t const & > ( matrix[ 2 ] ), input_local );
result_local = vmlaq_f32 ( result_local, reinterpret_cast < float32x4_t const & > ( matrix[ 3 ] ), input_local );
}
The compiler (gcc) does produce NEON instructions; however, the input vector (whose address is in x1) is reloaded into q1 before every fmla instruction:
0x0000000000400a78 <+0>: ldr q1, [x1]
0x0000000000400a7c <+4>: ldr q0, [x0]
0x0000000000400a80 <+8>: fmul v0.4s, v0.4s, v1.4s
0x0000000000400a84 <+12>: str q0, [x2]
0x0000000000400a88 <+16>: ldr q2, [x0,#16]
0x0000000000400a8c <+20>: ldr q1, [x1]
0x0000000000400a90 <+24>: fmla v0.4s, v2.4s, v1.4s
0x0000000000400a94 <+28>: str q0, [x2]
0x0000000000400a98 <+32>: ldr q2, [x0,#32]
0x0000000000400a9c <+36>: ldr q1, [x1]
0x0000000000400aa0 <+40>: fmla v0.4s, v2.4s, v1.4s
0x0000000000400aa4 <+44>: str q0, [x2]
0x0000000000400aa8 <+48>: ldr q2, [x0,#48]
0x0000000000400aac <+52>: ldr q1, [x1]
0x0000000000400ab0 <+56>: fmla v0.4s, v2.4s, v1.4s
0x0000000000400ab4 <+60>: str q0, [x2]
0x0000000000400ab8 <+64>: ret
Is it possible to avoid these redundant reloads as well?
Compiler is gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu with O2 option.
Regards
Edit: Removing the reference on input_local did the trick:
0x0000000000400af0 <+0>: ldr q1, [x1]
0x0000000000400af4 <+4>: ldr q0, [x0]
0x0000000000400af8 <+8>: fmul v0.4s, v1.4s, v0.4s
0x0000000000400afc <+12>: str q0, [x2]
0x0000000000400b00 <+16>: ldr q2, [x0,#16]
0x0000000000400b04 <+20>: fmla v0.4s, v1.4s, v2.4s
0x0000000000400b08 <+24>: str q0, [x2]
0x0000000000400b0c <+28>: ldr q2, [x0,#32]
0x0000000000400b10 <+32>: fmla v0.4s, v1.4s, v2.4s
0x0000000000400b14 <+36>: str q0, [x2]
0x0000000000400b18 <+40>: ldr q2, [x0,#48]
0x0000000000400b1c <+44>: fmla v0.4s, v1.4s, v2.4s
0x0000000000400b20 <+48>: str q0, [x2]
0x0000000000400b24 <+52>: ret
Edit 2: That's the best result I have obtained so far:
0x0000000000400ea0 <+0>: ldr q1, [x1]
0x0000000000400ea4 <+4>: ldr q0, [x0,#16]
0x0000000000400ea8 <+8>: ldr q4, [x0]
0x0000000000400eac <+12>: ldr q3, [x0,#32]
0x0000000000400eb0 <+16>: fmul v0.4s, v0.4s, v1.4s
0x0000000000400eb4 <+20>: ldr q2, [x0,#48]
0x0000000000400eb8 <+24>: fmla v0.4s, v4.4s, v1.4s
0x0000000000400ebc <+28>: fmla v0.4s, v3.4s, v1.4s
0x0000000000400ec0 <+32>: fmla v0.4s, v2.4s, v1.4s
0x0000000000400ec4 <+36>: str q0, [x2]
0x0000000000400ec8 <+40>: ret
According to perf, the ldr instructions still account for a large share of the overhead.
You are operating directly on pointers (on a call-by-reference basis). When you operate through pointers, you should be aware that you are completely at the compiler's mercy — and compilers for ARM aren't exactly the best at this.
There might be compiler options dealing with this, or even compilers doing the needed optimizations out of the box, but your best bet is doing it manually:
The process above is also valid for non-neon computations. The compiler almost always gets seriously crippled by the slightest hints on (automatic) memory operations.
Remember, local variables are your best friends. And ALWAYS do the memory load/store manually.
compiler: Android clang 8.0.2, -O2
// Computes output = matrix * input (right-hand multiplication of a
// column-major 4x4 matrix by a 4-vector) with explicit NEON loads and stores,
// so every operand lives in a register and nothing is re-read from memory
// after a store.
//
// matrix: 16 floats, column-major (column c occupies matrix[c*4 .. c*4+3])
// input:  4 floats
// output: 4 floats
void transform(const float *matrix, const float *input, float *output)
{
    // Load each operand exactly once -- no aliasing-induced reloads.
    const float32x4_t in   = vld1q_f32(input);
    const float32x4_t col0 = vld1q_f32(&matrix[0 * 4]);
    const float32x4_t col1 = vld1q_f32(&matrix[1 * 4]);
    const float32x4_t col2 = vld1q_f32(&matrix[2 * 4]);
    const float32x4_t col3 = vld1q_f32(&matrix[3 * 4]);

    // result = col0*in.x + col1*in.y + col2*in.z + col3*in.w
    // The *_laneq intrinsics broadcast a single lane of `in` per column.
    // (The previous plain vmulq/vmlaq calls multiplied each column by the
    // whole vector, yielding (col0+col1+col2+col3) ⊙ in -- an elementwise
    // sum, not a matrix-vector product.)  vfmaq maps to the fused fmla
    // instruction on AArch64.
    float32x4_t result = vmulq_laneq_f32(col0, in, 0);
    result = vfmaq_laneq_f32(result, col1, in, 1);
    result = vfmaq_laneq_f32(result, col2, in, 2);
    result = vfmaq_laneq_f32(result, col3, in, 3);

    // Single explicit store of the finished result.
    vst1q_f32(output, result);
}
; void __fastcall transform(const float *matrix, const float *input, float *output)
EXPORT _Z9transformPKfS0_Pf
_Z9transformPKfS0_Pf
matrix = X0 ; const float *
input = X1 ; const float *
output = X2 ; float *
; __unwind {
LDR Q0, [input]
LDP Q1, Q2, [matrix]
LDP Q3, Q4, [matrix,#0x20]
FMUL V1.4S, V0.4S, V1.4S
FMUL V2.4S, V0.4S, V2.4S
FMUL V3.4S, V0.4S, V3.4S
FADD V1.4S, V1.4S, V2.4S
FADD V1.4S, V3.4S, V1.4S
FMUL V0.4S, V0.4S, V4.4S
FADD V0.4S, V0.4S, V1.4S
STR Q0, [output]
RET
; } // starts at 4
As you can see, Android clang 8.0.2 is quite an improvement over previous versions when it comes to NEON code: the compiler finally generates paired-register loads (LDP). Why it prefers separate FMUL/FADD over FMLA is beyond me, though.