Search code examples
c assembly arm simd neon

Why (or why not) pass Neon intrinsic data types as function input/output parameters?


This is a small test I built. Here we have two scenarios:

  1. Scenario 1: Two functions (scenario1a and scenario1b) whose inputs and outputs are uint16_t*; each one loads from and stores to the Neon data type (uint16x8x4_t) internally.
  2. Scenario 2: The same functions as in Scenario 1 (here called scenario2a and scenario2b), but the inputs and outputs are uint16x8x4_t*, and the load and store are done in the main function.

(Below the c code I include the disassembly generated after compiling with -O3).

#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>

/* Scenario 1: both functions take plain uint16_t buffers as input and output. */
void scenario1a(uint16_t* resultArray, const uint16_t* X);
void scenario1b(uint16_t* resultArray, const uint16_t* X);

/* Scenario 2: both functions take pointers to Neon uint16x8x4_t aggregates. */
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp);

/*
 * Scenario 1a: load 32 uint16_t values from X as four 8-lane vectors, combine
 * each vector with the negated next vector (wrapping around after the last),
 * and store the 32 results to resultArray.
 *
 * resultArray: destination buffer, 32 elements are written.
 * X:           source buffer, 32 elements are read.
 */
void scenario1a(uint16_t* resultArray, const uint16_t* X) {
    /* All input is pulled into locals before anything is written back. */
    const uint16x8x4_t in = vld1q_u16_x4(X);
    uint16x8x4_t out;

    for (int i = 0; i < 4; ++i) {
        /* Lanes 2..7 of vector i, then lanes 0..1 of the next vector times -1
         * (a negation modulo 2^16, since the lanes are unsigned 16-bit). */
        out.val[i] = vextq_u16(in.val[i], vmulq_n_u16(in.val[(i + 1) & 3], -1), 2);
    }

    vst1q_u16_x4(resultArray, out);
}

/*
 * Scenario 1b: load 32 uint16_t values from X as four 8-lane vectors and
 * process them as two (even, odd) pairs: the even output is the lane-wise sum
 * of the pair, the odd output is the odd input multiplied by -2.
 *
 * resultArray: destination buffer, 32 elements are written.
 * X:           source buffer, 32 elements are read.
 */
void scenario1b(uint16_t* resultArray, const uint16_t* X) {
    /* All input is pulled into locals before anything is written back. */
    const uint16x8x4_t in = vld1q_u16_x4(X);
    uint16x8x4_t out;

    for (int pair = 0; pair < 4; pair += 2) {
        out.val[pair] = vaddq_u16(in.val[pair], in.val[pair + 1]);
        /* -2 is converted to the unsigned 16-bit scalar, so this wraps modulo 2^16. */
        out.val[pair + 1] = vmulq_n_u16(in.val[pair + 1], -2);
    }

    vst1q_u16_x4(resultArray, out);
}

/*
 * Scenario 2a: same lane arithmetic as scenario1a, but operating directly on
 * Neon aggregates passed by pointer instead of on uint16_t buffers.
 *
 * For each i in 0..3, result->val[i] receives lanes 2..7 of temp->val[i]
 * followed by lanes 0..1 of temp->val[(i + 1) % 4] multiplied by -1.
 *
 * result: destination aggregate, written one val[] element per statement.
 * temp:   source aggregate, read through the pointer in every statement.
 *
 * NOTE(review): neither pointer is restrict-qualified, so the statement order
 * is observable if both refer to the same object: the last statement would
 * then read the val[0] that the first statement has already overwritten.
 */
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp) {

    result->val[0] = vextq_u16(temp->val[0], vmulq_n_u16(temp->val[1], -1), 2);
    result->val[1] = vextq_u16(temp->val[1], vmulq_n_u16(temp->val[2], -1), 2);
    result->val[2] = vextq_u16(temp->val[2], vmulq_n_u16(temp->val[3], -1), 2);
    /* Wraps around: reads temp->val[0] again after result->val[0] was stored. */
    result->val[3] = vextq_u16(temp->val[3], vmulq_n_u16(temp->val[0], -1), 2);
}

/*
 * Scenario 2b: same lane arithmetic as scenario1b, but operating directly on
 * Neon aggregates passed by pointer instead of on uint16_t buffers.
 *
 * val[0] and val[2] of the result are lane-wise sums of the (0,1) and (2,3)
 * input pairs; val[1] and val[3] are the odd inputs multiplied by -2.
 *
 * result: destination aggregate, written one val[] element per statement.
 * temp:   source aggregate, read through the pointer in every statement.
 */
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp) {

    /* Each statement only reads val[] indices at or above the one it writes,
     * so no statement reads an index that an earlier statement has written. */
    result->val[0] = vaddq_u16(temp->val[0], temp->val[1]);
    result->val[1] = vmulq_n_u16(temp->val[1], -2);
    result->val[2] = vaddq_u16(temp->val[2], temp->val[3]);
    result->val[3] = vmulq_n_u16(temp->val[3], -2);
}

/*
 * Test driver: runs the same two-stage computation on one 32-element input,
 * once through the uint16_t* interface (scenario 1) and once through the
 * uint16x8x4_t* interface (scenario 2). The results are not printed or
 * compared; the program exists to be disassembled.
 */
int main(void) {
    uint16_t input[32] = {15,3,1,85,44,156,32,97,3,54,97,17,0,55,9,17,163,23,74,85,96,14,25,36,95,84,76,51,42,63,58,74};

    /* Scenario 1: each stage loads from and stores to plain uint16_t buffers. */
    uint16_t stage1_out[32];
    uint16_t scenario1_out[32];

    scenario1a(stage1_out, input);
    scenario1b(scenario1_out, stage1_out);

    /* Scenario 2: load once here, pass Neon aggregates between the stages,
     * and store once at the end. */
    uint16_t scenario2_out[32];
    uint16x8x4_t loaded, stage2a_out, stage2b_out;

    loaded = vld1q_u16_x4(input);

    scenario2a(&stage2a_out, &loaded);
    scenario2b(&stage2b_out, &stage2a_out);

    vst1q_u16_x4(scenario2_out, stage2b_out);

    return 0;
}

Disassembly:

test:     file format elf64-littleaarch64


Disassembly of section .init:

0000000000000658 <_init>:
 658:   a9bf7bfd    stp x29, x30, [sp, #-16]!
 65c:   910003fd    mov x29, sp
 660:   94000065    bl  7f4 <call_weak_fn>
 664:   a8c17bfd    ldp x29, x30, [sp], #16
 668:   d65f03c0    ret

Disassembly of section .plt:

0000000000000670 <.plt>:
 670:   a9bf7bf0    stp x16, x30, [sp, #-16]!
 674:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 678:   f947c611    ldr x17, [x16, #3976]
 67c:   913e2210    add x16, x16, #0xf88
 680:   d61f0220    br  x17
 684:   d503201f    nop
 688:   d503201f    nop
 68c:   d503201f    nop

0000000000000690 <__cxa_finalize@plt>:
 690:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 694:   f947ca11    ldr x17, [x16, #3984]
 698:   913e4210    add x16, x16, #0xf90
 69c:   d61f0220    br  x17

00000000000006a0 <__libc_start_main@plt>:
 6a0:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 6a4:   f947ce11    ldr x17, [x16, #3992]
 6a8:   913e6210    add x16, x16, #0xf98
 6ac:   d61f0220    br  x17

00000000000006b0 <__stack_chk_fail@plt>:
 6b0:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 6b4:   f947d211    ldr x17, [x16, #4000]
 6b8:   913e8210    add x16, x16, #0xfa0
 6bc:   d61f0220    br  x17

00000000000006c0 <__gmon_start__@plt>:
 6c0:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 6c4:   f947d611    ldr x17, [x16, #4008]
 6c8:   913ea210    add x16, x16, #0xfa8
 6cc:   d61f0220    br  x17

00000000000006d0 <abort@plt>:
 6d0:   90000090    adrp    x16, 10000 <__FRAME_END__+0xf3d8>
 6d4:   f947da11    ldr x17, [x16, #4016]
 6d8:   913ec210    add x16, x16, #0xfb0
 6dc:   d61f0220    br  x17

Disassembly of section .text:

00000000000006e0 <main>:
 6e0:   90000085    adrp    x5, 10000 <__FRAME_END__+0xf3d8>
 6e4:   a9a67bfd    stp x29, x30, [sp, #-416]!
 6e8:   910003fd    mov x29, sp
 6ec:   90000002    adrp    x2, 0 <_init-0x658>
 6f0:   91292042    add x2, x2, #0xa48
 6f4:   910263e3    add x3, sp, #0x98
 6f8:   910363e0    add x0, sp, #0xd8
 6fc:   6f008434    mvni    v20.8h, #0x1
 700:   f947f0a5    ldr x5, [x5, #4064]
 704:   aa0303e1    mov x1, x3
 708:   910143e4    add x4, sp, #0x50
 70c:   a940344c    ldp x12, x13, [x2]
 710:   a9412c4a    ldp x10, x11, [x2, #16]
 714:   f94000a6    ldr x6, [x5]
 718:   f900cfe6    str x6, [sp, #408]
 71c:   d2800006    mov x6, #0x0                    // #0
 720:   a9422448    ldp x8, x9, [x2, #32]
 724:   a9431c46    ldp x6, x7, [x2, #48]
 728:   910463e2    add x2, sp, #0x118
 72c:   a909b7ec    stp x12, x13, [sp, #152]
 730:   a90aafea    stp x10, x11, [sp, #168]
 734:   a90ba7e8    stp x8, x9, [sp, #184]
 738:   a90c9fe6    stp x6, x7, [sp, #200]
 73c:   94000069    bl  8e0 <scenario1a>
 740:   4c402400    ld1 {v0.8h-v3.8h}, [x0]
 744:   910043e1    add x1, sp, #0x10
 748:   aa0403e0    mov x0, x4
 74c:   4c402470    ld1 {v16.8h-v19.8h}, [x3]
 750:   4e619e85    mul v5.8h, v20.8h, v1.8h
 754:   4e608424    add v4.8h, v1.8h, v0.8h
 758:   4e628466    add v6.8h, v3.8h, v2.8h
 75c:   4e639e87    mul v7.8h, v20.8h, v3.8h
 760:   4c002030    st1 {v16.16b-v19.16b}, [x1]
 764:   4c002444    st1 {v4.8h-v7.8h}, [x2]
 768:   94000072    bl  930 <scenario2a>
 76c:   ad409885    ldp q5, q6, [x4, #16]
 770:   90000081    adrp    x1, 10000 <__FRAME_END__+0xf3d8>
 774:   910563e2    add x2, sp, #0x158
 778:   3dc00c84    ldr q4, [x4, #48]
 77c:   3dc017e7    ldr q7, [sp, #80]
 780:   f947f021    ldr x1, [x1, #4064]
 784:   4e749c83    mul v3.8h, v4.8h, v20.8h
 788:   4e668482    add v2.8h, v4.8h, v6.8h
 78c:   4e749ca1    mul v1.8h, v5.8h, v20.8h
 790:   4e6784a0    add v0.8h, v5.8h, v7.8h
 794:   4c002440    st1 {v0.8h-v3.8h}, [x2]
 798:   f940cfe0    ldr x0, [sp, #408]
 79c:   f9400022    ldr x2, [x1]
 7a0:   eb020000    subs    x0, x0, x2
 7a4:   d2800002    mov x2, #0x0                    // #0
 7a8:   54000081    b.ne    7b8 <main+0xd8>  // b.any
 7ac:   52800000    mov w0, #0x0                    // #0
 7b0:   a8da7bfd    ldp x29, x30, [sp], #416
 7b4:   d65f03c0    ret
 7b8:   97ffffbe    bl  6b0 <__stack_chk_fail@plt>

00000000000007bc <_start>:
 7bc:   d280001d    mov x29, #0x0                       // #0
 7c0:   d280001e    mov x30, #0x0                       // #0
 7c4:   aa0003e5    mov x5, x0
 7c8:   f94003e1    ldr x1, [sp]
 7cc:   910023e2    add x2, sp, #0x8
 7d0:   910003e6    mov x6, sp
 7d4:   90000080    adrp    x0, 10000 <__FRAME_END__+0xf3d8>
 7d8:   f947f800    ldr x0, [x0, #4080]
 7dc:   90000083    adrp    x3, 10000 <__FRAME_END__+0xf3d8>
 7e0:   f947f463    ldr x3, [x3, #4072]
 7e4:   90000084    adrp    x4, 10000 <__FRAME_END__+0xf3d8>
 7e8:   f947e084    ldr x4, [x4, #4032]
 7ec:   97ffffad    bl  6a0 <__libc_start_main@plt>
 7f0:   97ffffb8    bl  6d0 <abort@plt>

00000000000007f4 <call_weak_fn>:
 7f4:   90000080    adrp    x0, 10000 <__FRAME_END__+0xf3d8>
 7f8:   f947ec00    ldr x0, [x0, #4056]
 7fc:   b4000040    cbz x0, 804 <call_weak_fn+0x10>
 800:   17ffffb0    b   6c0 <__gmon_start__@plt>
 804:   d65f03c0    ret
 808:   d503201f    nop
 80c:   d503201f    nop

0000000000000810 <deregister_tm_clones>:
 810:   b0000080    adrp    x0, 11000 <__data_start>
 814:   91004000    add x0, x0, #0x10
 818:   b0000081    adrp    x1, 11000 <__data_start>
 81c:   91004021    add x1, x1, #0x10
 820:   eb00003f    cmp x1, x0
 824:   540000c0    b.eq    83c <deregister_tm_clones+0x2c>  // b.none
 828:   90000081    adrp    x1, 10000 <__FRAME_END__+0xf3d8>
 82c:   f947e421    ldr x1, [x1, #4040]
 830:   b4000061    cbz x1, 83c <deregister_tm_clones+0x2c>
 834:   aa0103f0    mov x16, x1
 838:   d61f0200    br  x16
 83c:   d65f03c0    ret

0000000000000840 <register_tm_clones>:
 840:   b0000080    adrp    x0, 11000 <__data_start>
 844:   91004000    add x0, x0, #0x10
 848:   b0000081    adrp    x1, 11000 <__data_start>
 84c:   91004021    add x1, x1, #0x10
 850:   cb000021    sub x1, x1, x0
 854:   d37ffc22    lsr x2, x1, #63
 858:   8b810c41    add x1, x2, x1, asr #3
 85c:   9341fc21    asr x1, x1, #1
 860:   b40000c1    cbz x1, 878 <register_tm_clones+0x38>
 864:   90000082    adrp    x2, 10000 <__FRAME_END__+0xf3d8>
 868:   f947fc42    ldr x2, [x2, #4088]
 86c:   b4000062    cbz x2, 878 <register_tm_clones+0x38>
 870:   aa0203f0    mov x16, x2
 874:   d61f0200    br  x16
 878:   d65f03c0    ret
 87c:   d503201f    nop

0000000000000880 <__do_global_dtors_aux>:
 880:   a9be7bfd    stp x29, x30, [sp, #-32]!
 884:   910003fd    mov x29, sp
 888:   f9000bf3    str x19, [sp, #16]
 88c:   b0000093    adrp    x19, 11000 <__data_start>
 890:   39404260    ldrb    w0, [x19, #16]
 894:   35000140    cbnz    w0, 8bc <__do_global_dtors_aux+0x3c>
 898:   90000080    adrp    x0, 10000 <__FRAME_END__+0xf3d8>
 89c:   f947e800    ldr x0, [x0, #4048]
 8a0:   b4000080    cbz x0, 8b0 <__do_global_dtors_aux+0x30>
 8a4:   b0000080    adrp    x0, 11000 <__data_start>
 8a8:   f9400400    ldr x0, [x0, #8]
 8ac:   97ffff79    bl  690 <__cxa_finalize@plt>
 8b0:   97ffffd8    bl  810 <deregister_tm_clones>
 8b4:   52800020    mov w0, #0x1                    // #1
 8b8:   39004260    strb    w0, [x19, #16]
 8bc:   f9400bf3    ldr x19, [sp, #16]
 8c0:   a8c27bfd    ldp x29, x30, [sp], #32
 8c4:   d65f03c0    ret
 8c8:   d503201f    nop
 8cc:   d503201f    nop

00000000000008d0 <frame_dummy>:
 8d0:   17ffffdc    b   840 <register_tm_clones>
 8d4:   d503201f    nop
 8d8:   d503201f    nop
 8dc:   d503201f    nop

00000000000008e0 <scenario1a>:
 8e0:   4c402420    ld1 {v0.8h-v3.8h}, [x1]
 8e4:   6e60b833    neg v19.8h, v1.8h
 8e8:   6e60b852    neg v18.8h, v2.8h
 8ec:   6e60b871    neg v17.8h, v3.8h
 8f0:   6e60b810    neg v16.8h, v0.8h
 8f4:   6e132004    ext v4.16b, v0.16b, v19.16b, #4
 8f8:   6e122025    ext v5.16b, v1.16b, v18.16b, #4
 8fc:   6e112046    ext v6.16b, v2.16b, v17.16b, #4
 900:   6e102067    ext v7.16b, v3.16b, v16.16b, #4
 904:   4c002404    st1 {v4.8h-v7.8h}, [x0]
 908:   d65f03c0    ret
 90c:   d503201f    nop

0000000000000910 <scenario1b>:
 910:   4c402420    ld1 {v0.8h-v3.8h}, [x1]
 914:   6f008430    mvni    v16.8h, #0x1
 918:   4e619e05    mul v5.8h, v16.8h, v1.8h
 91c:   4e608424    add v4.8h, v1.8h, v0.8h
 920:   4e628466    add v6.8h, v3.8h, v2.8h
 924:   4e639e07    mul v7.8h, v16.8h, v3.8h
 928:   4c002404    st1 {v4.8h-v7.8h}, [x0]
 92c:   d65f03c0    ret

0000000000000930 <scenario2a>:
 930:   ad400025    ldp q5, q0, [x1]
 934:   ad408423    ldp q3, q1, [x1, #16]
 938:   3dc00c24    ldr q4, [x1, #48]
 93c:   6e60b800    neg v0.8h, v0.8h
 940:   4ea11c22    mov v2.16b, v1.16b
 944:   6e60b821    neg v1.8h, v1.8h
 948:   6e0020a5    ext v5.16b, v5.16b, v0.16b, #4
 94c:   4ea41c80    mov v0.16b, v4.16b
 950:   6e60b884    neg v4.8h, v4.8h
 954:   6e012063    ext v3.16b, v3.16b, v1.16b, #4
 958:   3d800005    str q5, [x0]
 95c:   3dc00021    ldr q1, [x1]
 960:   6e042042    ext v2.16b, v2.16b, v4.16b, #4
 964:   ad008803    stp q3, q2, [x0, #16]
 968:   6e60b821    neg v1.8h, v1.8h
 96c:   6e012000    ext v0.16b, v0.16b, v1.16b, #4
 970:   3d800c00    str q0, [x0, #48]
 974:   d65f03c0    ret
 978:   d503201f    nop
 97c:   d503201f    nop

0000000000000980 <scenario2b>:
 980:   ad401022    ldp q2, q4, [x1]
 984:   6f008420    mvni    v0.8h, #0x1
 988:   ad410c21    ldp q1, q3, [x1, #32]
 98c:   4e609c85    mul v5.8h, v4.8h, v0.8h
 990:   4e648442    add v2.8h, v2.8h, v4.8h
 994:   4e609c60    mul v0.8h, v3.8h, v0.8h
 998:   4e638421    add v1.8h, v1.8h, v3.8h
 99c:   ad001402    stp q2, q5, [x0]
 9a0:   ad010001    stp q1, q0, [x0, #32]
 9a4:   d65f03c0    ret

00000000000009a8 <__libc_csu_init>:
 9a8:   a9bc7bfd    stp x29, x30, [sp, #-64]!
 9ac:   910003fd    mov x29, sp
 9b0:   a90153f3    stp x19, x20, [sp, #16]
 9b4:   90000094    adrp    x20, 10000 <__FRAME_END__+0xf3d8>
 9b8:   9135c294    add x20, x20, #0xd70
 9bc:   a9025bf5    stp x21, x22, [sp, #32]
 9c0:   90000095    adrp    x21, 10000 <__FRAME_END__+0xf3d8>
 9c4:   9135a2b5    add x21, x21, #0xd68
 9c8:   cb150294    sub x20, x20, x21
 9cc:   2a0003f6    mov w22, w0
 9d0:   a90363f7    stp x23, x24, [sp, #48]
 9d4:   aa0103f7    mov x23, x1
 9d8:   aa0203f8    mov x24, x2
 9dc:   97ffff1f    bl  658 <_init>
 9e0:   eb940fff    cmp xzr, x20, asr #3
 9e4:   54000160    b.eq    a10 <__libc_csu_init+0x68>  // b.none
 9e8:   9343fe94    asr x20, x20, #3
 9ec:   d2800013    mov x19, #0x0                       // #0
 9f0:   f8737aa3    ldr x3, [x21, x19, lsl #3]
 9f4:   aa1803e2    mov x2, x24
 9f8:   91000673    add x19, x19, #0x1
 9fc:   aa1703e1    mov x1, x23
 a00:   2a1603e0    mov w0, w22
 a04:   d63f0060    blr x3
 a08:   eb13029f    cmp x20, x19
 a0c:   54ffff21    b.ne    9f0 <__libc_csu_init+0x48>  // b.any
 a10:   a94153f3    ldp x19, x20, [sp, #16]
 a14:   a9425bf5    ldp x21, x22, [sp, #32]
 a18:   a94363f7    ldp x23, x24, [sp, #48]
 a1c:   a8c47bfd    ldp x29, x30, [sp], #64
 a20:   d65f03c0    ret
 a24:   d503201f    nop

0000000000000a28 <__libc_csu_fini>:
 a28:   d65f03c0    ret

Disassembly of section .fini:

0000000000000a2c <_fini>:
 a2c:   a9bf7bfd    stp x29, x30, [sp, #-16]!
 a30:   910003fd    mov x29, sp
 a34:   a8c17bfd    ldp x29, x30, [sp], #16
 a38:   d65f03c0    ret

Questions

  1. Normally people load the data from a pointer (using vld1q_u16_x4), operate on the Neon data types, and store back to another pointer (using vst1q_u16_x4); they don't use an approach like the one I used in Scenario 2 (passing the Neon data types as inputs/outputs). Is there a general reason for this?

  2. I compared the disassembly of scenario1a (starting at address 8e0) with that of scenario2a (starting at address 930). It seems scenario2a has more data movement. Will this happen in all cases? So is it faster to do what I described in question 1? If so, why doesn't this happen in scenario1b vs. scenario2b (addresses 910 and 980, respectively)?

  3. In the main function, there are some add/mul instructions after the calls to both scenario1a and scenario2a (at addresses 750, 754, 758, 75c and 784, 788, 78c, 790), but my main function contains no multiplications or additions. Why is this happening? (I'm just curious.)

Thank you for all your help!


Solution

    1. There is absolutely no reason to use pointers to Neon data types as parameters. Memory doesn't care about data types. Compilers are very conservative and bureaucratic; they simply have to be. It's like filing an application with the authorities: one wrong check mark and your application will land in the wrong hands, causing tremendous unnecessary trouble.
      Short: Keep it simple. Don't try to impress compilers or reviewers in any way.

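    2. I told you last time to be explicit about memory loads and stores. You are computing directly from/to memory in scenario2. Never do this. Because result and temp might point to the same object, the compiler has to preserve the order of your stores and loads: in scenario2a, temp->val[0] is read again after result->val[0] has been written, hence the extra reload at 95c, whereas scenario2b never reads an element that an earlier statement may have overwritten, so it compiles cleanly. Stick to load->compute->store. Local variables are your best friends. (The __restrict qualifier might help.)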
      Again, do not try to impress compilers or reviewers. Your scenario2 is just asking for trouble. A sheer disaster. The reviewer will raise a red flag immediately and keep an eye on you and all your code, if you are lucky and didn't get fired on the spot.

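    3. You shouldn't put callees in the same file as the caller. More often than not, the compiler will inline short non-static callees into the caller, which makes profiling harder. That is what happened here: the add/mul instructions in main are the inlined bodies of scenario1b and scenario2b.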