Search code examples
carmneon

Are there are ARM NEON instructions for signed right-shift that round toward zero?


I am trying to implement an algorithm using ARM intrinsics.

On step of the algorithm requires a right shift of a signed integer, but it needs to round negative values upwards (i.e. less negative). For example, if shifting right by one, then -8 should give -4, but -1 should give 0.

In other words, I want something that rounds negative values towards zero instead of rounding down:

int rightshift(int number, unsigned int shift)
{
    return ((number < 0) ? -1 : 1) * (abs(number) >> shift);
}

I am not able to find a suitable function to do this in SIMD fashion. Is there a way to do this in one function call, or some trick that could be used?


Solution

  • I don't think there's a single-instruction shift with round-toward-zero behaviour.

    However, you can do it fairly simply with a couple of shift and mask instructions. What we need to do is to add one to the result if we started with a negative number and there is a 'carry' out (i.e. any bit to the right of the result would have been 1).

    I can demonstrate this with the following pure C code:

    #include <stddef.h>
    #include <limits.h>
    
    int16_t rightshift(int number, unsigned int shift)
    {
        static const size_t bits = sizeof number * CHAR_BIT;
    
        number += ((1<<shift) - 1) & (number >> bits-1);
        return number >> shift;
    }
    
    #include <stdio.h>
    int main() {
        for (int i = -16; i <= 16;  ++i) {
            printf(" %3d: ", i);
            for (int j = 0;  j < 4;  ++j)
                printf("%4d", rightshift(i, j));
            puts("");
        }
    }
    

    This compiles to some nice branch-fee assembly, which looks amenable to inlining (especially when shift is a compile-time constant):

    rightshift:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        movs    r3, #1
        lsls    r3, r3, r1
        subs    r3, r3, #1
        and r3, r3, r0, asr #31
        add r0, r0, r3
        asrs    r0, r0, r1
        bx  lr
    

    To target Neon, I wrote another function, to exercise it with multiple data:

    void do_shift(int16_t *restrict dest, const int16_t *restrict src,
                  size_t count, unsigned int shift)
    {
        for (size_t j = 0;  j < count;  ++j) {
            dest[j] = rightshift(src[j], shift);
        }
    }
    

    And a test program for it:

    #include <stdio.h>
    int main() {
        static const int16_t src[] = {
            -32768, -32767, -32766, -32765, -32764,
            -16384, -16383, -16382, -16381, -16380,
            -8193,  -8192,  -8191,  -8190,  -8189,
            -16, -15, -14, -13, -12, -10, -9,
            -8, -7, -6, -5, -4, -3, -2, -1, 0,
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            1023, 1024, 32767,
        };
        static const size_t count = sizeof src / sizeof *src;
        int16_t dest[16][count];
    
        for (unsigned int i = 0; i < 16;  ++i) {
            do_shift(dest[i], src, count, i);
        }
    
        for (size_t i = 0;  i < count;  ++i) {
            printf("%7d: ", src[i]);
            for (int j = 0;  j < 16;  ++j)
                printf("%7d", dest[j][i]);
            puts("");
        }
    }
    

    I compiled this with gcc -O3 -march=armv7 -mfpu=neon. I confess I'm not familiar with the Neon instructions, but the results may be instructive:

    do_shift:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        cmp r2, #0
        beq .L21
        push    {r4, r5, r6, r7, r8, lr}
        ubfx    r4, r1, #1, #2
        negs    r4, r4
        movs    r5, #1
        and r4, r4, #7
        lsls    r5, r5, r3
        adds    r7, r4, #7
        subs    r6, r2, #1
        subs    r5, r5, #1
        cmp r6, r7
        sxth    r5, r5
        bcc .L8
        cmp r4, #0
        beq .L9
        ldrsh   r7, [r1]
        cmp r4, #1
        and r6, r5, r7, asr #31
        add r6, r6, r7
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0]    @ movhi
        beq .L9
        ldrsh   r7, [r1, #2]
        cmp r4, #2
        and r6, r5, r7, asr #31
        add r6, r6, r7
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, #2]    @ movhi
        beq .L9
        ldrsh   r7, [r1, #4]
        cmp r4, #3
        and r6, r5, r7, asr #31
        add r6, r6, r7
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, #4]    @ movhi
        beq .L9
        ldrsh   r7, [r1, #6]
        cmp r4, #4
        and r6, r5, r7, asr #31
        add r6, r6, r7
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, #6]    @ movhi
        beq .L9
        ldrsh   r7, [r1, #8]
        cmp r4, #5
        and r6, r5, r7, asr #31
        add r6, r6, r7
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, #8]    @ movhi
        beq .L9
        ldrsh   r7, [r1, #10]
        cmp r4, #7
        ite eq
        moveq   r8, r4
        movne   r8, #6
        and r6, r5, r7, asr #31
        add r6, r6, r7
        it  eq
        ldrsheq r7, [r1, #12]
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, #10]   @ movhi
        itttt   eq
        andeq   r6, r5, r7, asr #31
        addeq   r6, r6, r7
        sxtheq  r6, r6
        asreq   r6, r6, r3
        it  eq
        strheq  r6, [r0, #12]   @ movhi
    .L4:
        vdup.32 q10, r3
        sub lr, r2, r4
        lsls    r4, r4, #1
        movs    r7, #0
        vneg.s32    q10, q10
        adds    r6, r1, r4
        lsr ip, lr, #3
        add r4, r4, r0
        vdup.16 q12, r5
    .L6:
        adds    r7, r7, #1
        adds    r6, r6, #16
        vldr    d18, [r6, #-16]
        vldr    d19, [r6, #-8]
        cmp r7, ip
        vshr.s16    q8, q9, #15
        vand    q8, q8, q12
        vadd.i16    q8, q8, q9
        vmovl.s16 q9, d16
        vmovl.s16 q8, d17
        vshl.s32    q9, q9, q10
        vshl.s32    q8, q8, q10
        vmovn.i32   d22, q9
        vmovn.i32   d23, q8
        vst1.16 {q11}, [r4]
        add r4, r4, #16
        bcc .L6
        bic r6, lr, #7
        cmp lr, r6
        add r4, r8, r6
        beq .L1
    .L3:
        ldrsh   ip, [r1, r4, lsl #1]
        adds    r7, r4, #1
        cmp r2, r7
        and r6, r5, ip, asr #31
        add r6, r6, ip
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, r4, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, r7, lsl #1]
        add ip, r4, #2
        cmp r2, ip
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, r7, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, ip, lsl #1]
        adds    r7, r4, #3
        cmp r2, r7
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, ip, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, r7, lsl #1]
        add ip, r4, #4
        cmp r2, ip
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, r7, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, ip, lsl #1]
        adds    r7, r4, #5
        cmp r2, r7
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, ip, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, r7, lsl #1]
        add ip, r4, #6
        cmp r2, ip
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, r7, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, ip, lsl #1]
        adds    r7, r4, #7
        cmp r2, r7
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, ip, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, r7, lsl #1]
        add ip, r4, #8
        cmp r2, ip
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, r7, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, ip, lsl #1]
        add r7, r4, #9
        cmp r2, r7
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, ip, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, r7, lsl #1]
        add ip, r4, #10
        cmp r2, ip
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, r7, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, ip, lsl #1]
        add r7, r4, #11
        cmp r2, r7
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, ip, lsl #1]    @ movhi
        bls .L1
        ldrsh   lr, [r1, r7, lsl #1]
        add ip, r4, #12
        cmp r2, ip
        and r6, r5, lr, asr #31
        add r6, r6, lr
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, r7, lsl #1]    @ movhi
        bls .L1
        ldrsh   r7, [r1, ip, lsl #1]
        adds    r4, r4, #13
        cmp r2, r4
        and r6, r5, r7, asr #31
        add r6, r6, r7
        sxth    r6, r6
        asr r6, r6, r3
        strh    r6, [r0, ip, lsl #1]    @ movhi
        bls .L1
        ldrsh   r1, [r1, r4, lsl #1]
        and r2, r5, r1, asr #31
        add r2, r2, r1
        sxth    r2, r2
        asr r3, r2, r3
        strh    r3, [r0, r4, lsl #1]    @ movhi
    .L1:
        pop {r4, r5, r6, r7, r8, pc}
    .L9:
        mov r8, r4
        b   .L4
    .L21:
        bx  lr
    .L8:
        movs    r4, #0
        b   .L3
    

    There's a lot of loop unrolling which makes the code longer, but the pattern should be clear.