Search code examples
armneon

Find minimum and maximum value of an array using ARM NEON instructions


I have the following code which I would like to optimise using ARM NEON instructions. How can I implement it? Thanks for the answers

unsigned char someVector[] = {1, 2, 4, 1, 2, 0, 8, 100};
unsigned char maxVal = 0, minVal = 255;
/*
 * Scan the array once, tracking the smallest and largest byte.
 * The two comparisons are deliberately independent (no `else`): an element
 * that lowers the minimum may also be the largest value seen so far, e.g.
 * the first element of the array or the only element of a 1-byte array.
 */
for (unsigned int i = 0; i < sizeof(someVector); i++)
{
    if (someVector[i] < minVal)
    {
        minVal = someVector[i];
    }
    if (someVector[i] > maxVal)
    {
        maxVal = someVector[i];
    }
}

Solution

  • Below is a highly optimized example of how to find the min and max of a large array. The function simply returns, without producing a valid result, if size is smaller than 128:

    /*
     * minmax.S
     *
     *  Created on: 2014. 10. 29.
     *      Author: Jake Lee
     *
     * Finds the smallest and the largest byte of a buffer with ARM NEON.
     * Target: 32-bit ARM state, GNU as syntax, preprocessed (.S) source.
     *
     * C prototype:
     *   unsigned int minmax(unsigned char *pSrc, unsigned int size);
     *
     * In:    r0 = pSrc  (no alignment requirement is placed on the loads)
     *        r1 = size in bytes, must be >= 128
     * Out:   r0 = min | (max << 16), min and max each being an 8-bit value
     *        If size < 128 the function returns immediately and r0 is left
     *        unchanged, so the return value is NOT a valid result.
     * Clobb: r1, q0-q3, q8-q15, flags
     *        (q4-q7, i.e. the callee-saved d8-d15, are never touched)
     */

        .text
        .arm
        .fpu    neon
        .global minmax
    // Mark the symbol as a function so the linker can apply ARM/Thumb
    // interworking when the caller is built as Thumb code.
        .type   minmax, %function

        pSrc    .req    r0
        size    .req    r1

    // Two independent min/max accumulator pairs shorten the dependency chains.
        qmin1   .req    q0
            dmina   .req    d0
            dminb   .req    d1

        qmax1   .req    q1
            dmaxa   .req    d2
            dmaxb   .req    d3

        qmin2   .req    q2
        qmax2   .req    q3

        .align 5
        .func   minmax
    minmax:
    // size = size - 128; a negative result means fewer than 128 bytes: bail out.
        subs    size, size, #128
        bxmi    lr
    // Neutral start values: 0xff for the minimum, 0 for the maximum.
        vmov.i8     qmin1, #0xff
        vmov.i8     qmax1, #0
        vmov.i8     qmin2, #0xff
        vmov.i8     qmax2, #0

        .align 5
    1:
    // Each pass consumes 128 bytes (8 q registers) and advances pSrc.
        vld1.8      {q8, q9}, [pSrc]!
        vld1.8      {q10, q11}, [pSrc]!
        vld1.8      {q12, q13}, [pSrc]!
        vld1.8      {q14, q15}, [pSrc]!
    // size = bytes left after this pass, minus 128; drives the bpl below
    // (the NEON and pld instructions in between do not alter the flags).
        subs    size, size, #128
        pld     [pSrc, #64*3]
        pld     [pSrc, #64*4]
        vmin.u8     qmin1, q8
        vmax.u8     qmax1, q8
        vmin.u8     qmin2, q9
        vmax.u8     qmax2, q9
        vmin.u8     qmin1, q10
        vmax.u8     qmax1, q10
        vmin.u8     qmin2, q11
        vmax.u8     qmax2, q11
        vmin.u8     qmin1, q12
        vmax.u8     qmax1, q12
        vmin.u8     qmin2, q13
        vmax.u8     qmax2, q13
        vmin.u8     qmin1, q14
        vmax.u8     qmax1, q14
        vmin.u8     qmin2, q15
        vmax.u8     qmax2, q15
    // Keep looping while at least 128 more bytes remain.
        bpl     1b

    // Deal with residuals (size % 128).
    // Here size = (bytes left - 128), in [-128, -1]. If any bytes are left
    // (size > -128), step pSrc back by the shortfall so that one more
    // 128-byte pass ends exactly at the end of the buffer. Bytes that get
    // read twice cannot change a min/max result. After that extra pass size
    // is below -128, so this branch is not taken a second time.
        cmp     size, #-128
        addgt   pSrc, pSrc, size
        bgt     1b

    // Shrink to sixteen: merge the two accumulator pairs.
        vmin.u8     qmin1, qmin2
        vmax.u8     qmax1, qmax2
    // Shrink to eight: pairwise fold of the low and high halves.
        vpmin.u8    dmina, dmina, dminb
        vpmax.u8    dmaxa, dmaxa, dmaxb
    // Shrink to four: from here on only dmina/dmaxa carry live data, so they
    // are folded with themselves; the result ends up in the low lanes.
        vpmin.u8    dmina, dmina, dmina
        vpmax.u8    dmaxa, dmaxa, dmaxa
    // Shrink to two.
        vpmin.u8    dmina, dmina, dmina
        vpmax.u8    dmaxa, dmaxa, dmaxa
    // Shrink to one: lane 0 now holds the overall result.
        vpmin.u8    dmina, dmina, dmina
        vpmax.u8    dmaxa, dmaxa, dmaxa

    // vmov.u8 zero-extends the byte lane, so no extra masking is needed.
        vmov.u8     r0, dmina[0]
        vmov.u8     r1, dmaxa[0]

    // Pack the result: min in bits 0-7, max in bits 16-23.
        orr     r0, r0, r1, lsl #16
        bx      lr
        .size   minmax, .-minmax
        .endfunc
        .end
    

    The return value is an unsigned int. The lower 16 bits contain the min (in bits 0–7) and the upper 16 bits contain the max (in bits 16–23):

    /* One call yields both extremes packed into a single word. */
    result = minmax(pSrc, size);
    /* Upper half-word carries the maximum ... */
    max = result >> 16;
    /* ... and the low byte carries the minimum. */
    min = result & 0xff;