I have the following code which I would like to optimise using ARM NEON instructions. How can I implement it? Thanks for the answers
unsigned char someVector[] = {1, 2, 4, 1, 2, 0, 8, 100};
unsigned char maxVal = 0, minVal = 255;
for (int i = 0; i < sizeof(someVector); i++)
{
if (someVector[i] < minVal)
{
minVal = someVector[i];
}
else if (someVector[i] > maxVal)
{
maxVal = someVector[i];
}
}
Below is an highly optimized example how to find min and max in a large array. The function simply returns if size is smaller than 128 :
/*
* minmax.S
*
* Created on: 2014. 10. 29.
* Author: Jake Lee
*/
// unsigned int minmax(unsigned char *pSrc, unsigned int size);
.text
.arm
.global minmax
pSrc .req r0
size .req r1
qmin1 .req q0
dmina .req d0
dminb .req d1
qmax1 .req q1
dmaxa .req d2
dmaxb .req d3
qmin2 .req q2
qmax2 .req q3
.align 5
.func
minmax:
subs size, size, #128
bxmi lr
vmov.i8 qmin1, #0xff
vmov.i8 qmax1, #0
vmov.i8 qmin2, #0xff
vmov.i8 qmax2, #0
.align 5
1:
vld1.8 {q8, q9}, [pSrc]!
vld1.8 {q10, q11}, [pSrc]!
vld1.8 {q12, q13}, [pSrc]!
vld1.8 {q14, q15}, [pSrc]!
subs size, size, #128
pld [pSrc, #64*3]
pld [pSrc, #64*4]
vmin.u8 qmin1, q8
vmax.u8 qmax1, q8
vmin.u8 qmin2, q9
vmax.u8 qmax2, q9
vmin.u8 qmin1, q10
vmax.u8 qmax1, q10
vmin.u8 qmin2, q11
vmax.u8 qmax2, q11
vmin.u8 qmin1, q12
vmax.u8 qmax1, q12
vmin.u8 qmin2, q13
vmax.u8 qmax2, q13
vmin.u8 qmin1, q14
vmax.u8 qmax1, q14
vmin.u8 qmin2, q15
vmax.u8 qmax2, q15
bpl 1b
// deal width residuals (size % 128)
cmp size, #-128
addgt pSrc, pSrc, size
bgt 1b
// shrink to sixteen
vmin.u8 qmin1, qmin2
vmax.u8 qmax1, qmax2
// shrink to eight
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to four
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to two
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to one
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
vmov r0, dmina[0]
vmov r1, dmaxa[0]
and r0, r0, #0xff
and r1, r1, #0xff
orr r0, r0, r1, lsl #16
bx lr
.endfunc
.end
The return value is an unsigned int. The lower 16 bits contain min and higher ones max :
result = minmax(pSrc, size);
min = result & 0xff;
max = result >> 16;