Trying to xor a huge uint32
array I decided to use NEON coprocessor.
I implemented two c
versions:
version 1:
uint32_t xor_array_ver_1(uint32_t *array, int size)
{
uint32x2_t acc = vmov_n_u32(0);
uint32_t acc1 = 0;
for (; size != 0; size -= 2) {
uint32x2_t vec;
vec = vld1_u32(array);
array += 2;
acc = veor_u32(acc, vec);
}
acc1 = vget_lane_u32(acc,0) ^ vget_lane_u32(acc,1);
return acc1;
}
version 2:
uint32_t xor_array_ver_2(uint32_t *array, int size)
{
uint32x4_t acc = vmovq_n_u32(0);
uint32_t acc1 = 0;
for (; size != 0; size -= 4) {
uint32x4_t vec;
vec = vld1q_u32(array);
array += 4;
acc = veorq_u32(acc, vec);
}
acc1 ^= vgetq_lane_u32(acc,0);
acc1 ^= vgetq_lane_u32(acc,1);
acc1 ^= vgetq_lane_u32(acc,2);
acc1 ^= vgetq_lane_u32(acc,3);
return acc1;
}
Comparing the above 2 versions to the traditional xor implementation:
for (i=0; i<arr_size; i++)
val ^= my_array[i];
I observed 2 issues:
my_array
is declared as
uint32_t my_array[BIG_LENGTH];
Most likely this will be memory bandwidth limited - once you saturate the available DRAM bandwidth, which should be quite easy to do with only one ALU operation per load, you won't get any further benefit from optimisation.
Try to combine your XOR with another operation on the same data if possible - that way you amortise the cost of the cache misses.