I recently came across this piece of code in an interrupt service routine (ISR):
/* Upper bound on the number of ADC channels this module handles. */
#define MAX_CHANNELS 4
/* Table of pointers to the ADC result registers, one entry per channel. */
static uint16_t volatile* ADCVALS[MAX_CHANNELS] = {
&ADC1BUF0, &ADC1BUF1, &ADC1BUF2, &ADC1BUF3
};
/* Highest channel index to copy (0 means only channel 0 is copied). */
static uint8_t CHANNELS = 0;
/* Per-channel write cursors; each one is advanced by the ISR after a store. */
static uint16_t volatile* volatile BUFFER_IDX[MAX_CHANNELS];
/*
 * ADC interrupt service routine.
 *
 * Copies the current value of ADC result register k to the location that
 * BUFFER_IDX[k] points at, then post-increments that cursor. Channel 0 is
 * always copied; channels 1..3 are copied only while CHANNELS >= k.
 *
 * The copies are written out explicitly with constant indices rather than
 * as a loop over a run-time index.
 * NOTE(review): no bounds check on the cursors is visible here -- the code
 * that sets BUFFER_IDX is presumably responsible for that; confirm.
 */
void __attribute__((interrupt, no_auto_psv)) _AD1Interrupt(void) {
/* Channel 0 is copied unconditionally. */
*(BUFFER_IDX[0]++) = *ADCVALS[0];
if (CHANNELS >= 1) {
*(BUFFER_IDX[1]++) = *ADCVALS[1];
/* Nested test: channel 2 is only considered once channel 1 was enabled. */
if (CHANNELS >= 2) {
*(BUFFER_IDX[2]++) = *ADCVALS[2];
if (CHANNELS >= 3) {
*(BUFFER_IDX[3]++) = *ADCVALS[3];
}
}
}
}
It copies between 1 and 4 register values into memory, depending on the value of CHANNELS, which is a value between 0 and 3 that is set elsewhere in the program via a setter function.
I found the nested ifs extremely ugly and changed it to this:
int i;
for (i = 0; i <= CHANNELS; i++) {
*(BUFFER_IDX[i]++) = *ADCVALS[i];
}
which promptly broke the ISR. This is an embedded system, PIC24 architecture, 64 MHz clock. The ISR is severely time constrained and must finish within 1 µs. The for loop is apparently too slow, while the nested if is fast enough.
My question, then, is two-fold:
(Compiled with `-O2`.)
for (int i = 0; i <= CHANNELS; i++) {
*(BUFFER_IDX[i]++) = *ADCVALS[i];
}
compiles (without optimization) to:
mov DWORD PTR [rbp-4], 0
jmp .L2
.L3:
mov eax, DWORD PTR [rbp-4]
cdqe
mov rax, QWORD PTR [rbp-80+rax*8]
mov edx, DWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
cdqe
mov rax, QWORD PTR [rbp-48+rax*8]
lea rsi, [rax+4]
mov ecx, DWORD PTR [rbp-4]
movsx rcx, ecx
mov QWORD PTR [rbp-48+rcx*8], rsi
mov DWORD PTR [rax], edx
add DWORD PTR [rbp-4], 1
.L2:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-8]
jle .L3
And the nested-if version:
*(BUFFER_IDX[0]++) = *ADCVALS[0];
if (CHANNELS >= 1) {
*(BUFFER_IDX[1]++) = *ADCVALS[1];
if (CHANNELS >= 2) {
*(BUFFER_IDX[2]++) = *ADCVALS[2];
if (CHANNELS >= 3) {
*(BUFFER_IDX[3]++) = *ADCVALS[3];
}
}
}
compiles to:
mov rax, QWORD PTR [rbp-80]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-48]
lea rcx, [rax+4]
mov QWORD PTR [rbp-48], rcx
mov DWORD PTR [rax], edx
cmp DWORD PTR [rbp-4], 0
jle .L2
mov rax, QWORD PTR [rbp-72]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-40]
lea rcx, [rax+4]
mov QWORD PTR [rbp-40], rcx
mov DWORD PTR [rax], edx
cmp DWORD PTR [rbp-4], 1
jle .L2
mov rax, QWORD PTR [rbp-64]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-32]
lea rcx, [rax+4]
mov QWORD PTR [rbp-32], rcx
mov DWORD PTR [rax], edx
cmp DWORD PTR [rbp-4], 2
jle .L2
mov rax, QWORD PTR [rbp-56]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-24]
lea rcx, [rax+4]
mov QWORD PTR [rbp-24], rcx
mov DWORD PTR [rax], edx
As you can see, the nested ifs execute fewer jumps than the loop. However, current compilers can optimize the loop by unrolling it, and with the -O3 flag you will get something like this:
mov eax, DWORD PTR [rsp+12]
test eax, eax
js .L2
mov rax, QWORD PTR [rsp+48]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rsp+16]
mov DWORD PTR [rax], edx
mov eax, DWORD PTR [rsp+12]
test eax, eax
jle .L2
mov rax, QWORD PTR [rsp+56]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rsp+24]
mov DWORD PTR [rax], edx
mov eax, DWORD PTR [rsp+12]
sub eax, 1
jle .L2
mov rax, QWORD PTR [rsp+64]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rsp+32]
mov DWORD PTR [rax], edx
mov eax, DWORD PTR [rsp+12]
cmp eax, 2
jle .L2
mov rax, QWORD PTR [rsp+72]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rsp+40]
mov DWORD PTR [rax], edx
mov eax, DWORD PTR [rsp+12]
.L2:
This has roughly the same performance as the nested ifs.