Search code examples
c performance embedded interrupt pic

Speed of nested if vs for loop


I recently came across this piece of code in an interrupt service routine (ISR):

/* Number of entries in the ADCVALS and BUFFER_IDX tables. */
#define MAX_CHANNELS 4
/* Source table: one pointer per channel, aimed at ADC1BUF0..ADC1BUF3.
   The pointed-to data is volatile, so every read really happens. */
static uint16_t volatile* ADCVALS[MAX_CHANNELS] = {
    &ADC1BUF0, &ADC1BUF1, &ADC1BUF2, &ADC1BUF3
};
/* Highest channel index the ISR copies. Described as ranging 0-3 (i.e. 1-4
   channels) and as being changed elsewhere through a setter function. */
static uint8_t CHANNELS = 0;
/* Destination write cursors, one per channel. Both the pointers themselves
   and the data they point at are volatile. The ISR stores through a cursor
   and then post-increments it. No initializer is given here. */
static uint16_t volatile* volatile BUFFER_IDX[MAX_CHANNELS];

/*
 * Interrupt service routine. For k = 0 .. CHANNELS it reads *ADCVALS[k],
 * stores the value where BUFFER_IDX[k] currently points, and advances
 * BUFFER_IDX[k] by one element.
 *
 * Channel 0 is copied unconditionally. Each further channel sits inside the
 * test for the previous one, so the first failing comparison skips all the
 * remaining copies.
 *
 * The handler is described as severely time constrained (must finish
 * within 1 us).
 */
void __attribute__((interrupt, no_auto_psv)) _AD1Interrupt(void) {
    /* Channel 0: always copied. */
    *(BUFFER_IDX[0]++) = *ADCVALS[0];
    if (CHANNELS >= 1) {
        /* Channel 1. */
        *(BUFFER_IDX[1]++) = *ADCVALS[1];
        if (CHANNELS >= 2) {
            /* Channel 2. */
            *(BUFFER_IDX[2]++) = *ADCVALS[2];
             if (CHANNELS >= 3) {
                /* Channel 3: last entry of both tables. */
                *(BUFFER_IDX[3]++) = *ADCVALS[3];
            }
        }
    }
}

It copies one to four register values into memory, depending on the value of CHANNELS, which ranges from 0 to 3 and is set elsewhere in the program via a setter function.

I found the nested if's extremely ugly and changed it to this:

/* Loop form of the nested-if copy. The bound is inclusive (<=) because
   CHANNELS is the highest channel index, not a count: 0-3 gives 1-4 passes.
   Each pass indexes both tables with the run-time value of i. */
int i;
for (i = 0; i <= CHANNELS; i++) {
    /* Store the sample through the cursor, then advance the cursor. */
    *(BUFFER_IDX[i]++) = *ADCVALS[i];
}

which promptly broke the ISR. This is an embedded system, PIC24 architecture, 64 MHz clock. The ISR is severely time constrained and must finish within 1 µs. The for loop is apparently too slow, while the nested if is fast enough.

My question, then, is two-fold:

  1. Is there a less ugly way to do what the nested if clauses do, without slowing down the ISR?
  2. Why is the for loop so much slower? I would have expected the compiler (xc16) to be smart enough to generate similar asm for both (at -O2).

Solution

  • for (int i = 0; i <= CHANNELS; i++) {
        /* One copy per pass: the table index i is only known at run time. */
        *(BUFFER_IDX[i]++) = *ADCVALS[i];
    }
    

    compiles to the following (x86-64, unoptimized) assembly:

            ; x86-64 listing of the for loop. Slot roles are read off by matching
            ; the instructions against the C source:
            ;   [rbp-4]       = i
            ;   [rbp-8]       = loop bound (CHANNELS)
            ;   [rbp-80+i*8]  = ADCVALS[i]
            ;   [rbp-48+i*8]  = BUFFER_IDX[i]
            ; Samples move as DWORDs and the cursor advances by 4, so elements are
            ; 4 bytes wide in this build.
            mov     DWORD PTR [rbp-4], 0
            ; Enter at the condition so the bound is tested before the first pass.
            jmp     .L2
    .L3:
            ; edx = *ADCVALS[i]
            mov     eax, DWORD PTR [rbp-4]
            cdqe
            mov     rax, QWORD PTR [rbp-80+rax*8]
            mov     edx, DWORD PTR [rax]
            ; rax = BUFFER_IDX[i] (old cursor), rsi = old cursor + 4
            mov     eax, DWORD PTR [rbp-4]
            cdqe
            mov     rax, QWORD PTR [rbp-48+rax*8]
            lea     rsi, [rax+4]
            ; i is read from the stack a third time to write the advanced cursor back
            mov     ecx, DWORD PTR [rbp-4]
            movsx   rcx, ecx
            mov     QWORD PTR [rbp-48+rcx*8], rsi
            ; store the sample through the old cursor, then i++
            mov     DWORD PTR [rax], edx
            add     DWORD PTR [rbp-4], 1
    .L2:
            ; repeat while i <= bound (signed compare)
            mov     eax, DWORD PTR [rbp-4]
            cmp     eax, DWORD PTR [rbp-8]
            jle     .L3
    

    And

    /* Nested-if form: every table index is a compile-time constant, and the
       first failing CHANNELS test skips all remaining copies. */
    *(BUFFER_IDX[0]++) = *ADCVALS[0];
    if (CHANNELS >= 1) {
        *(BUFFER_IDX[1]++) = *ADCVALS[1];
        if (CHANNELS >= 2) {
            *(BUFFER_IDX[2]++) = *ADCVALS[2];
            if (CHANNELS >= 3) {
                *(BUFFER_IDX[3]++) = *ADCVALS[3];
            }
        }
    }
    

    compiles to:

    ; x86-64 listing of the nested ifs. Slot roles are read off by matching
    ; the instructions against the C source:
    ;   [rbp-4]                  = CHANNELS
    ;   [rbp-80/-72/-64/-56]     = ADCVALS[0..3]
    ;   [rbp-48/-40/-32/-24]     = BUFFER_IDX[0..3]
    ; Every table slot is addressed with a fixed offset; no index register
    ; is needed. The exit label .L2 lies outside this excerpt.
    ;
    ; Channel 0: edx = *ADCVALS[0]; advance BUFFER_IDX[0] by 4; store edx
    ; through the old cursor.
    mov     rax, QWORD PTR [rbp-80]
    mov     edx, DWORD PTR [rax]
    mov     rax, QWORD PTR [rbp-48]
    lea     rcx, [rax+4]
    mov     QWORD PTR [rbp-48], rcx
    mov     DWORD PTR [rax], edx
    ; Skip the remaining channels when CHANNELS <= 0 (signed), i.e. not >= 1.
    cmp     DWORD PTR [rbp-4], 0
    jle     .L2
    ; Channel 1.
    mov     rax, QWORD PTR [rbp-72]
    mov     edx, DWORD PTR [rax]
    mov     rax, QWORD PTR [rbp-40]
    lea     rcx, [rax+4]
    mov     QWORD PTR [rbp-40], rcx
    mov     DWORD PTR [rax], edx
    ; Skip the remaining channels when CHANNELS <= 1.
    cmp     DWORD PTR [rbp-4], 1
    jle     .L2
    ; Channel 2.
    mov     rax, QWORD PTR [rbp-64]
    mov     edx, DWORD PTR [rax]
    mov     rax, QWORD PTR [rbp-32]
    lea     rcx, [rax+4]
    mov     QWORD PTR [rbp-32], rcx
    mov     DWORD PTR [rax], edx
    ; Skip the last channel when CHANNELS <= 2.
    cmp     DWORD PTR [rbp-4], 2
    jle     .L2
    ; Channel 3.
    mov     rax, QWORD PTR [rbp-56]
    mov     edx, DWORD PTR [rax]
    mov     rax, QWORD PTR [rbp-24]
    lea     rcx, [rax+4]
    mov     QWORD PTR [rbp-24], rcx
    mov     DWORD PTR [rax], edx
    

    As you can see, the nested ifs execute fewer jumps, but current compilers can optimize the loop, and with the -O3 flag you will get something like this:

            ; -O3 listing: four straight-line copies, each guarded by a test of
            ; CHANNELS. NOTE(review): this appears to be the for loop fully
            ; unrolled, since the first guard is the i = 0 check; confirm against
            ; the compiled source. Slot roles are read off by matching against C:
            ;   [rsp+12]            = CHANNELS (re-read from the stack before each test)
            ;   [rsp+48/56/64/72]   = ADCVALS[0..3]
            ;   [rsp+16/24/32/40]   = BUFFER_IDX[0..3]
            ; Unlike the other listings, no advanced cursor is written back to
            ; the BUFFER_IDX slots here.
            ;
            ; Guard 0: exit when CHANNELS is negative.
            mov     eax, DWORD PTR [rsp+12]
            test    eax, eax
            js      .L2
            ; Channel 0: *BUFFER_IDX[0] = *ADCVALS[0]
            mov     rax, QWORD PTR [rsp+48]
            mov     edx, DWORD PTR [rax]
            mov     rax, QWORD PTR [rsp+16]
            mov     DWORD PTR [rax], edx
            ; Guard 1: exit when CHANNELS <= 0.
            mov     eax, DWORD PTR [rsp+12]
            test    eax, eax
            jle     .L2
            ; Channel 1.
            mov     rax, QWORD PTR [rsp+56]
            mov     edx, DWORD PTR [rax]
            mov     rax, QWORD PTR [rsp+24]
            mov     DWORD PTR [rax], edx
            ; Guard 2: sub sets the same flags as cmp eax, 1, so exit when CHANNELS <= 1.
            mov     eax, DWORD PTR [rsp+12]
            sub     eax, 1
            jle     .L2
            ; Channel 2.
            mov     rax, QWORD PTR [rsp+64]
            mov     edx, DWORD PTR [rax]
            mov     rax, QWORD PTR [rsp+32]
            mov     DWORD PTR [rax], edx
            ; Guard 3: exit when CHANNELS <= 2.
            mov     eax, DWORD PTR [rsp+12]
            cmp     eax, 2
            jle     .L2
            ; Channel 3.
            mov     rax, QWORD PTR [rsp+72]
            mov     edx, DWORD PTR [rax]
            mov     rax, QWORD PTR [rsp+40]
            mov     DWORD PTR [rax], edx
            ; CHANNELS is read once more; no branch on it follows in this excerpt.
            mov     eax, DWORD PTR [rsp+12]
    .L2:
    

    That has roughly the same performance as the nested ifs.