c switch-statement function-pointers goto

Why is `switch` so slow?

In a bytecode interpreting loop, after several tests, I'm surprised to see that using switch is the worst choice to make. Making calls to a function pointer array, or using gcc's computed goto extension is always 10~20% faster, the computed goto version being the fastest. I've tested with my real toy VM with 97 instructions and with the mini fake VM pasted below.

Why is using switch the slowest?

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>

enum {
    ADD1 = 1,
    ADD2,
    SUB3,
    SUB4,
    MUL5,
    MUL6,
};

static unsigned int number;

static void add1(void) {
    number += 1;
}

static void add2(void) {
    number += 2;
}

static void sub3(void) {
    number -= 3;
}

static void sub4(void) {
    number -= 4;
}

static void mul5(void) {
    number *= 5;
}

static void mul6(void) {
    number *= 6;
}

static void interpret_bytecodes_switch(uint8_t *bcs) {
    while (true) {
        switch (*bcs++) {
        case 0:
            return;
        case ADD1:
            add1();
            break;
        case ADD2:
            add2();
            break;
        case SUB3:
            sub3();
            break;
        case SUB4:
            sub4();
            break;
        case MUL5:
            mul5();
            break;
        case MUL6:
            mul6();
            break;
        }
    }
}

static void interpret_bytecodes_function_pointer(uint8_t *bcs) {
    void (*fs[])(void) = {
        NULL,
        add1,
        add2,
        sub3,
        sub4,
        mul5,
        mul6,
    };
    while (*bcs) {
        fs[*bcs++]();
    }
}

static void interpret_bytecodes_goto(uint8_t *bcs) {
    void *labels[] = {
        &&l_exit,
        &&l_add1,
        &&l_add2,
        &&l_sub3,
        &&l_sub4,
        &&l_mul5,
        &&l_mul6,
    };
    #define JUMP goto *labels[*bcs++]
    JUMP;
l_exit:
    return;
l_add1:
    add1();
    JUMP;
l_add2:
    add2();
    JUMP;
l_sub3:
    sub3();
    JUMP;
l_sub4:
    sub4();
    JUMP;
l_mul5:
    mul5();
    JUMP;
l_mul6:
    mul6();
    JUMP;
    #undef JUMP
}

struct timer {
    clock_t start, end;
};

static void timer_start(struct timer *self) {
    self->start = clock();
}

static void timer_end(struct timer *self) {
    self->end = clock();
}

static double timer_measure(struct timer *self) {
    return (double)(self->end - self->start) / CLOCKS_PER_SEC;
}

static void test(void (*f)(uint8_t *), uint8_t *bcs) {
    number = 0;
    struct timer timer;
    timer_start(&timer);
    f(bcs);
    timer_end(&timer);
    printf("%u %.3fs\n", number, timer_measure(&timer));
}

int main(void) {
    const int N = 300000000;
    srand((unsigned)time(NULL));
    uint8_t *bcs = malloc(N + 1);
    for (int i = 0; i < N; ++i) {
        bcs[i] = rand() % 5 + 1;
    }
    bcs[N] = 0;
    for (int i = 0; i < 10; ++i) {
        printf("%d ", bcs[i]);
    }
    printf("\nswitch\n");
    test(interpret_bytecodes_switch, bcs);
    printf("function pointer\n");
    test(interpret_bytecodes_function_pointer, bcs);
    printf("goto\n");
    test(interpret_bytecodes_goto, bcs);
    return 0;
}

result

~$ gcc vm.c -ovm -std=gnu11 -O3
~$ ./vm
3 4 5 3 3 5 3 3 1 2 
switch
3050839589 2.866s
function pointer
3050839589 2.573s
goto
3050839589 2.433s
~$ ./vm
3 1 1 3 5 5 2 4 5 1 
switch
3898179583 2.871s
function pointer
3898179583 2.573s
goto
3898179583 2.431s
~$ ./vm
5 5 1 2 3 3 1 2 2 4 
switch
954521520 2.869s
function pointer
954521520 2.574s
goto
954521520 2.432s

Below is the relevant disassembly of the code posted here after -O3 optimization.

interpret_bytecodes_switch:
.L8:
    addq    $1, %rdi
    cmpb    $6, -1(%rdi)
    ja  .L8
    movzbl  -1(%rdi), %edx
    jmp *.L11(,%rdx,8)
.L11:
    .quad   .L10
    .quad   .L12
    .quad   .L13
    .quad   .L14
    .quad   .L15
    .quad   .L16
    .quad   .L17
.L16:
    leal    (%rax,%rax,4), %eax
    jmp .L8
.L15:
    subl    $4, %eax
    jmp .L8
.L14:
    subl    $3, %eax
    jmp .L8
.L13:
    addl    $2, %eax
    jmp .L8
.L12:
    addl    $1, %eax
    jmp .L8
.L10:
    movl    %eax, number(%rip)
    ret
.L17:
    leal    (%rax,%rax,2), %eax
    addl    %eax, %eax
    jmp .L8

interpret_bytecodes_function_pointer:
    pushq   %rbx
    movq    %rdi, %rbx
    subq    $64, %rsp
    movzbl  (%rdi), %eax
    movq    $0, (%rsp)
    movq    $add1, 8(%rsp)
    movq    $add2, 16(%rsp)
    movq    $sub3, 24(%rsp)
    movq    $sub4, 32(%rsp)
    movq    $mul5, 40(%rsp)
    testb   %al, %al
    movq    $mul6, 48(%rsp)
    je  .L19
.L23:
    addq    $1, %rbx
    call    *(%rsp,%rax,8)
    movzbl  (%rbx), %eax
    testb   %al, %al
    jne .L23
.L19:
    addq    $64, %rsp
    popq    %rbx
    ret

interpret_bytecodes_goto:
    movzbl  (%rdi), %eax
    movq    $.L27, -72(%rsp)
    addq    $2, %rdi
    movq    $.L28, -64(%rsp)
    movq    $.L29, -56(%rsp)
    movq    $.L30, -48(%rsp)
    movq    $.L31, -40(%rsp)
    movq    $.L32, -32(%rsp)
    movq    $.L33, -24(%rsp)
    movq    -72(%rsp,%rax,8), %rax
    jmp *%rax
.L33:
    movl    number(%rip), %eax
    leal    (%rax,%rax,2), %eax
    addl    %eax, %eax
    movl    %eax, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
.L35:
    addq    $1, %rdi
    jmp *%rax
.L28:
    addl    $1, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L30:
    subl    $3, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L31:
    subl    $4, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L32:
    movl    number(%rip), %eax
    leal    (%rax,%rax,4), %eax
    movl    %eax, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L29:
    addl    $2, number(%rip)
    movzbl  -1(%rdi), %eax
    movq    -72(%rsp,%rax,8), %rax
    jmp .L35
.L27:
    rep ret

Solution

switch is slowest because it has to manage default cases and this may add an extra bounds test you didn't implemented.

switch also handles a more general case where case values are not arranged in a so simple sequence, extra computation may be needed for that.