In a bytecode interpreting loop, after several tests, I'm surprised to see that using switch is the worst choice to make. Making calls through a function pointer array, or using gcc's computed goto extension, is always 10-20% faster, with the computed goto version being the fastest. I've tested both with my real toy VM, which has 97 instructions, and with the mini fake VM pasted below.

Why is using switch the slowest?
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>

enum {
    ADD1 = 1, /* opcode 0 is reserved as the end-of-stream marker */
    ADD2,
    SUB3,
    SUB4,
    MUL5,
    MUL6,
};
static unsigned int number;

static void add1(void) {
    number += 1;
}

static void add2(void) {
    number += 2;
}

static void sub3(void) {
    number -= 3;
}

static void sub4(void) {
    number -= 4;
}

static void mul5(void) {
    number *= 5;
}

static void mul6(void) {
    number *= 6;
}
static void interpret_bytecodes_switch(uint8_t *bcs) {
    while (true) {
        switch (*bcs++) {
        case 0: /* end-of-stream marker */
            return;
        case ADD1:
            add1();
            break;
        case ADD2:
            add2();
            break;
        case SUB3:
            sub3();
            break;
        case SUB4:
            sub4();
            break;
        case MUL5:
            mul5();
            break;
        case MUL6:
            mul6();
            break;
        }
    }
}
static void interpret_bytecodes_function_pointer(uint8_t *bcs) {
    void (*fs[])(void) = {
        NULL, /* slot 0 is never called; the loop condition stops first */
        add1,
        add2,
        sub3,
        sub4,
        mul5,
        mul6,
    };
    while (*bcs) {
        fs[*bcs++]();
    }
}
static void interpret_bytecodes_goto(uint8_t *bcs) {
    /* gcc's labels-as-values extension: each handler ends by jumping
       directly to the next opcode's label */
    void *labels[] = {
        &&l_exit,
        &&l_add1,
        &&l_add2,
        &&l_sub3,
        &&l_sub4,
        &&l_mul5,
        &&l_mul6,
    };
#define JUMP goto *labels[*bcs++]
    JUMP;
l_exit:
    return;
l_add1:
    add1();
    JUMP;
l_add2:
    add2();
    JUMP;
l_sub3:
    sub3();
    JUMP;
l_sub4:
    sub4();
    JUMP;
l_mul5:
    mul5();
    JUMP;
l_mul6:
    mul6();
    JUMP;
#undef JUMP
}
struct timer {
    clock_t start, end;
};

static void timer_start(struct timer *self) {
    self->start = clock();
}

static void timer_end(struct timer *self) {
    self->end = clock();
}

static double timer_measure(struct timer *self) {
    return (double)(self->end - self->start) / CLOCKS_PER_SEC;
}

static void test(void (*f)(uint8_t *), uint8_t *bcs) {
    number = 0;
    struct timer timer;
    timer_start(&timer);
    f(bcs);
    timer_end(&timer);
    printf("%u %.3fs\n", number, timer_measure(&timer));
}
int main(void) {
    const int N = 300000000;
    srand((unsigned)time(NULL));
    uint8_t *bcs = malloc(N + 1);
    for (int i = 0; i < N; ++i) {
        bcs[i] = rand() % 5 + 1; /* random opcodes ADD1..MUL5 (MUL6 is never emitted) */
    }
    bcs[N] = 0; /* end-of-stream marker */
    for (int i = 0; i < 10; ++i) {
        printf("%d ", bcs[i]);
    }
    printf("\nswitch\n");
    test(interpret_bytecodes_switch, bcs);
    printf("function pointer\n");
    test(interpret_bytecodes_function_pointer, bcs);
    printf("goto\n");
    test(interpret_bytecodes_goto, bcs);
    return 0;
}
Result:
~$ gcc vm.c -ovm -std=gnu11 -O3
~$ ./vm
3 4 5 3 3 5 3 3 1 2
switch
3050839589 2.866s
function pointer
3050839589 2.573s
goto
3050839589 2.433s
~$ ./vm
3 1 1 3 5 5 2 4 5 1
switch
3898179583 2.871s
function pointer
3898179583 2.573s
goto
3898179583 2.431s
~$ ./vm
5 5 1 2 3 3 1 2 2 4
switch
954521520 2.869s
function pointer
954521520 2.574s
goto
954521520 2.432s
Below is the relevant disassembly of the code posted here after -O3 optimization.
interpret_bytecodes_switch:
.L8:
        addq    $1, %rdi
        cmpb    $6, -1(%rdi)
        ja      .L8
        movzbl  -1(%rdi), %edx
        jmp     *.L11(,%rdx,8)
.L11:
        .quad   .L10
        .quad   .L12
        .quad   .L13
        .quad   .L14
        .quad   .L15
        .quad   .L16
        .quad   .L17
.L16:
        leal    (%rax,%rax,4), %eax
        jmp     .L8
.L15:
        subl    $4, %eax
        jmp     .L8
.L14:
        subl    $3, %eax
        jmp     .L8
.L13:
        addl    $2, %eax
        jmp     .L8
.L12:
        addl    $1, %eax
        jmp     .L8
.L10:
        movl    %eax, number(%rip)
        ret
.L17:
        leal    (%rax,%rax,2), %eax
        addl    %eax, %eax
        jmp     .L8
interpret_bytecodes_function_pointer:
        pushq   %rbx
        movq    %rdi, %rbx
        subq    $64, %rsp
        movzbl  (%rdi), %eax
        movq    $0, (%rsp)
        movq    $add1, 8(%rsp)
        movq    $add2, 16(%rsp)
        movq    $sub3, 24(%rsp)
        movq    $sub4, 32(%rsp)
        movq    $mul5, 40(%rsp)
        testb   %al, %al
        movq    $mul6, 48(%rsp)
        je      .L19
.L23:
        addq    $1, %rbx
        call    *(%rsp,%rax,8)
        movzbl  (%rbx), %eax
        testb   %al, %al
        jne     .L23
.L19:
        addq    $64, %rsp
        popq    %rbx
        ret
interpret_bytecodes_goto:
        movzbl  (%rdi), %eax
        movq    $.L27, -72(%rsp)
        addq    $2, %rdi
        movq    $.L28, -64(%rsp)
        movq    $.L29, -56(%rsp)
        movq    $.L30, -48(%rsp)
        movq    $.L31, -40(%rsp)
        movq    $.L32, -32(%rsp)
        movq    $.L33, -24(%rsp)
        movq    -72(%rsp,%rax,8), %rax
        jmp     *%rax
.L33:
        movl    number(%rip), %eax
        leal    (%rax,%rax,2), %eax
        addl    %eax, %eax
        movl    %eax, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
.L35:
        addq    $1, %rdi
        jmp     *%rax
.L28:
        addl    $1, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L30:
        subl    $3, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L31:
        subl    $4, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L32:
        movl    number(%rip), %eax
        leal    (%rax,%rax,4), %eax
        movl    %eax, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L29:
        addl    $2, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L27:
        rep ret
switch is the slowest because it has to manage default cases, and that may add a bounds test you didn't implement yourself. switch also handles the more general case where case values are not arranged in such a simple sequence, so extra computation may be needed for that.

You can see the bounds test in the disassembly above: before its indirect jump, the switch version executes cmpb $6, -1(%rdi) followed by ja .L8, looping back for any out-of-range opcode, while the function pointer and goto versions index their tables unconditionally.
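If the bounds test is really the cost, you can often remove it by promising the compiler that no other opcode values ever occur. Here is a minimal sketch of a drop-in variant of interpret_bytecodes_switch from the program above, assuming GCC or Clang (__builtin_unreachable() is a GCC/Clang builtin; whether the range check actually disappears depends on the compiler version, so check the generated assembly):

static void interpret_bytecodes_switch_nocheck(uint8_t *bcs) {
    while (true) {
        switch (*bcs++) {
        case 0:
            return;
        case ADD1: add1(); break;
        case ADD2: add2(); break;
        case SUB3: sub3(); break;
        case SUB4: sub4(); break;
        case MUL5: mul5(); break;
        case MUL6: mul6(); break;
        default:
            /* Promise the compiler every opcode is in 0..MUL6. With
               this, it can drop the cmpb/ja range check and jump
               through the table directly. Note this is undefined
               behavior if the bytecode ever breaks the promise. */
            __builtin_unreachable();
        }
    }
}

Even with the check gone, a residual difference can remain: in your disassembly, every computed-goto handler ends with its own table load and jmp *%rax, whereas all switch handlers jump back to the single shared dispatch point at .L8, and having one indirect branch per handler can help indirect-branch prediction.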