Here is a very simple C program:
int main()
{
int n = 0;
while(n != 1000000000){
n += 1;
}
return n;
}
I compiled it with Clang and timed it. It ran in 4.711095243692398e-06
seconds, or 0.000004711095243692398
seconds.
Next I output the C program to Intel syntax assembly language using the Godbolt compiler explorer (https://godbolt.org) to remove the .cfi
directives:
.file "Svx.c"
.intel_syntax noprefix
.text
.globl main
.type main, @function
main:
push rbp
mov rbp, rsp
mov DWORD PTR -4[rbp], 0
jmp .L2
.L3:
add DWORD PTR -4[rbp], 1
.L2:
cmp DWORD PTR -4[rbp], 1000000000
jne .L3
mov eax, DWORD PTR -4[rbp]
pop rbp
ret
.size main, .-main
.ident "GCC: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0"
.section .note.GNU-stack,"",@progbits
I compiled it with GCC and timed it. The result was 1.96
seconds -- vastly slower than the Clang version.
Finally, I created my own assembly version:
[BITS 64]
[default rel]
global main:function
section .data align=16
section .text
main:
xor rax,rax
l_01:
cmp rax,1000000000
je l_02
add rax,5
jmp l_01
l_02:
ret
I compiled it with nasm
and linked it with ld
:
sudo nasm -felf64 Svx.asm
sudo ld -shared Svx.o -o Svx.so
and timed it. It ran in 0.14707629615440965
seconds.
Why does the C version run so fast if the reverse-compiled version runs vastly slower (0.0000047
seconds vs 1.96
seconds) and my NASM version runs in 0.147
seconds? I have a feeling that the result from the C version at 0.0000047
seconds is wrong; it seems impossibly fast. This is its Clang output to assembly language:
.text
.intel_syntax noprefix
.file "Svx.c"
.globl main # -- Begin function main
.p2align 4, 0x90
.type main,@function
main: # @main
.cfi_startproc
# %bb.0:
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
mov rbp, rsp
.cfi_def_cfa_register rbp
mov dword ptr [rbp - 4], 0
.LBB0_1: # =>This Inner Loop Header: Depth=1
cmp dword ptr [rbp - 4], 1000000000
je .LBB0_3
# %bb.2: # in Loop: Header=BB0_1 Depth=1
mov eax, dword ptr [rbp - 4]
add eax, 1
mov dword ptr [rbp - 4], eax
jmp .LBB0_1
.LBB0_3:
mov eax, dword ptr [rbp - 4]
pop rbp
.cfi_def_cfa rsp, 8
ret
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
# -- End function
.ident "clang version 8.0.0-3~ubuntu18.04.1 (tags/RELEASE_800/final)"
.section ".note.GNU-stack","",@progbits
.addrsig
The listing shows they are using the stack for variables, not a register, which is (usually) slower.
The speed, at 0.0000047
seconds, seems impossibly fast to count to a billion. If that speed is correct, what's its secret? Reverse engineering doesn't reveal anything and in fact the Godbolt version is much slower.
Clang is just realizing that this loop is run 1000000000
times and doing the equivalent of return 1000000000;
.
My output with -O3
as you specified that you're using:
.text
.file "test.c"
.globl main # -- Begin function main
.p2align 4, 0x90
.type main,@function
main: # @main
.cfi_startproc
# %bb.0:
movl $1000000000, %eax # imm = 0x3B9ACA00
retq
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
# -- End function
.ident "clang version 8.0.0 (tags/RELEASE_800/final)"
.section ".note.GNU-stack","",@progbits
.addrsig
Notice the contents of main
:
# %bb.0:
movl $1000000000, %eax # imm = 0x3B9ACA00
retq
This removes the loop entirely and just returns 1000000000
.
A trick to get around this is to use volatile
:
int main(void)
{
volatile int n = 0;
while(n != 1000000000) {
n += 1;
}
return n;
}
Output (again with -O3
):
.text
.file "test.c"
.globl main # -- Begin function main
.p2align 4, 0x90
.type main,@function
main: # @main
.cfi_startproc
# %bb.0:
movl $0, -4(%rsp)
movl -4(%rsp), %ecx
movl -4(%rsp), %eax
cmpl $1000000000, %ecx # imm = 0x3B9ACA00
je .LBB0_3
.p2align 4, 0x90
.LBB0_1: # =>This Inner Loop Header: Depth=1
addl $1, %eax
movl %eax, -4(%rsp)
movl -4(%rsp), %ecx
movl -4(%rsp), %eax
cmpl $1000000000, %ecx # imm = 0x3B9ACA00
jne .LBB0_1
.LBB0_3:
retq
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
# -- End function
.ident "clang version 8.0.0 (tags/RELEASE_800/final)"
.section ".note.GNU-stack","",@progbits
.addrsig