Search code examples
cassemblyx86reverse-engineering

Three versions of the same C program, why is the first one so fast?


Here is a very simple C program:

int main()
{
    int n = 0;

    while(n != 1000000000){
        n += 1;
    }

return n;

}

I compiled it with Clang and timed it. It ran in 4.711095243692398e-06 seconds, or 0.000004711095243692398 seconds.

Next I output the C program to Intel syntax assembly language using the Godbolt compiler explorer (https://godbolt.org) to remove the .cfi directives:

.file   "Svx.c"
.intel_syntax noprefix
.text
.globl  main
.type   main, @function
main:
    push    rbp
    mov rbp, rsp
    mov DWORD PTR -4[rbp], 0
    jmp .L2
.L3:
    add DWORD PTR -4[rbp], 1
.L2:
    cmp DWORD PTR -4[rbp], 1000000000
    jne .L3
    mov eax, DWORD PTR -4[rbp]
    pop rbp
    ret
    .size   main, .-main
    .ident  "GCC: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0"
    .section    .note.GNU-stack,"",@progbits

I compiled it with GCC and timed it. The result was 1.96 seconds -- vastly slower than the Clang version.

Finally, I created my own assembly version:

[BITS 64]
[default rel]

global main:function

section .data align=16

section .text

main:
xor rax,rax
l_01:
cmp rax,1000000000
je l_02
add rax,5
jmp l_01
l_02:
ret

I compiled it with nasm and linked it with ld:

sudo nasm -felf64 Svx.asm

sudo ld -shared Svx.o -o Svx.so

and timed it. It ran in 0.14707629615440965 seconds.

Why does the C version run so fast if the reverse-compiled version runs vastly slower (0.0000047 seconds vs 1.96 seconds) and my NASM version runs in 0.147 seconds? I have a feeling that the result from the C version at 0.0000047 seconds is wrong; it seems impossibly fast. This is its Clang output to assembly language:

    .text
    .intel_syntax noprefix
    .file   "Svx.c"
    .globl  main                     # -- Begin function main
    .p2align    4, 0x90
    .type   main,@function
main:                                    # @main
    .cfi_startproc
# %bb.0:
    push    rbp
    .cfi_def_cfa_offset 16
    .cfi_offset rbp, -16
    mov rbp, rsp
    .cfi_def_cfa_register rbp
    mov dword ptr [rbp - 4], 0
.LBB0_1:                                # =>This Inner Loop Header:     Depth=1
    cmp dword ptr [rbp - 4], 1000000000
    je  .LBB0_3
# %bb.2:                                #   in Loop: Header=BB0_1   Depth=1
    mov eax, dword ptr [rbp - 4]
    add eax, 1
    mov dword ptr [rbp - 4], eax
    jmp .LBB0_1
.LBB0_3:
    mov eax, dword ptr [rbp - 4]
    pop rbp
    .cfi_def_cfa rsp, 8
    ret
.Lfunc_end0:
    .size   main, .Lfunc_end0-main
    .cfi_endproc
                                        # -- End function
    .ident  "clang version 8.0.0-3~ubuntu18.04.1 (tags/RELEASE_800/final)"
    .section    ".note.GNU-stack","",@progbits
    .addrsig

The listing shows they are using the stack for variables, not a register, which is (usually) slower.

The speed, at 0.0000047 seconds, seems impossibly fast to count to a billion. If that speed is correct, what's its secret? Reverse engineering doesn't reveal anything and in fact the Godbolt version is much slower.


Solution

  • Clang is just realizing that this loop is run 1000000000 times and doing the equivalent of return 1000000000;.

    My output with -O3 as you specified that you're using:

            .text
            .file   "test.c"
            .globl  main                    # -- Begin function main
            .p2align        4, 0x90
            .type   main,@function
    main:                                   # @main
            .cfi_startproc
    # %bb.0:
            movl    $1000000000, %eax       # imm = 0x3B9ACA00
            retq
    .Lfunc_end0:
            .size   main, .Lfunc_end0-main
            .cfi_endproc
                                            # -- End function
    
            .ident  "clang version 8.0.0 (tags/RELEASE_800/final)"
            .section        ".note.GNU-stack","",@progbits
            .addrsig
    

    Notice the contents of main:

    # %bb.0:
            movl    $1000000000, %eax       # imm = 0x3B9ACA00
            retq
    

    This removes the loop entirely and just returns 1000000000.

    A trick to get around this is to use volatile:

    int main(void)
    {
        volatile int n = 0;
    
        while(n != 1000000000) {
            n += 1;
        }
    
        return n;
    }
    

    Output (again with -O3):

            .text
            .file   "test.c"
            .globl  main                    # -- Begin function main
            .p2align        4, 0x90
            .type   main,@function
    main:                                   # @main
            .cfi_startproc
    # %bb.0:
            movl    $0, -4(%rsp)
            movl    -4(%rsp), %ecx
            movl    -4(%rsp), %eax
            cmpl    $1000000000, %ecx       # imm = 0x3B9ACA00
            je      .LBB0_3
            .p2align        4, 0x90
    .LBB0_1:                                # =>This Inner Loop Header: Depth=1
            addl    $1, %eax
            movl    %eax, -4(%rsp)
            movl    -4(%rsp), %ecx
            movl    -4(%rsp), %eax
            cmpl    $1000000000, %ecx       # imm = 0x3B9ACA00
            jne     .LBB0_1
    .LBB0_3:
            retq
    .Lfunc_end0:
            .size   main, .Lfunc_end0-main
            .cfi_endproc
                                            # -- End function
    
            .ident  "clang version 8.0.0 (tags/RELEASE_800/final)"
            .section        ".note.GNU-stack","",@progbits
            .addrsig