Search code examples
assemblyx86segmentation-faultbrainfuck

Why does a program created by a Brainfuck into assembly compiler crash?


I'm writing a Brainfuck to NASM compiler in Haskell. It can compile small programs, but fails to do so correctly with big ones.

Consider the following Brainfuck code:

++++++++[>++++[>++>+++>+++>+<<<<-]>+>+>->>+[<]<-]>>.>---.+++++++..+++.>>.<-.<.+++.------.--------.>>+.>++.

I have it represented as:

BfSource [Add 8,LoopL 0,GoRight 1,Add 4,LoopL 1,GoRight 1,Add 2,GoRight 1,Add 3,GoRight 1,Add 3,GoRight 1,Add 1,GoLeft 4,Sub 1,LoopR 1,GoRight 1,Add 1,GoRight 1,Add 1,GoRight 1,Sub 1,GoRight 2,Add 1,LoopL 2,GoLeft 1,LoopR 2,GoLeft 1,Sub 1,LoopR 0,GoRight 2,WriteChar,GoRight 1,Sub 3,WriteChar,Add 7,WriteChar,WriteChar,Add 3,WriteChar,GoRight 2,WriteChar,GoLeft 1,Sub 1,WriteChar,GoLeft 1,WriteChar,Add 3,WriteChar,Sub 6,WriteChar,Sub 8,WriteChar,GoRight 2,Add 1,WriteChar,GoRight 1,Add 2,WriteChar]

Which gets translated into the following assembly:

section .bss
    memory resb 30000
section .text
    global _start
_printChar:
    mov rdx, 1
    mov rbx, 1
    mov rax, 4
    int 80h
    ret
_start:
    mov rcx, memory
    mov al, [rcx]
    add al, 8
    mov [rcx], al
_L0:
    inc rcx
    mov al, [rcx]
    add al, 4
    mov [rcx], al
_L1:
    inc rcx
    mov al, [rcx]
    add al, 2
    mov [rcx], al
    inc rcx
    mov al, [rcx]
    add al, 3
    mov [rcx], al
    inc rcx
    mov al, [rcx]
    add al, 3
    mov [rcx], al
    inc rcx
    mov al, [rcx]
    inc al
    mov [rcx], al
    sub rcx, 4
    mov al, [rcx]
    dec al
    mov [rcx], al
    mov al, [rcx]
    test al, al
    jnz _L1
    inc rcx
    mov al, [rcx]
    inc al
    mov [rcx], al
    inc rcx
    mov al, [rcx]
    inc al
    mov [rcx], al
    inc rcx
    mov al, [rcx]
    dec al
    mov [rcx], al
    add rcx, 2
    mov al, [rcx]
    inc al
    mov [rcx], al
_L2:
    dec rcx
    mov al, [rcx]
    test al, al
    jnz _L2
    dec rcx
    mov al, [rcx]
    dec al
    mov [rcx], al
    mov al, [rcx]
    test al, al
    jnz _L0
    add rcx, 2
    call _printChar
    inc rcx
    mov al, [rcx]
    sub al, 3
    mov [rcx], al
    call _printChar
    mov al, [rcx]
    add al, 7
    mov [rcx], al
    call _printChar
    call _printChar
    mov al, [rcx]
    add al, 3
    mov [rcx], al
    call _printChar
    add rcx, 2
    call _printChar
    dec rcx
    mov al, [rcx]
    dec al
    mov [rcx], al
    call _printChar
    dec rcx
    call _printChar
    mov al, [rcx]
    add al, 3
    mov [rcx], al
    call _printChar
    mov al, [rcx]
    sub al, 6
    mov [rcx], al
    call _printChar
    mov al, [rcx]
    sub al, 8
    mov [rcx], al
    call _printChar
    add rcx, 2
    mov al, [rcx]
    inc al
    mov [rcx], al
    call _printChar
    inc rcx
    mov al, [rcx]
    add al, 2
    mov [rcx], al
    call _printChar
    mov rax, 1
    xor rbx, rbx
    int 80h

And here's how it behaves:

$ runghc Main.hs hello.bf
$ nasm -f elf64 hello.nasm
$ ld -m elf_x86_64 hello.o -o hello
$ ./hello
Hello World!

It works as it should. But when trying to compile a bigger program (in this case a mandelbrot fractal generator) a segfault occurs. I'm 100% sure this code works, because I've checked it in an online Brainfuck interpreter.

$ runghc Main.hs mandelbrot.bf
$ nasm -f elf64 mandelbrot.nasm
$ ld -m elf_x86_64 mandelbrot.o -o mandelbrot
$ ./mandelbrot
Segmentation fault

Using pwndbg I've found the place where segfault occurs:

────────────────────[ REGISTERS ]────────────────────
 RAX  0x1
 RBX  0x0
 RCX  0x404fff ◂— add    byte ptr [rax], al
 // ... All other registers are 0x0
 RSP  0x7fffffffe0f0 ◂— 0x1
 RIP  0x4014b6 (_L43+33) ◂— mov    byte ptr [rcx], al
──────────────────────[ DISASM ]─────────────────────
 ► 0x4014b6 <_L43+33>    mov    byte ptr [rcx], al
   0x4014b8 <_L43+35>    add    rcx, 8
   0x4014bc <_L43+39>    mov    al, byte ptr [rcx]
   0x4014be <_L43+41>    test   al, al
   0x4014c0 <_L43+43>    jne    _L33 <0x4013a6>

   0x4014c6 <_L43+49>    sub    rcx, 9
   0x4014ca <_L44>       inc    rcx
   0x4014cd <_L44+3>     xor    al, al
   0x4014cf <_L44+5>     mov    byte ptr [rcx], al
   0x4014d1 <_L44+7>     dec    rcx
   0x4014d4 <_L44+10>    mov    al, byte ptr [rcx]

I Ctrl+F'd that _L33 in a text editor and what I've found is similar, but different code (all the labels generated by my compiler are unique, so it has to be the same place).

    mov [rcx], al
    add rcx, 8
    mov al, [rcx]
    test al, al
    jnz _L33
    sub rcx, 9
_L44:
    inc rcx
    xor al, al
    mov [rcx], al
    dec rcx
    mov al, [rcx]

So what is going on here? Is NASM generating different assembly than in the source file? Or maybe pwndbg disassembled it incorrectly? I'd say something is wrong with my compiler, but I don't know what.


EDIT: I think cut and pasting two ~100 lines of code files is not a good idea, considering this post is already too long.

I've uploaded the source code to a GitHub repository, please take a look at it.


Solution

  • Your code generator miscompiles loops:

    bf2asm handle (LoopL x) =
        hPutStrLn handle $ "_L" ++ show x ++ ":"
    bf2asm handle (LoopR x) =
        mapM_ (hPutStrLn handle)
            [ "    mov al, [rcx]"
            , "    test al, al"
            , "    jnz _L" ++ show x
            ]
    

    As you can see, it puts the test of the current byte at the end of the loop, creating the equivalent of a do-while loop:

    do {  // [
        ...
    } while (*ecx);  // ]
    

    But the semantics of [ ] in brainfuck are that the loop test is done first, like this:

    while (*ecx) {  // [
        ...
    }  // ]
    

    You should change your compiler to produce something like this instead:

    _LS42:
        mov al, [rcx]
        test al, al
        jz _LE42
    ...
        jmp _LS42
    _LE42: