I'm trying out building a compiler with LLVM (inkwell crate in rust) by implementing a Brainf*** compiler/jit, dumping a main and a program function into a module and compiling. Both the JIT engine with "OptimizationLevel::Aggressive" and unoptimized external compilation (see below for the steps) seem to work and Brainf*** programs of significant size (drawing mandelbrot, embedded oop and other example programs) run correctly, but enabling any optimization level in LLVM opt reduces the code to a wrong "constant expression".
An example: the following Brainf*** program >++++++[<++++++++>-]<.
should print out the letter 0
(loading 6 * 8 = 48 = ASCII '0' into stack location 0 and then print). My compiler generates the following unoptimized LLVM IR module:
; ModuleID = 'my_module'
source_filename = "stdin"
define void @program(i8* %0) {
entry:
%index = alloca i32, align 4 ; allocate index
; increment index `>`
%offset = load i32, i32* %index, align 4
%incremented = add i32 %offset, 1
store i32 %incremented, i32* %index, align 4
; increment cell `+` (6 times) ...
%1 = load i32, i32* %index, align 4
%2 = getelementptr i8, i8* %0, i32 %1
%3 = load i8, i8* %2, align 1
%4 = add i8 %3, 1
store i8 %4, i8* %2, align 1
%5 = load i32, i32* %index, align 4
%6 = getelementptr i8, i8* %0, i32 %5
%7 = load i8, i8* %6, align 1
%8 = add i8 %7, 1
store i8 %8, i8* %6, align 1
%9 = load i32, i32* %index, align 4
%10 = getelementptr i8, i8* %0, i32 %9
%11 = load i8, i8* %10, align 1
%12 = add i8 %11, 1
store i8 %12, i8* %10, align 1
%13 = load i32, i32* %index, align 4
%14 = getelementptr i8, i8* %0, i32 %13
%15 = load i8, i8* %14, align 1
%16 = add i8 %15, 1
store i8 %16, i8* %14, align 1
%17 = load i32, i32* %index, align 4
%18 = getelementptr i8, i8* %0, i32 %17
%19 = load i8, i8* %18, align 1
%20 = add i8 %19, 1
store i8 %20, i8* %18, align 1
%21 = load i32, i32* %index, align 4
%22 = getelementptr i8, i8* %0, i32 %21
%23 = load i8, i8* %22, align 1
%24 = add i8 %23, 1
store i8 %24, i8* %22, align 1
; ...done.
; goto branch condition
br label %cond0
cond0: ; preds = %block0, %entry
; conditional `[`
%offset1 = load i32, i32* %index, align 4
%ptr = getelementptr i8, i8* %0, i32 %offset1
%cell = load i8, i8* %ptr, align 1
%25 = icmp eq i8 %cell, 0
; jump to after matching `]` if cell is zero
br i1 %25, label %after0, label %block0
block0: ; preds = %cond0
; decrement index `<`
%offset2 = load i32, i32* %index, align 4
%decremented = sub i32 %offset2, 1
store i32 %decremented, i32* %index, align 4
; increment current cell `+` (8 times) ...
%26 = load i32, i32* %index, align 4
%27 = getelementptr i8, i8* %0, i32 %26
%28 = load i8, i8* %27, align 1
%29 = add i8 %28, 1
store i8 %29, i8* %27, align 1
%30 = load i32, i32* %index, align 4
%31 = getelementptr i8, i8* %0, i32 %30
%32 = load i8, i8* %31, align 1
%33 = add i8 %32, 1
store i8 %33, i8* %31, align 1
%34 = load i32, i32* %index, align 4
%35 = getelementptr i8, i8* %0, i32 %34
%36 = load i8, i8* %35, align 1
%37 = add i8 %36, 1
store i8 %37, i8* %35, align 1
%38 = load i32, i32* %index, align 4
%39 = getelementptr i8, i8* %0, i32 %38
%40 = load i8, i8* %39, align 1
%41 = add i8 %40, 1
store i8 %41, i8* %39, align 1
%42 = load i32, i32* %index, align 4
%43 = getelementptr i8, i8* %0, i32 %42
%44 = load i8, i8* %43, align 1
%45 = add i8 %44, 1
store i8 %45, i8* %43, align 1
%46 = load i32, i32* %index, align 4
%47 = getelementptr i8, i8* %0, i32 %46
%48 = load i8, i8* %47, align 1
%49 = add i8 %48, 1
store i8 %49, i8* %47, align 1
%50 = load i32, i32* %index, align 4
%51 = getelementptr i8, i8* %0, i32 %50
%52 = load i8, i8* %51, align 1
%53 = add i8 %52, 1
store i8 %53, i8* %51, align 1
%54 = load i32, i32* %index, align 4
%55 = getelementptr i8, i8* %0, i32 %54
%56 = load i8, i8* %55, align 1
%57 = add i8 %56, 1
store i8 %57, i8* %55, align 1
; ...done.
; increment index `>`
%offset3 = load i32, i32* %index, align 4
%incremented4 = add i32 %offset3, 1
store i32 %incremented4, i32* %index, align 4
; decrement cell `-`
%offset5 = load i32, i32* %index, align 4
%ptr6 = getelementptr i8, i8* %0, i32 %offset5
%cell7 = load i8, i8* %ptr6, align 1
%dec_cell = sub i8 %cell7, 1
store i8 %dec_cell, i8* %ptr6, align 1
; unconditional jump `]` to matching `[`
br label %cond0
after0: ; preds = %cond0
; decrement index `<`
%offset8 = load i32, i32* %index, align 4
%decremented9 = sub i32 %offset8, 1
store i32 %decremented9, i32* %index, align 4
; print current cell `.`
%offset10 = load i32, i32* %index, align 4
%ptr11 = getelementptr i8, i8* %0, i32 %offset10
%cell12 = load i8, i8* %ptr11, align 1
%58 = sext i8 %cell12 to i32
%print = call i32 @putchar(i32 %58)
; return
ret void
}
declare i32 @putchar(i32) ; libc extern
declare i32 @getchar() ; libc extern
define i32 @main() {
entry:
; allocate bf stack
%0 = trunc i64 100000 to i32
%mallocsize = mul i32 %0, ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i32)
%stack = tail call i8* @malloc(i32 %mallocsize)
call void @llvm.memset.p0i8.i64(i8* align 1 %stack, i8 0, i64 100000, i1 false)
; call compiled bf function
call void @program(i8* %stack)
; return EXIT_SUCCESS
ret i32 0
}
declare noalias i8* @malloc(i32)
; Function Attrs: argmemonly nofree nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #0
attributes #0 = { argmemonly nofree nounwind willreturn writeonly }
This module runs both through LLVMJIT as well as the following compile commands (with out.ll being this file):
llc out.ll -o out.asm
clang out.asm -o out
./out
-- 0
but as soon as I add an optimizer pass:
opt out.ll --O1 -S -o opt.ll
llc opt.ll -o opt.asm
clang opt.asm -o opt
./opt
--
the program doesn't work and the following (obviously wrong) "optimized" module is generated:
; ModuleID = 'out.ll'
source_filename = "stdin"
; Function Attrs: nofree nounwind
define void @program(i8* nocapture %0) local_unnamed_addr #0 {
entry:
store i8 0, i8* %0, align 1
%print = call i32 @putchar(i32 0)
ret void
}
; Function Attrs: nofree nounwind
declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #0
; Function Attrs: nofree nounwind
define i32 @main() local_unnamed_addr #0 {
entry:
%print.i = call i32 @putchar(i32 0) #1
ret i32 0
}
attributes #0 = { nofree nounwind }
attributes #1 = { nounwind }
Considering that the optimizer happily removes my code I must have violated some assumptions of the SSA code, but I can't figure out which. I alloca'd the only mutable variable (to avoid phi nodes) and I think my getelementptr and branch logic is correct; I also played around with different linkage flags to see if something changes, but I think I must have those correct, because else the CRT wouldn't find the main function. opt -print-after-all also didn't really help me understand the optimizations.
Could somebody tell me what I'm doing wrong here?
(PS: FYI going directly through clang -O<N> out.ll -o out
without opt gives the same results.)
%index = alloca i32, align 4 ; allocate index
; increment index `>`
%offset = load i32, i32* %index, align 4
%offset
is now undef
because you never stored a value into the alloca'd memory.
If you check opt -sroa
over your code, you can see this:
define void @program(i8* %0) {
entry:
%incremented = add i32 undef, 1
%1 = getelementptr i8, i8* %0, i32 %incremented
and so on.