compiler-construction llvm compiler-optimization llvm-ir

LLVM opt removes all my code (but runs correctly when unoptimized)

I'm trying out building a compiler with LLVM (inkwell crate in rust) by implementing a Brainf*** compiler/jit, dumping a main and a program function into a module and compiling. Both the JIT engine with "OptimizationLevel::Aggressive" and unoptimized external compilation (see below for the steps) seem to work and Brainf*** programs of significant size (drawing mandelbrot, embedded oop and other example programs) run correctly, but enabling any optimization level in LLVM opt reduces the code to a wrong "constant expression".

An example: the following Brainf*** program >++++++[<++++++++>-]<. should print out the letter 0 (loading 6 * 8 = 48 = ASCII '0' into stack location 0 and then print). My compiler generates the following unoptimized LLVM IR module:

; ModuleID = 'my_module'
source_filename = "stdin"

define void @program(i8* %0) {
entry:
  %index = alloca i32, align 4 ;  allocate index
  ;  increment index `>`
  %offset = load i32, i32* %index, align 4
  %incremented = add i32 %offset, 1
  store i32 %incremented, i32* %index, align 4
  ;  increment cell `+` (6 times) ...
  %1 = load i32, i32* %index, align 4
  %2 = getelementptr i8, i8* %0, i32 %1
  %3 = load i8, i8* %2, align 1
  %4 = add i8 %3, 1
  store i8 %4, i8* %2, align 1
  %5 = load i32, i32* %index, align 4
  %6 = getelementptr i8, i8* %0, i32 %5
  %7 = load i8, i8* %6, align 1
  %8 = add i8 %7, 1
  store i8 %8, i8* %6, align 1
  %9 = load i32, i32* %index, align 4
  %10 = getelementptr i8, i8* %0, i32 %9
  %11 = load i8, i8* %10, align 1
  %12 = add i8 %11, 1
  store i8 %12, i8* %10, align 1
  %13 = load i32, i32* %index, align 4
  %14 = getelementptr i8, i8* %0, i32 %13
  %15 = load i8, i8* %14, align 1
  %16 = add i8 %15, 1
  store i8 %16, i8* %14, align 1
  %17 = load i32, i32* %index, align 4
  %18 = getelementptr i8, i8* %0, i32 %17
  %19 = load i8, i8* %18, align 1
  %20 = add i8 %19, 1
  store i8 %20, i8* %18, align 1
  %21 = load i32, i32* %index, align 4
  %22 = getelementptr i8, i8* %0, i32 %21
  %23 = load i8, i8* %22, align 1
  %24 = add i8 %23, 1
  store i8 %24, i8* %22, align 1
  ;  ...done.
  ;  goto branch condition
  br label %cond0

cond0:                                            ; preds = %block0, %entry
  ;  conditional `[`
  %offset1 = load i32, i32* %index, align 4
  %ptr = getelementptr i8, i8* %0, i32 %offset1
  %cell = load i8, i8* %ptr, align 1
  %25 = icmp eq i8 %cell, 0
  ;  jump to after matching `]` if cell is zero
  br i1 %25, label %after0, label %block0

block0:                                           ; preds = %cond0
  ;  decrement index `<`
  %offset2 = load i32, i32* %index, align 4
  %decremented = sub i32 %offset2, 1
  store i32 %decremented, i32* %index, align 4
  ;  increment current cell `+` (8 times) ...
  %26 = load i32, i32* %index, align 4
  %27 = getelementptr i8, i8* %0, i32 %26
  %28 = load i8, i8* %27, align 1
  %29 = add i8 %28, 1
  store i8 %29, i8* %27, align 1
  %30 = load i32, i32* %index, align 4
  %31 = getelementptr i8, i8* %0, i32 %30
  %32 = load i8, i8* %31, align 1
  %33 = add i8 %32, 1
  store i8 %33, i8* %31, align 1
  %34 = load i32, i32* %index, align 4
  %35 = getelementptr i8, i8* %0, i32 %34
  %36 = load i8, i8* %35, align 1
  %37 = add i8 %36, 1
  store i8 %37, i8* %35, align 1
  %38 = load i32, i32* %index, align 4
  %39 = getelementptr i8, i8* %0, i32 %38
  %40 = load i8, i8* %39, align 1
  %41 = add i8 %40, 1
  store i8 %41, i8* %39, align 1
  %42 = load i32, i32* %index, align 4
  %43 = getelementptr i8, i8* %0, i32 %42
  %44 = load i8, i8* %43, align 1
  %45 = add i8 %44, 1
  store i8 %45, i8* %43, align 1
  %46 = load i32, i32* %index, align 4
  %47 = getelementptr i8, i8* %0, i32 %46
  %48 = load i8, i8* %47, align 1
  %49 = add i8 %48, 1
  store i8 %49, i8* %47, align 1
  %50 = load i32, i32* %index, align 4
  %51 = getelementptr i8, i8* %0, i32 %50
  %52 = load i8, i8* %51, align 1
  %53 = add i8 %52, 1
  store i8 %53, i8* %51, align 1
  %54 = load i32, i32* %index, align 4
  %55 = getelementptr i8, i8* %0, i32 %54
  %56 = load i8, i8* %55, align 1
  %57 = add i8 %56, 1
  store i8 %57, i8* %55, align 1
  ;  ...done.
  ;  increment index `>`
  %offset3 = load i32, i32* %index, align 4
  %incremented4 = add i32 %offset3, 1
  store i32 %incremented4, i32* %index, align 4
  ;  decrement cell `-`
  %offset5 = load i32, i32* %index, align 4
  %ptr6 = getelementptr i8, i8* %0, i32 %offset5
  %cell7 = load i8, i8* %ptr6, align 1
  %dec_cell = sub i8 %cell7, 1
  store i8 %dec_cell, i8* %ptr6, align 1
  ;  unconditional jump `]` to matching `[`
  br label %cond0

after0:                                           ; preds = %cond0
  ;  decrement index `<`
  %offset8 = load i32, i32* %index, align 4
  %decremented9 = sub i32 %offset8, 1
  store i32 %decremented9, i32* %index, align 4
  ;  print current cell `.`
  %offset10 = load i32, i32* %index, align 4
  %ptr11 = getelementptr i8, i8* %0, i32 %offset10
  %cell12 = load i8, i8* %ptr11, align 1
  %58 = sext i8 %cell12 to i32
  %print = call i32 @putchar(i32 %58)
  ;  return
  ret void
}

declare i32 @putchar(i32) ; libc extern

declare i32 @getchar()    ; libc extern

define i32 @main() {
entry:
  ;  allocate bf stack
  %0 = trunc i64 100000 to i32
  %mallocsize = mul i32 %0, ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i32)
  %stack = tail call i8* @malloc(i32 %mallocsize)
  call void @llvm.memset.p0i8.i64(i8* align 1 %stack, i8 0, i64 100000, i1 false)
  ;  call compiled bf function
  call void @program(i8* %stack)
  ;  return EXIT_SUCCESS
  ret i32 0
}

declare noalias i8* @malloc(i32)

; Function Attrs: argmemonly nofree nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #0

attributes #0 = { argmemonly nofree nounwind willreturn writeonly }

This module runs both through LLVMJIT as well as the following compile commands (with out.ll being this file):

llc out.ll -o out.asm
clang out.asm -o out
./out
-- 0

but as soon as I add an optimizer pass:

opt out.ll --O1 -S -o opt.ll
llc opt.ll -o opt.asm
clang opt.asm -o opt
./opt
--

the program doesn't work and the following (obviously wrong) "optimized" module is generated:

; ModuleID = 'out.ll'
source_filename = "stdin"

; Function Attrs: nofree nounwind
define void @program(i8* nocapture %0) local_unnamed_addr #0 {
entry:
  store i8 0, i8* %0, align 1
  %print = call i32 @putchar(i32 0)
  ret void
}

; Function Attrs: nofree nounwind
declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #0

; Function Attrs: nofree nounwind
define i32 @main() local_unnamed_addr #0 {
entry:
  %print.i = call i32 @putchar(i32 0) #1
  ret i32 0
}

attributes #0 = { nofree nounwind }
attributes #1 = { nounwind }

Considering that the optimizer happily removes my code I must have violated some assumptions of the SSA code, but I can't figure out which. I alloca'd the only mutable variable (to avoid phi nodes) and I think my getelementptr and branch logic is correct; I also played around with different linkage flags to see if something changes, but I think I must have those correct, because else the CRT wouldn't find the main function. opt -print-after-all also didn't really help me understand the optimizations.

Could somebody tell me what I'm doing wrong here?

(PS: FYI going directly through clang -O<N> out.ll -o out without opt gives the same results.)

Solution

  %index = alloca i32, align 4 ;  allocate index
  ;  increment index `>`
  %offset = load i32, i32* %index, align 4

%offset is now undef because you never stored a value into the alloca'd memory.

If you check opt -sroa over your code, you can see this:

define void @program(i8* %0) {
entry:
  %incremented = add i32 undef, 1
  %1 = getelementptr i8, i8* %0, i32 %incremented

and so on.