4x4 matrix multiplication: Exception 4: Unaligned Address in inst/data fetch: 0x100100bb

I'm trying to do a 4x4 matrix multiplication in MIPS assembly using a simulator (QtMips). QtMips gives me `Exception 4: Unaligned Address in inst/data fetch: 0x100100bb`.

This is where I get the error when I single step.

    [00400070] c52b0000  lwc1 $f11, 0($9) ; 80: lwc1 $f11 0($t1) #load float from array1

The error happens when the counter k = 2, i.e. on the third pass through the loop. I'm assuming something is wrong with the 32-bit (word) alignment of the address used by my third `lwc1` load.

Here's what I tried/read but didn't work:

One answer suggests putting `.align 2` or `.align 4` before my array (matrix) declarations in `.data`. That didn't work.
Another suggests that the `size` value (defined after array3) could be the issue. But I load it into `$s1` with `lw $s1 size`, so I don't see this being the real issue for me.

I'm very lost on what to do. Please impart me some wisdom.

Below is my whole code:

    # Data segment for the 4x4 single-precision multiply: two input matrices
    # (array1, array2) and one result matrix (array3). Each holds 16 floats
    # stored back to back, followed by the matrix dimension in `size`.
    .data
    .globl array1
    .globl array2
    .globl array3

    .align 5 # .align n pads to a 2^n-byte boundary, so array1 starts on a 32-byte boundary
array1: .float 1.00, 0.00, 3.14, 2.72, 2.72, 1.00, 0.00, 3.14, 1.00, 1.00, 1.00, 1.00, 1.00, 2.00, 3.00, 4.00
    .align 5 # same 32-byte alignment for the second input matrix
array2: .float 1.00, 1.00, 0.00, 3.14, 0.00, 1.00, 3.14, 2.72, 0.00, 1.00, 1.00, 0.00, 4.00, 3.00, 2.00, 1.00
    .align 5 # same 32-byte alignment for the result matrix (all zeros initially)
array3: .float 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00

size: .word 4 # matrix dimension as an integer word; main loads it into $s1 with lw

    # Program entry: save the return address, print the title string, then
    # set up the array pointers and loop counters used by the loop nest.
    .text
    .globl main
main:
    # Store $ra ($31) to the word labelled saved_ret_pc.
    # NOTE(review): saved_ret_pc is not declared anywhere in the visible
    # source - confirm it exists in a .data section.
    sw $31 saved_ret_pc

    # Title string, plus two data items (lbd_, lbd1_) that nothing in the
    # visible code reads.
    .data
lb_:    .asciiz "Vector Multiplication\n"
lbd_:   .byte 1, -1, 0, 128
lbd1_:  .word 0x76543210, 0xfedcba98
    .text
    li $v0 4    # syscall 4 (print_str)
    la $a0 lb_
    syscall

# main program: multiply matrix 1 and 2, store in array3

# $t1/$t2/$t3 start at the first element of array1/array2/array3 and are
# advanced in place by the loop body.
la $t1 array1
la $t2 array2
la $t3 array3 # load the array base addresses into registers


# All three counters start at 4 and are counted down to 0 by the loop tails.
li $t4 4 # i loop counter    -> I changed addi to li
li $t5 4 # j loop counter
li $t6 4 # k loop counter

lw $s1 size # $s1 = matrix dimension (the word stored at `size`, i.e. 4)


# Loop nest. $t4 (i), $t5 (j) and $t6 (k) are decremented from their
# starting values towards 0. None of them is reloaded when an outer loop
# repeats, so re-entering k_loop after its first exit starts from $t6 = 0.
i_loop:
    j j_loop    # jumps to the label on the next line
j_loop:
    j k_loop    # jumps to the label on the next line
k_loop:
    # FP registers used by this body:
    #   $f11 - element loaded from array1
    #   $f12 - element loaded from array2
    #   $f13 - value loaded from / stored back to array3
    #   $f10 - product $f11 * $f12

    lwc1 $f11 0($t1) #load float from array1 (address in $t1 must be a multiple of 4)
    lwc1 $f12 0($t2) #load float from array2
    lwc1 $f13 0($t3) #load float from result array3
    nop 
    mul.s $f10 $f11 $f12 #multiply floats, store result as temp in $f10
    nop

    add.s $f13 $f13 $f10 #add the product to the value read from array3

    swc1 $f13 0($t3) #store the resulting float in array3

#call index_of_A
    # index_of_A leaves size*i + k in $s2 (uses the current $t4 and $t6).
    move $s0 $ra    #save return address into s0
    nop
    jal index_of_A  #get addr offset for array1
    nop
    move $ra $s0    #restore return address that was saved into s0

#call index_of_B
    # index_of_B leaves size*k + j in $s3 (uses the current $t6 and $t5).
    move $s0 $ra    #save return address into s0
    nop
    jal index_of_B  #get addr offset for array2
    nop
    move $ra $s0    #restore return address that was saved into s0

    # NOTE: $s2 / $s3 are element indexes, not byte counts, and they are added
    # to the running pointers rather than to the array bases. With i = 4 the
    # first two passes add 20 and then 19 to $t1, leaving it at array1 + 39,
    # which is not a multiple of 4, so the lwc1 from 0($t1) on the third pass
    # (k = 2) is an unaligned access.
    add $t1 $t1 $s2 # $t1 += size*i + k
    add $t2 $t2 $s3 # $t2 += size*k + j
    addi $t3 $t3 4 # advance the array3 pointer by one float (4 bytes) on every k pass

    addi $t6 $t6 -1 #decrease k counter
    bne $t6 $0 k_loop #repeat k_loop

    addi $t5 $t5 -1 #decrease j counter
    bne $t5 $0 j_loop #repeat j_loop

    addi $t4 $t4 -1 #decrease i counter
    bne $t4 $0 i_loop #repeat i_loop
    # No jump follows the last bne: once i reaches 0, execution continues
    # into whatever code is laid out next in the file.

# index_of_A: compute the array1 index for the current i and k.
#   In:  $s1 = size, $t4 = i, $t6 = k
#   Out: $s2 = size*i + k   (an element index; it is NOT scaled by the
#                            4-byte float size, so it is not a byte offset)
#   Uses integer registers only; no FP registers are touched.
index_of_A: #function for array1 addr offset    #may need to convert all to float first
    #size*i + k
    mul $s2 $s1 $t4 # $s2 = size * i
    add $s2 $s2 $t6 # $s2 = size * i + k
    jr $ra #jump back to the caller


# index_of_B: compute the array2 index for the current k and j.
#   In:  $s1 = size, $t6 = k, $t5 = j
#   Out: $s3 = size*k + j   (an element index; it is NOT scaled by the
#                            4-byte float size, so it is not a byte offset)
index_of_B: #function for array2 addr offset
    #size*k + j
    mul $s3 $s1 $t6 # $s3 = size * k
    add $s3 $s3 $t5 # $s3 = size * k + j
    jr $ra #jump back to the caller


# Done multiplying...
# Prints the completion message, reloads $ra from saved_ret_pc and returns.
# NOTE(review): nothing in the visible source branches to print_and_end, and
# the code laid out just before it (index_of_B) ends in jr $ra, so confirm
# how control is meant to reach this label.
    .data
sm: .asciiz "Done multiplying\n"
    .text
print_and_end:
    li $v0 4    # syscall 4 (print_str)
    la $a0 sm
    syscall

# Done with the program!
    lw $31 saved_ret_pc # reload the return address stored at the top of main
    jr $31      # Return from main

#Terminate the program
# These two instructions sit after the unconditional jr $31 above, so they
# are not reached by falling through.
    li $v0, 10 # syscall 10 (exit)
    syscall

.end main

But I don't understand what's wrong, since the exact same code works in another example of mine here:

Solution

4x4 Matrix multiplication Okay, so I figured it out so I am answering my own question.

I learned many things along the way and those include

.align is not necessary to run the code. It works without them. Perhaps I didn't need it for this specific situation.
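`$f12` is the register the float-printing syscalls read: `print_float` (syscall 2) prints the single in `$f12`, and `print_double` (syscall 3) prints the double held in the `$f12`/`$f13` pair. If you leave the float in any other register, it won't print.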
The offset of the very first element is 0, so the address has to be calculated at the top of the loop (before the loads) instead of at the end. That's what was causing all the trouble.
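Be sure to calculate your index correctly: the element index (e.g. `matrix_size*i + k`) must be multiplied by the element size (4 bytes for a float) and added to the matrix's base address; otherwise the resulting address is not word-aligned and `lwc1` faults. Look at my code comments to see what I do :)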

Here is the final version of my code that works! You can see my GitHub for the matrix multiplication in Python, C, and Assembly: https://github.com/leochoo/cmpa

.data

# 4x4 single-precision matrices: inputs A and B, result R.
# Each matrix is written one row per line; consecutive .float directives
# occupy consecutive words, so the 16 elements of a matrix are contiguous.
    .globl A
    .globl B
    .globl R

    .align 4                            # 2^4 = 16-byte boundary
    A: .float 1.00, 0.00, 3.14, 2.72
       .float 2.72, 1.00, 0.00, 3.14
       .float 1.00, 1.00, 1.00, 1.00
       .float 1.00, 2.00, 3.00, 4.00
    .align 4
    B: .float 1.00, 1.00, 0.00, 3.14
       .float 0.00, 1.00, 3.14, 2.72
       .float 0.00, 1.00, 1.00, 0.00
       .float 4.00, 3.00, 2.00, 1.00
    .align 4
    R: .float 0.00, 0.00, 0.00, 0.00
       .float 0.00, 0.00, 0.00, 0.00
       .float 0.00, 0.00, 0.00, 0.00
       .float 0.00, 0.00, 0.00, 0.00

# Scalars read by main.
    matrix_size: .word 4                # number of rows and of columns
    float_size: .word 4                 # bytes per single-precision float
                                        # (32 bits), so element n of a matrix
                                        # is at base address + 4*n

    tempSum: .float 0.00                # value the running sum is (re)set to

    lineBrk: .asciiz "\n"

# Labels printed by the debug trace.
    arr_1: .asciiz "A: "
    arr_2: .asciiz " B: "
    arr_3: .asciiz " R: "
    i_:     .asciiz " i:"
    j_:     .asciiz "j:"
    k_:     .asciiz "k:"
    space_: .asciiz " "
    bar_:   .asciiz " | "





#TEXT (MAIN) SECTION - multiply matrix A by matrix B, store the product in R
#
# Prints the title, then establishes the registers the loop nest relies on:
#   $t1, $t2 = addresses of A and B (the loops recompute them per element)
#   $t3      = address of the first element of R; advanced as results are stored
#   $s1      = matrix_size
#   $s2      = float_size
#   $s6, $s7 = base addresses of A and B
#   $f5      = running sum, initialised from tempSum
.text
    .globl main
main:

    #print title
        .data
    lb_:    .asciiz "Vector Multiplication\n"
    lbd_:   .byte 1, -1, 0, 128
    lbd1_:  .word 0x76543210, 0xfedcba98
        .text
    li $v0 4    # syscall 4 (print_str)
    la $a0 lb_
    syscall

#load matrices
la $t1 A
la $t2 B
la $t3 R

#load variables
# $s1 is loaded straight from memory, so it does not need to be zeroed first.
lw $s1 matrix_size # $s1 = matrix_size
lw $s2 float_size # $s2 = float_size
l.s $f5 tempSum # $f5 = running sum, starts at the value stored in tempSum


#store base addresses
move $s6 $t1 # $s6 = base address of matrix A
move $s7 $t2 # $s7 = base address of matrix B


#for i in 0...4:
    #for j in 0...4:
        #for k in 0...4:
    li $t4 0 # i counter
i_loop:
        li $t5 0 # j counter
    j_loop:
            li $t6 0 # k counter
        k_loop:
            #update index of A[i:t4][k:t6]
                # $s0 = offset result
                # $s1 = matrix_size: 4
                # $s2 = float_size: 4
                # $s6 = base address of A

                #calculate offset
                mul $s0 $s1 $t4 # s0 = matrix_size*i
                add $s0 $s0 $t6 # s0 = s0 + k
                mul $s0 $s0 $s2 # s0 = float_size*s0

                #increase by offset
                add $t1 $s6 $s0 # new index = base_addr + offset  ##first loop initialization will always be zero... oh..

            #update index of B[k:t6][j:t5]
                # $s0 = offset result
                # $s1 = matrix_size: 4
                # $s2 = float_size: 4
                # $s7 = base address of B

                #caculate offset
                mul $s0 $s1 $t6 # s0 = matrix_size*k
                add $s0 $s0 $t5 # s0 = s0 + j
                mul $s0 $s0 $s2 # s0 = float_size*s0

                #increase by offset
                add $t2 $s7 $s0 # new index = base_addr + offset


            #load matrix A and B
            lwc1 $f1 0($t1) #load float from matrix A
            lwc1 $f2 0($t2) #load float from matrix B
            nop
                #print i, j, k

                li $v0 4        
                la $a0 i_
                syscall         # "i"

                li $v0 1 
                move $a0 $t4
                syscall         # value of i

                li $v0 4        
                la $a0 j_
                syscall         # "j"

                li $v0 1 
                move $a0 $t5
                syscall         # value of j


                li $v0 4        
                la $a0 k_
                syscall         # "k"

                li $v0 1 
                move $a0 $t6
                syscall         # value of k

                li $v0 4        # " | "
                la $a0 bar_
                syscall 

                #print A and B
                li $v0 4    
                la $a0 arr_1
                syscall

                lwc1 $f12 0($t1) #A
                li $v0 2
                syscall

                li $v0 4    
                la $a0 arr_2
                syscall

                lwc1 $f12 0($t2) #B
                li $v0 2
                syscall


            #Break down: R[i][j] +=  float_size * ( A[i][k] * B[k][j] )
            #### first result: (1*1)+(0*0)+(3.14*0)+(2.72*4)

            # (A * B)
            nop
            mul.s $f0 $f1 $f2 # (a*b)
            nop
            #tempSum:$f5 = tempSum + (A * B)
            add.s $f5 $f5 $f0
            nop
                ####1st = (A*B)
                ####2nd = (A*B) + (A*B)         


            #DON'T UPDATE index of R here
            #you only need to update it 16 times, hence in j_loop

        #k_loop end condition
        addi $t6 $t6 1 # k++
        bne $t6 $s1 k_loop #if k != 4, repeat k_loop


    #store R[i][j] = tempSum:$f5
    swc1 $f5 0($t3) #store the resulting float in array3
    nop

    #reset tempSum = 0
    l.s $f5 tempSum

    #load and print element in R
    li $v0 4    
    la $a0 arr_3 # " R "
    syscall     

    lwc1 $f12 0($t3)
    li $v0 2
    syscall 

    li $v0 4
    la $a0 lineBrk #print( '\n' )
    syscall

    #update index of R[i][j] - same as updating index of A
    add $t3 $t3 $s2


    #j_loop end condition
    addi $t5 $t5 1 
    bne $t5 $s1 j_loop 

#i_loop end condition
addi $t4 $t4 1 
bne $t4 $s1 i_loop


# Announce completion, then terminate the program through the exit syscall.
    .data
sm: .asciiz "Done multiplying\n"
    .text
print_and_end:
    la      $a0, sm                         # $a0 = address of the completion message
    li      $v0, 4                          # syscall 4 (print_str)
    syscall

    li      $v0, 10                         # syscall 10 (exit)
    syscall

.end main