I'm trying to do a 4x4 matrix multiplication in assembly using a MIPS simulator (QtMips).
QtMips gives me Exception 4: Unaligned Address in inst/data fetch: 0x100100bb
This is where I get the error when I single step.
[00400070] c52b0000 lwc1 $f11, 0($9) ; 80: lwc1 $f11 0($t1) #load float from array1
The error happens when counter k = 2, meaning when it is at the third loop. I'm assuming something is wrong with 32-bit alignment at my third load, lwc1
Here's what I tried/read but didn't work:
lw $s1 size
so I don't see this being a real issue for me. I'm very lost on what to do. Please impart me some wisdom.
Below is my whole code:
# Matrix data: two operands (array1, array2) and a result (array3).
# Each label holds 16 single-precision values (the 4x4 matrices described above).
.data
.globl array1
.globl array2
.globl array3
.align 5 # align the next datum to a 2^5 = 32-byte boundary (SPIM-style .align n)
array1: .float 1.00, 0.00, 3.14, 2.72, 2.72, 1.00, 0.00, 3.14, 1.00, 1.00, 1.00, 1.00, 1.00, 2.00, 3.00, 4.00
.align 5 # align the next datum to a 2^5 = 32-byte boundary
array2: .float 1.00, 1.00, 0.00, 3.14, 0.00, 1.00, 3.14, 2.72, 0.00, 1.00, 1.00, 0.00, 4.00, 3.00, 2.00, 1.00
.align 5 # align the next datum to a 2^5 = 32-byte boundary
array3: .float 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00
size: .word 4 # matrix dimension as an integer word; main loads it into $s1
# main (setup part): prints the title, then loads the three array pointers,
# the three loop counters and the matrix size before entering the loops.
.text
.globl main
main:
# Save the return address ($31 is $ra) to memory.
# NOTE(review): saved_ret_pc is not defined anywhere in the visible source - confirm it exists.
sw $31 saved_ret_pc
# The title string is declared inline; assembly switches to .data and back to .text.
.data
lb_: .asciiz "Vector Multiplication\n"
# NOTE(review): lbd_ and lbd1_ are not referenced by any visible instruction.
lbd_: .byte 1, -1, 0, 128
lbd1_: .word 0x76543210, 0xfedcba98
.text
li $v0 4 # syscall 4 (print_str)
la $a0 lb_
syscall
# main program: multiply matrix 1 and 2, store in array3
la $t1 array1 # $t1 = current address in array1
la $t2 array2 # $t2 = current address in array2
la $t3 array3 # $t3 = current address in array3
li $t4 4 # i loop counter, counts down from 4
li $t5 4 # j loop counter, counts down from 4
li $t6 4 # k loop counter, counts down from 4
lw $s1 size # $s1 = matrix size (4), used by index_of_A / index_of_B
# Triple loop over $t4 (i), $t5 (j) and $t6 (k). Each counter is decremented and its
# loop repeats while the counter is non-zero.
# NOTE(review): the counters are loaded once before i_loop and are not reloaded here,
# so when j_loop / i_loop branch back, k_loop is re-entered with $t6 already at 0 and
# the first decrement takes it to -1 instead of ending the loop.
i_loop:
j j_loop # jumps to the label on the very next line
j_loop:
j k_loop # jumps to the label on the very next line
k_loop:
# $f11, $f12 - operands loaded from array1 / array2
# $f10 - product of the two operands
# $f13 - value read from array3, incremented by the product and written back
lwc1 $f11 0($t1) #load float from array1
lwc1 $f12 0($t2) #load float from array2
lwc1 $f13 0($t3) #load float from result array3
nop
mul.s $f10 $f11 $f12 #multiply floats, store result as temp in $f10
nop
add.s $f13 $f13 $f10 #add the product to the value from array3
swc1 $f13 0($t3) #store the resulting float in array3
#call index_of_A: leaves size*i + k in $s2
move $s0 $ra #save return address into s0
nop
jal index_of_A #get addr offset for array1
nop
move $ra $s0 #restore return address that was saved into s0
#call index_of_B: leaves size*k + j in $s3
move $s0 $ra #save return address into s0
nop
jal index_of_B #get addr offset for array2
nop
move $ra $s0 #restore return address that was saved into s0
# NOTE(review): this is where the "Unaligned Address" exception originates.
# $s2 / $s3 are element indices (not multiplied by the 4-byte float size) and they are
# ADDED to the running pointers instead of to the array base. With i = 4 the first two
# passes add 4*4+4 = 20 and then 4*4+3 = 19 to $t1, so the third lwc1 from 0($t1) reads
# array1 + 39, which is not a multiple of 4.
add $t1 $t1 $s2 # $t1 += size*i + k
add $t2 $t2 $s3 # $t2 += size*k + j
addi $t3 $t3 4 # advance the array3 pointer by one float on every k iteration
addi $t6 $t6 -1 #decrease k counter
bne $t6 $0 k_loop #repeat k_loop
addi $t5 $t5 -1 #decrease j counter
bne $t5 $0 j_loop #repeat j_loop
addi $t4 $t4 -1 #decrease i counter
bne $t4 $0 i_loop #repeat i_loop
#used regs: f0-f5, f10-13
# index_of_A: computes $s2 = $s1 * $t4 + $t6, i.e. size*i + k.
#   In:  $s1 = matrix size, $t4 = i counter, $t6 = k counter
#   Out: $s2
#   Returns with jr $ra; reads no memory.
# NOTE(review): the result counts elements, not bytes; it would need to be multiplied
# by 4 before being used as an address offset for lwc1.
# NOTE(review): no instruction separates this label from the `bne ... i_loop` above,
# so once the loops finish, execution falls through into this routine.
index_of_A: #function for array1 addr offset
#size*i + k
mul $s2 $s1 $t4 # $s2 = size * i
add $s2 $s2 $t6 # $s2 = size * i + k
jr $ra #jump back to the caller
# index_of_B: computes $s3 = $s1 * $t6 + $t5, i.e. size*k + j.
#   In:  $s1 = matrix size, $t6 = k counter, $t5 = j counter
#   Out: $s3
#   Returns with jr $ra; reads no memory.
# NOTE(review): like index_of_A, the result counts elements, not bytes.
index_of_B: #function for array2 addr offset
#size*k + j
mul $s3 $s1 $t6 # $s3 = size * k
add $s3 $s3 $t5 # $s3 = size * k + j
jr $ra #jump back to the caller
# print_and_end: prints the completion message, reloads the saved return address
# and returns from main.
# NOTE(review): no visible instruction jumps or branches to print_and_end, and the
# routine directly above ends in `jr $ra`, so it is unclear how control gets here.
.data
sm: .asciiz "Done multiplying\n"
.text
print_and_end:
li $v0 4 # syscall 4 (print_str)
la $a0 sm
syscall
# Done with the program!
lw $31 saved_ret_pc # reload the return address stored at the top of main
jr $31 # Return from main
# Terminate the program (syscall 10).
# NOTE(review): these two instructions follow an unconditional jr, so they are not
# reached in straight-line execution.
li $v0, 10
syscall
.end main
But I don't understand what's wrong, since the exact same code works in my other example here:
4x4 Matrix multiplication Okay, so I figured it out so I am answering my own question.
I learned many things along the way and those include
Here is the final version of my code that works! You can see my GitHub for the matrix multiplication in Python, C, and Assembly: https://github.com/leochoo/cmpa
# ---------------------------------------------------------------------------
# Static data for the matrix product R = A x B.
# A, B and R each hold 16 single-precision values, written one matrix row
# per line. The loops index them as base + (matrix_size*row + col)*float_size.
# ---------------------------------------------------------------------------
        .data
        .globl  A
        .globl  B
        .globl  R

        .align  4                       # 2^4 = 16-byte boundary
A:      .float  1.00, 0.00, 3.14, 2.72
        .float  2.72, 1.00, 0.00, 3.14
        .float  1.00, 1.00, 1.00, 1.00
        .float  1.00, 2.00, 3.00, 4.00

        .align  4
B:      .float  1.00, 1.00, 0.00, 3.14
        .float  0.00, 1.00, 3.14, 2.72
        .float  0.00, 1.00, 1.00, 0.00
        .float  4.00, 3.00, 2.00, 1.00

        .align  4
R:      .float  0.00, 0.00, 0.00, 0.00
        .float  0.00, 0.00, 0.00, 0.00
        .float  0.00, 0.00, 0.00, 0.00
        .float  0.00, 0.00, 0.00, 0.00

matrix_size:    .word   4               # number of rows and of columns
float_size:     .word   4               # bytes per single-precision element;
                                        # neighbouring elements are 4 bytes apart
tempSum:        .float  0.00            # zero constant used to (re)initialise the running sum
lineBrk:        .asciiz "\n"

# Labels printed by the per-iteration debug trace
arr_1:          .asciiz "A: "
arr_2:          .asciiz " B: "
arr_3:          .asciiz " R: "
i_:             .asciiz " i:"
j_:             .asciiz "j:"
k_:             .asciiz "k:"
space_:         .asciiz " "
bar_:           .asciiz " | "
# ---------------------------------------------------------------------------
# main - multiply matrix A by matrix B and store the product in R.
#
# This part prints the title, loads the constants and opens the three nested
# loops (i, j, k all count up from 0). Register roles established here and
# relied on by the loop bodies that follow:
#   $t1, $t2   address of the current A / B element (recomputed in k_loop)
#   $t3        address of the current R element
#   $t4/$t5/$t6  i / j / k counters
#   $s1        matrix_size       $s2  float_size (bytes per element)
#   $s6, $s7   base addresses of A and B
#   $f5        running sum for the current R element
# ---------------------------------------------------------------------------
        .text
        .globl  main
main:
        # The title string is declared inline, next to the code that prints it.
        .data
lb_:    .asciiz "Vector Multiplication\n"
lbd_:   .byte   1, -1, 0, 128
lbd1_:  .word   0x76543210, 0xfedcba98
        .text
        li      $v0, 4                  # syscall 4 (print_str)
        la      $a0, lb_
        syscall

        # Element pointers start at the first element of each matrix.
        la      $t1, A
        la      $t2, B
        la      $t3, R

        # Constants. The former `li $s1 0` was removed: it was overwritten by
        # the load on the very next line and therefore had no effect.
        lw      $s1, matrix_size        # $s1 = matrix_size
        lw      $s2, float_size         # $s2 = float_size
        l.s     $f5, tempSum            # running sum starts at 0.0

        # Keep the bases of A and B; k_loop derives $t1/$t2 from them.
        move    $s6, $t1                # $s6 = base address of A
        move    $s7, $t2                # $s7 = base address of B

        # for i in 0 .. matrix_size-1
        #   for j in 0 .. matrix_size-1
        #     for k in 0 .. matrix_size-1
        li      $t4, 0                  # i = 0
i_loop:
        li      $t5, 0                  # j = 0
j_loop:
        li      $t6, 0                  # k = 0
k_loop:
# ---------------------------------------------------------------------------
# Innermost loop body: tempSum ($f5) += A[i][k] * B[k][j], plus a debug trace.
#   In:   $t4 = i, $t5 = j, $t6 = k, $s1 = matrix_size, $s2 = float_size,
#         $s6 / $s7 = base address of A / B, $f5 = running sum
#   Out:  $f5 updated, $t6 incremented; $t1 / $t2 = address of the A / B
#         element just used
#   Scratch: $s0, $f0, $f1, $f2, $f12, $v0, $a0
# Element address = base + (matrix_size*row + col) * float_size, so every
# address handed to lwc1 is a multiple of 4 bytes from the matrix base.
# ---------------------------------------------------------------------------
        # $t1 = &A[i][k]
        mul     $s0, $s1, $t4           # matrix_size * i
        add     $s0, $s0, $t6           # + k
        mul     $s0, $s0, $s2           # element index -> byte offset
        add     $t1, $s6, $s0

        # $t2 = &B[k][j]
        mul     $s0, $s1, $t6           # matrix_size * k
        add     $s0, $s0, $t5           # + j
        mul     $s0, $s0, $s2           # element index -> byte offset
        add     $t2, $s7, $s0

        lwc1    $f1, 0($t1)             # $f1 = A[i][k]
        lwc1    $f2, 0($t2)             # $f2 = B[k][j]
        nop

        # Accumulate: R[i][j] += A[i][k] * B[k][j]
        # e.g. first result: (1*1)+(0*0)+(3.14*0)+(2.72*4)
        mul.s   $f0, $f1, $f2
        nop
        add.s   $f5, $f5, $f0
        nop

        # ---- debug trace: " i:<i>j:<j>k:<k> | A: <a> B: <b>" ----
        li      $v0, 4
        la      $a0, i_
        syscall
        li      $v0, 1                  # print_int
        move    $a0, $t4
        syscall

        li      $v0, 4
        la      $a0, j_
        syscall
        li      $v0, 1
        move    $a0, $t5
        syscall

        li      $v0, 4
        la      $a0, k_
        syscall
        li      $v0, 1
        move    $a0, $t6
        syscall

        li      $v0, 4
        la      $a0, bar_
        syscall

        li      $v0, 4
        la      $a0, arr_1
        syscall
        mov.s   $f12, $f1               # value is already in $f1; no second load
        li      $v0, 2                  # print_float prints $f12
        syscall

        li      $v0, 4
        la      $a0, arr_2
        syscall
        mov.s   $f12, $f2               # value is already in $f2; no second load
        li      $v0, 2
        syscall

        # The R pointer is NOT advanced here: R moves once per (i, j) pair,
        # which the code after this loop takes care of.
        addi    $t6, $t6, 1             # k++
        bne     $t6, $s1, k_loop        # repeat while k != matrix_size
# ---- k loop finished: commit the running sum ($f5) to the current R element ----
        s.s     $f5, 0($t3)             # write the sum to the address in $t3
        nop

        # Trace line suffix: " R: <value>\n"
        la      $a0, arr_3
        li      $v0, 4
        syscall
        l.s     $f12, 0($t3)            # read the stored value back for printing
        li      $v0, 2                  # print_float
        syscall
        la      $a0, lineBrk
        li      $v0, 4
        syscall

        l.s     $f5, tempSum            # running sum back to 0.0 for the next element
        add     $t3, $t3, $s2           # R pointer += float_size

        # close the j loop
        addi    $t5, $t5, 1             # j++
        bne     $t5, $s1, j_loop        # repeat while j != matrix_size

        # close the i loop
        addi    $t4, $t4, 1             # i++
        bne     $t4, $s1, i_loop        # repeat while i != matrix_size
# ---- all loops finished: report completion and stop the simulator ----
        .data
sm:     .asciiz "Done multiplying\n"
        .text
print_and_end:
        la      $a0, sm                 # address of the completion message
        li      $v0, 4                  # syscall 4 (print_str)
        syscall

        li      $v0, 10                 # syscall 10 (exit)
        syscall
        .end    main