I know there are several ways to return a value in assembly, for example in the rax register or in the xmm0, xmm1 registers.
I have the following code in assembly; the code should calculate a 3x3 matrix times a 3D vector, and then return a 3-dimensional vector of floats (4 bytes each, 12 bytes in total).
Everything is easy to understand until the last piece of code to return the value, which is so weird that I don't understand its purpose.
Assuming that the 3-D value is finally in xmm6, xmm7 and xmm8, the code is as follows.
Note: there is no sub rsp, xxx and add rsp, xxx at the beginning and end of this function; in fact, the stack is not used at all during the computation of the matrix and vector multiplication.
; Return sequence quoted in the question (Intel syntax, x86-64).
; The three result floats start in xmm6, xmm7 and xmm8; all memory operands
; are at negative offsets from rsp, and rsp itself is never adjusted.

; Spill the low float of each result register to three consecutive dwords.
movss [rsp-28h],xmm6
movss [rsp-24h],xmm7
movss [rsp-20h],xmm8
; eax = bit pattern of the float just stored from xmm7 (a 32-bit write also
; zeroes the upper half of rax).
mov eax,[rsp-24h]
; NOTE(review): with a 32-bit operand SHL masks the count to 5 bits, so a
; count of 20h shifts by 0 as written. A 64-bit `shl rax,20h` was presumably
; what the real disassembly contained -- confirm against the binary.
shl eax,20h
; ebx = bit pattern of the float stored from xmm6; OR it into rax and store
; the combined 64-bit value at [rsp-18h].
mov ebx,[rsp-28h]
or rax,rbx
mov [rsp-18h],rax
; Copy xmm8 into rcx (a 64-bit destination, so this is the 64-bit form of the
; move) and store all 8 bytes at [rsp-10h].
movd rcx,xmm8
mov [rsp-10h],rcx
; Split rax back into its low dword (ebx) and its high dword (eax after the
; shift) and write the three dwords back to [rsp-28h], [rsp-24h], [rsp-20h].
mov ebx,eax
shr rax,20h
mov [rsp-28h],eax
mov [rsp-24h],ebx
mov [rsp-20h],ecx
; Load the registers the question identifies as the likely return values:
; xmm1 = the float at [rsp-10h], xmm0 = the 8 bytes starting at [rsp-28h].
movss xmm1,[rsp-10h]
movsd xmm0,[rsp-28h]
I tried to compile and run this code in a C program with asm(), and found that it actually just copies xmm6, xmm7 and xmm8 to 3 places: one in [rsp-28h], one in [rsp-18h], and one in xmm1 and xmm0; all hold the same floating point values.
I know that xmm0 and xmm1 are most likely the return values, but do [rsp-28h] and [rsp-18h] have any use in returning the value? Why does the program use such a strange method (copy to the stack, then copy to rax etc. registers, and then copy them back to the stack again) to copy the results?
I am asking this because the caller code doesn't use xmm0 and xmm1. I am not sure whether I should think the caller just discards the return values, or whether the return values are actually somewhere in the caller's stack.
Thanks.
Caveat: This may not be exactly what you're talking about, but ...
rsp/rbp
is used in the return process if a function returns a struct
by value
Consider the following source:
#include <stdio.h>
/* Aggregate of ten ints, returned by value from fill() below. */
struct obj {
int x[10];
};
/*
 * Print arg, then return a struct obj by value whose first three elements
 * are 4, 5, 6; the remaining elements are zero because the initializer
 * leaves them unspecified.
 */
struct obj
fill(int arg)
{
struct obj obj = {
.x = { 4, 5, 6 }
};
printf("fill: arg=%d\n",arg);
return obj;
}
/* Print the ten elements of obj->x on one line, each preceded by a space. */
void
objprint(const struct obj *obj)
{
for (int i = 0; i < 10; ++i)
printf(" %d",obj->x[i]);
printf("\n");
}
/* File-scope instance, initially 1, 2, 3 followed by zeros; main() overwrites it. */
struct obj obj = {
.x = { 1, 2, 3 }
};
/*
 * Print the global obj, replace it with the struct returned by fill(37),
 * and print it again.
 */
int
main(void)
{
objprint(&obj);
/* Struct assignment from a by-value return. */
obj = fill(37);
objprint(&obj);
return 0;
}
For clarity, we compile with -m32, but the principle is the same for x86_64. We compile with:
cc \
-fno-inline-small-functions \
-fno-inline-functions-called-once \
-fno-inline-functions \
-fomit-frame-pointer \
-S -fverbose-asm -O2 -m32 retval2.c
Here is the [redacted] assembler output. There's quite a bit of movement to/from stack frames.
.file "retval2.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "fill: arg=%d\n"
.text
.p2align 4,,15
# fill: compiled form of `struct obj fill(int arg)` (32-bit, AT&T syntax).
# The struct is not returned in registers: the function loads a result
# pointer from its first stack slot, stores the ten ints through it, and
# returns that same pointer in %eax.
.globl fill
.type fill, @function
fill:
.LFB11:
.cfi_startproc
pushl %ebx #
.cfi_def_cfa_offset 8
.cfi_offset 3, -8
subl $16, %esp #,
.cfi_def_cfa_offset 24
# retval2.c:9: {
# After the push and the 16-byte subl, the return address is at 20(%esp),
# so 24(%esp) is the first stack argument: the result pointer.
movl 24(%esp), %ebx # .result_ptr, .result_ptr
# retval2.c:14: printf("fill: arg=%d\n",arg);
# 28(%esp) is the second stack argument, the C-level `arg`.
pushl 28(%esp) # arg
.cfi_def_cfa_offset 28
pushl $.LC0 #
.cfi_def_cfa_offset 32
call printf #
# retval2.c:16: return obj;
# The struct contents are written directly into the caller-provided buffer.
movl $4, (%ebx) #, MEM[(struct obj *)&<retval>]
# retval2.c:17: }
# Return value in %eax is the result pointer itself.
movl %ebx, %eax # .result_ptr,
# retval2.c:16: return obj;
movl $5, 4(%ebx) #, MEM[(struct obj *)&<retval> + 4B]
movl $6, 8(%ebx) #, MEM[(struct obj *)&<retval> + 8B]
movl $0, 12(%ebx) #, MEM[(struct obj *)&<retval> + 12B]
movl $0, 16(%ebx) #, MEM[(struct obj *)&<retval> + 16B]
movl $0, 20(%ebx) #, MEM[(struct obj *)&<retval> + 20B]
movl $0, 24(%ebx) #, MEM[(struct obj *)&<retval> + 24B]
movl $0, 28(%ebx) #, MEM[(struct obj *)&<retval> + 28B]
movl $0, 32(%ebx) #, MEM[(struct obj *)&<retval> + 32B]
movl $0, 36(%ebx) #, MEM[(struct obj *)&<retval> + 36B]
# retval2.c:17: }
# 24 = the 16 bytes reserved above plus the two 4-byte printf pushes.
addl $24, %esp #,
.cfi_def_cfa_offset 8
popl %ebx #
.cfi_restore 3
.cfi_def_cfa_offset 4
# `ret $4` pops the return address and then 4 more bytes, i.e. the
# result-pointer slot, so the callee removes that argument from the stack.
ret $4 #
.cfi_endproc
.LFE11:
.size fill, .-fill
.section .rodata.str1.1
.LC1:
.string " %d"
.text
.p2align 4,,15
# objprint: compiled form of `void objprint(const struct obj *obj)`.
# %ebx walks the array one int at a time, %esi holds the end address
# (obj + 40), and the final printf("\n") is emitted as a tail call to
# putchar.
.globl objprint
.type objprint, @function
objprint:
.LFB12:
.cfi_startproc
pushl %esi #
.cfi_def_cfa_offset 8
.cfi_offset 6, -8
pushl %ebx #
.cfi_def_cfa_offset 12
.cfi_offset 3, -12
subl $4, %esp #,
.cfi_def_cfa_offset 16
# retval2.c:21: {
# 16(%esp) is the incoming `obj` argument; 40 bytes further is one past
# the last element, used as the loop bound.
movl 16(%esp), %ebx # obj, ivtmp.15
leal 40(%ebx), %esi #, _16
.p2align 4,,10
.p2align 3
.L5:
# retval2.c:24: printf(" %d",obj->x[i]);
# 8 bytes of padding plus two 4-byte pushes; released by the addl $16 below.
subl $8, %esp #,
.cfi_def_cfa_offset 24
pushl (%ebx) # MEM[base: _14, offset: 0B]
.cfi_def_cfa_offset 28
addl $4, %ebx #, ivtmp.15
pushl $.LC1 #
.cfi_def_cfa_offset 32
call printf #
# retval2.c:23: for (int i = 0; i < 10; ++i)
addl $16, %esp #,
.cfi_def_cfa_offset 16
cmpl %esi, %ebx # _16, ivtmp.15
jne .L5 #,
# retval2.c:25: printf("\n");
# Overwrite this function's own argument slot with 10 ('\n') so that it
# becomes the argument of the putchar tail call below.
movl $10, 16(%esp) #,
# retval2.c:26: }
addl $4, %esp #,
.cfi_def_cfa_offset 12
popl %ebx #
.cfi_restore 3
.cfi_def_cfa_offset 8
popl %esi #
.cfi_restore 6
.cfi_def_cfa_offset 4
# retval2.c:25: printf("\n");
# Tail call: the frame is already torn down, so putchar returns directly
# to objprint's caller.
jmp putchar #
.cfi_endproc
.LFE12:
.size objprint, .-objprint
.section .text.startup,"ax",@progbits
.p2align 4,,15
# main: compiled form of main(). The point of interest is the call to fill:
# the address of a temporary at -56(%ebp) is pushed as the first argument,
# and after the call the ten dwords are copied from that temporary into
# the global `obj`.
.globl main
.type main, @function
main:
.LFB13:
.cfi_startproc
# Prologue: remember the incoming stack position in %ecx, align %esp down
# to a 16-byte boundary, re-push the return address, then set up %ebp.
leal 4(%esp), %ecx #,
.cfi_def_cfa 1, 0
andl $-16, %esp #,
pushl -4(%ecx) #
pushl %ebp #
.cfi_escape 0x10,0x5,0x2,0x75,0
movl %esp, %ebp #,
pushl %ecx #
.cfi_escape 0xf,0x3,0x75,0x7c,0x6
subl $64, %esp #,
# retval2.c:36: objprint(&obj);
pushl $obj #
call objprint #
# retval2.c:37: obj = fill(37);
# %eax = address of the temporary inside main's frame that will receive
# the returned struct.
leal -56(%ebp), %eax #, tmp88
# Pop two dwords off the stack; the popped values are not read afterwards
# in this listing.
popl %edx #
popl %ecx #
# Arguments, pushed right to left: 37, then the temporary's address.
pushl $37 #
pushl %eax # tmp88
call fill #
# Copy the temporary, dword by dword, into the global `obj`.
movl -56(%ebp), %eax #, tmp91
# retval2.c:38: objprint(&obj);
pushl $obj #
# retval2.c:37: obj = fill(37);
movl %eax, obj # tmp91, obj
movl -52(%ebp), %eax #, tmp93
movl %eax, obj+4 # tmp93, obj
movl -48(%ebp), %eax #, tmp95
movl %eax, obj+8 # tmp95, obj
movl -44(%ebp), %eax #, tmp97
movl %eax, obj+12 # tmp97, obj
movl -40(%ebp), %eax #, tmp99
movl %eax, obj+16 # tmp99, obj
movl -36(%ebp), %eax #, tmp101
movl %eax, obj+20 # tmp101, obj
movl -32(%ebp), %eax #, tmp103
movl %eax, obj+24 # tmp103, obj
movl -28(%ebp), %eax #, tmp105
movl %eax, obj+28 # tmp105, obj
movl -24(%ebp), %eax #, tmp107
movl %eax, obj+32 # tmp107, obj
movl -20(%ebp), %eax #, tmp109
movl %eax, obj+36 # tmp109, obj
# retval2.c:38: objprint(&obj);
call objprint #
# retval2.c:41: }
# Epilogue: reload the saved %ecx, return 0 in %eax, and restore the
# original (pre-alignment) stack pointer from %ecx.
movl -4(%ebp), %ecx #,
.cfi_def_cfa 1, 0
addl $16, %esp #,
xorl %eax, %eax #
leave
.cfi_restore 5
leal -4(%ecx), %esp #,
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE13:
.size main, .-main
# Storage for the global `struct obj obj`: 40 bytes in .data, made up of
# three initialised longs (1, 2, 3) followed by 28 zero bytes.
.globl obj
.data
.align 32
.type obj, @object
.size obj, 40
obj:
# x:
.long 1
.long 2
.long 3
.zero 28
.ident "GCC: (GNU) 8.3.1 20190223 (Red Hat 8.3.1-2)"
.section .note.GNU-stack,"",@progbits
To note:
Even though main did obj = fill(37);, it passed a pointer to a temp area on its stack frame rather than passing a pointer to the lvalue obj.
The caller did a manual copy from the temp area to the lvalue.