Does the rsp stack pointer have any use in returning a value from a function call?

I know there are several ways to return a value in assembly,

in rax register
in xmm0, xmm1 registers
on the stack (something like return value optimization in C++); this sometimes uses the rdi register to hold the address of a buffer in the caller's stack frame, and the value is then written directly to the address pointed to by rdi

I have the following code in assembly, the code should calculate a 3x3 matrix times a 3D vector, and then return a 3-dimensional vector in floats (4 bytes each, 12 bytes in total).

Everything is easy to understand until the last piece of code to return the value, which is so weird that I don't understand its purpose.

Assuming that the 3-D value is finally in xmm6, xmm7 and xmm8, the code is as follows,

Note: there is no sub rsp, xxx and add rsp, xxx in the beginning and end of this function, actually the stack is not used at all during the computation of matrix and vector multiplication.

; Spill the three result floats to memory below rsp (no sub rsp was done;
; presumably this relies on the System V red zone -- confirm the target ABI).
movss  [rsp-28h],xmm6
movss  [rsp-24h],xmm7
movss  [rsp-20h],xmm8

; Apparent intent: pack the first two floats into one 64-bit value in rax
; and store a second copy at [rsp-18h]; the third float goes to [rsp-10h].
; NOTE(review): a 32-bit shift masks its count to 5 bits, so "shl eax,20h"
; leaves eax unchanged; "shl rax,20h" was presumably meant -- confirm against
; the real disassembly.
; NOTE(review): writing ebx clobbers rbx, and no save/restore is visible here.
; NOTE(review): a 64-bit GPR move from an xmm register is usually spelled movq.
mov    eax,[rsp-24h]
shl    eax,20h
mov    ebx,[rsp-28h]
or     rax,rbx
mov    [rsp-18h],rax
movd   rcx,xmm8
mov    [rsp-10h],rcx

; Split rax back into two 32-bit halves and rewrite the first copy.
; NOTE(review): as written, the high half (eax after the shift) is stored at
; the lower address [rsp-28h] and the low half at [rsp-24h], which would swap
; the first two elements -- confirm the operand order in the original code.
mov    ebx,eax
shr    rax,20h
mov    [rsp-28h],eax
mov    [rsp-24h],ebx
mov    [rsp-20h],ecx

; Load the register copies: xmm1 = the float at [rsp-10h],
; xmm0 = the 8 bytes (two floats) at [rsp-28h].
movss  xmm1,[rsp-10h]
movsd  xmm0,[rsp-28h]

I tried compiling and running this code in a C program with asm(), and found that it actually just copies xmm6, xmm7 and xmm8 to three places: one at [rsp-28h], one at [rsp-18h], and one in xmm0 and xmm1. All of them hold the same floating-point values.

I know that xmm0 and xmm1 are most likely the return values, but do [rsp-28h] and [rsp-18h] have any use in returning the value? Why does the program use such a strange method (copy to the stack, then copy to rax and other registers, and then copy back to the stack again) to copy the results?

I am asking this because the caller code doesn't use xmm0 and xmm1. I am not sure whether I should conclude that the caller just discards the return values, or that the return values are actually somewhere in the caller's stack.

Thanks.

Solution

Caveat: This may not be exactly what you're talking about, but ...

rsp/rbp is used in the return process if a function returns, by value, a struct that is too large to be returned in registers:

The caller must reserve a [temp] area in its stack frame for the return value.
The first argument to the called function is an implicit/hidden pointer to this area.
The called function will copy the return value to the area pointed to by this pointer.
The caller will then copy from the temp area to the final variable.

Consider the following source:

#include <stdio.h>

struct obj {
    int x[10];                  /* ten ints; 40 bytes in the -m32 build (see `.size obj, 40`) */
};

struct obj                      /* the whole struct is returned by value */
fill(int arg)                   /* arg is only printed; it does not affect the result */
{
    struct obj obj = {          /* local object; the file-scope obj is defined later */
        .x = { 4, 5, 6 }        /* x[3]..x[9] are implicitly zero-initialized */
    };

    printf("fill: arg=%d\n",arg);

    return obj;                 /* in the listing this becomes stores through .result_ptr */
}

void                            /* prints the ten elements of obj->x on one line */
objprint(const struct obj *obj)
{

    for (int i = 0;  i < 10;  ++i)
        printf(" %d",obj->x[i]);
    printf("\n");               /* the listing shows this emitted as a putchar call */
}

struct obj obj = {              /* file-scope object that main overwrites with fill()'s result */
    .x = { 1, 2, 3 }            /* remaining elements are zero */
};

int
main(void)
{

    objprint(&obj);             /* show the initial contents: 1 2 3 0 ... */
    obj = fill(37);             /* struct returned by value, then assigned to the global */
    objprint(&obj);             /* show the contents after the assignment */

    return 0;
}

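For clarity, we compile with -m32, but the principle is the same for x86_64 (there the hidden pointer is passed in a register, rdi under the System V ABI, instead of on the stack). We compile with: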

cc \
-fno-inline-small-functions \
-fno-inline-functions-called-once \
-fno-inline-functions \
-fomit-frame-pointer \
-S -fverbose-asm -O2 -m32 retval2.c

Here is the [redacted] assembler output. There's quite a bit of movement to/from stack frames.

    .file   "retval2.c"
    .text
    # Format string used by fill()'s printf call.
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
    .string "fill: arg=%d\n"
    .text
    .p2align 4,,15
    .globl  fill
    .type   fill, @function
    # struct obj fill(int arg) -- 32-bit build.
    # The caller pushes arg and then a hidden pointer to a result buffer
    # (.result_ptr), so on entry the hidden pointer is at 4(%esp) and arg at
    # 8(%esp). fill writes the 40-byte struct through that pointer and hands
    # the same pointer back in %eax.
fill:
.LFB11:
    .cfi_startproc
    # Preserve the caller's %ebx (restored by the popl before returning).
    pushl   %ebx    #
    .cfi_def_cfa_offset 8
    .cfi_offset 3, -8
    subl    $16, %esp   #,
    .cfi_def_cfa_offset 24
    # After the 4-byte push and the 16-byte adjustment, the hidden pointer is
    # at 24(%esp) and arg at 28(%esp). %ebx keeps the pointer live across the
    # printf call.
# retval2.c:9: {
    movl    24(%esp), %ebx  # .result_ptr, .result_ptr
# retval2.c:14:     printf("fill: arg=%d\n",arg);
    pushl   28(%esp)    # arg
    .cfi_def_cfa_offset 28
    pushl   $.LC0   #
    .cfi_def_cfa_offset 32
    call    printf  #
    # Build the return value directly in the caller-supplied buffer:
    # x[0..2] = 4, 5, 6 and x[3..9] = 0 (byte offsets 0 through 36).
# retval2.c:16:     return obj;
    movl    $4, (%ebx)  #, MEM[(struct obj *)&<retval>]
    # The value returned in %eax is the hidden pointer itself.
# retval2.c:17: }
    movl    %ebx, %eax  # .result_ptr,
# retval2.c:16:     return obj;
    movl    $5, 4(%ebx) #, MEM[(struct obj *)&<retval> + 4B]
    movl    $6, 8(%ebx) #, MEM[(struct obj *)&<retval> + 8B]
    movl    $0, 12(%ebx)    #, MEM[(struct obj *)&<retval> + 12B]
    movl    $0, 16(%ebx)    #, MEM[(struct obj *)&<retval> + 16B]
    movl    $0, 20(%ebx)    #, MEM[(struct obj *)&<retval> + 20B]
    movl    $0, 24(%ebx)    #, MEM[(struct obj *)&<retval> + 24B]
    movl    $0, 28(%ebx)    #, MEM[(struct obj *)&<retval> + 28B]
    movl    $0, 32(%ebx)    #, MEM[(struct obj *)&<retval> + 32B]
    movl    $0, 36(%ebx)    #, MEM[(struct obj *)&<retval> + 36B]
    # Release the 16 reserved bytes plus the two 4-byte printf arguments.
# retval2.c:17: }
    addl    $24, %esp   #,
    .cfi_def_cfa_offset 8
    popl    %ebx    #
    .cfi_restore 3
    .cfi_def_cfa_offset 4
    # "ret $4" pops the return address and then 4 more bytes, so the callee
    # removes the hidden pointer argument from the stack.
    ret $4      #
    .cfi_endproc
.LFE11:
    .size   fill, .-fill
    # Format string used by objprint()'s per-element printf call.
    .section    .rodata.str1.1
.LC1:
    .string " %d"
    .text
    .p2align 4,,15
    .globl  objprint
    .type   objprint, @function
    # void objprint(const struct obj *obj) -- 32-bit build.
    # The index loop has been turned into a pointer walk over the 40 bytes of
    # obj->x: %ebx is the current element, %esi is one past the last element.
objprint:
.LFB12:
    .cfi_startproc
    # Preserve the caller's %esi and %ebx (both restored before the final jmp).
    pushl   %esi    #
    .cfi_def_cfa_offset 8
    .cfi_offset 6, -8
    pushl   %ebx    #
    .cfi_def_cfa_offset 12
    .cfi_offset 3, -12
    subl    $4, %esp    #,
    .cfi_def_cfa_offset 16
    # With 12 bytes pushed/reserved above the return address, the obj argument
    # is at 16(%esp).
# retval2.c:21: {
    movl    16(%esp), %ebx  # obj, ivtmp.15
    leal    40(%ebx), %esi  #, _16
    .p2align 4,,10
    .p2align 3
.L5:
    # Each iteration reserves 8 bytes, pushes the element and the format
    # string (16 bytes in total), calls printf, then releases all 16 bytes.
# retval2.c:24:         printf(" %d",obj->x[i]);
    subl    $8, %esp    #,
    .cfi_def_cfa_offset 24
    pushl   (%ebx)  # MEM[base: _14, offset: 0B]
    .cfi_def_cfa_offset 28
    addl    $4, %ebx    #, ivtmp.15
    pushl   $.LC1   #
    .cfi_def_cfa_offset 32
    call    printf  #
# retval2.c:23:     for (int i = 0;  i < 10;  ++i)
    addl    $16, %esp   #,
    .cfi_def_cfa_offset 16
    cmpl    %esi, %ebx  # _16, ivtmp.15
    jne .L5 #,
    # Overwrite the incoming argument slot with 10 ('\n'); once the epilogue
    # below has run, that slot is at 4(%esp), i.e. putchar's argument.
# retval2.c:25:     printf("\n");
    movl    $10, 16(%esp)   #,
# retval2.c:26: }
    addl    $4, %esp    #,
    .cfi_def_cfa_offset 12
    popl    %ebx    #
    .cfi_restore 3
    .cfi_def_cfa_offset 8
    popl    %esi    #
    .cfi_restore 6
    .cfi_def_cfa_offset 4
    # Tail call: the printf("\n") from the source is emitted as a jump to
    # putchar, which returns straight to objprint's caller.
# retval2.c:25:     printf("\n");
    jmp putchar #
    .cfi_endproc
.LFE12:
    .size   objprint, .-objprint
    # int main(void) -- 32-bit build.
    # Shows the caller's side of returning a struct by value: main passes a
    # pointer to a temporary in its own frame and afterwards copies that
    # temporary into the global obj.
    .section    .text.startup,"ax",@progbits
    .p2align 4,,15
    .globl  main
    .type   main, @function
main:
.LFB13:
    .cfi_startproc
    # Align %esp to 16 bytes. %ecx remembers the incoming stack position (+4)
    # and is saved at -4(%ebp) so the epilogue can restore %esp from it.
    leal    4(%esp), %ecx   #,
    .cfi_def_cfa 1, 0
    andl    $-16, %esp  #,
    pushl   -4(%ecx)    #
    pushl   %ebp    #
    .cfi_escape 0x10,0x5,0x2,0x75,0
    movl    %esp, %ebp  #,
    pushl   %ecx    #
    .cfi_escape 0xf,0x3,0x75,0x7c,0x6
    subl    $64, %esp   #,
# retval2.c:36:     objprint(&obj);
    pushl   $obj    #
    call    objprint    #
    # %eax = address of the temporary result area in main's frame; the copy
    # below reads it from -56(%ebp) up to -20(%ebp), i.e. 40 bytes.
# retval2.c:37:     obj = fill(37);
    leal    -56(%ebp), %eax #, tmp88
    # Pop two 4-byte slots; the popped values are not used afterwards.
    popl    %edx    #
    popl    %ecx    #
    # Arguments are pushed last-to-first: arg = 37, then the hidden pointer
    # to the temporary as the first argument.
    pushl   $37 #
    pushl   %eax    # tmp88
    call    fill    #
    # fill has written the struct into the temporary. Copy it one 32-bit word
    # at a time into the global obj; the pushl $obj for the next objprint
    # call is interleaved with this copy.
    movl    -56(%ebp), %eax #, tmp91
# retval2.c:38:     objprint(&obj);
    pushl   $obj    #
# retval2.c:37:     obj = fill(37);
    movl    %eax, obj   # tmp91, obj
    movl    -52(%ebp), %eax #, tmp93
    movl    %eax, obj+4 # tmp93, obj
    movl    -48(%ebp), %eax #, tmp95
    movl    %eax, obj+8 # tmp95, obj
    movl    -44(%ebp), %eax #, tmp97
    movl    %eax, obj+12    # tmp97, obj
    movl    -40(%ebp), %eax #, tmp99
    movl    %eax, obj+16    # tmp99, obj
    movl    -36(%ebp), %eax #, tmp101
    movl    %eax, obj+20    # tmp101, obj
    movl    -32(%ebp), %eax #, tmp103
    movl    %eax, obj+24    # tmp103, obj
    movl    -28(%ebp), %eax #, tmp105
    movl    %eax, obj+28    # tmp105, obj
    movl    -24(%ebp), %eax #, tmp107
    movl    %eax, obj+32    # tmp107, obj
    movl    -20(%ebp), %eax #, tmp109
    movl    %eax, obj+36    # tmp109, obj
# retval2.c:38:     objprint(&obj);
    call    objprint    #
    # Epilogue: reload the saved %ecx, set the return value to 0, and restore
    # %ebp and the original %esp.
# retval2.c:41: }
    movl    -4(%ebp), %ecx  #,
    .cfi_def_cfa 1, 0
    addl    $16, %esp   #,
    xorl    %eax, %eax  #
    leave
    .cfi_restore 5
    leal    -4(%ecx), %esp  #,
    .cfi_def_cfa 4, 4
    ret
    .cfi_endproc
.LFE13:
    .size   main, .-main
    # Initialized image of the global "struct obj obj": 40 bytes in .data,
    # x[0..2] = 1, 2, 3 followed by 28 zero bytes for x[3..9].
    .globl  obj
    .data
    .align 32
    .type   obj, @object
    .size   obj, 40
obj:
# x:
    .long   1
    .long   2
    .long   3
    .zero   28
    .ident  "GCC: (GNU) 8.3.1 20190223 (Red Hat 8.3.1-2)"
    .section    .note.GNU-stack,"",@progbits

To note:

Even though main did: obj = fill(37);, it passed a pointer to a temp area on its stack frame rather than passing a pointer to the lvalue obj.

The caller did a manual copy from the temp area to the lvalue.