Search code examples
c++cassemblyx86-64

Does the rsp stack pointer have any use in returning a value from a function call?


I know there are several ways to return a value in assembly,

  1. in rax register
  2. in xmm0, xmm1 registers
  3. on the stack (something like return value optimization in C++): the caller sometimes passes the address of an area in its own stack frame in the rdi register, and the callee then writes the return value directly to the address pointed to by rdi

I have the following code in assembly, the code should calculate a 3x3 matrix times a 3D vector, and then return a 3-dimensional vector in floats (4 bytes each, 12 bytes in total).

Everything is easy to understand until the last piece of code to return the value, which is so weird that I don't understand its purpose.

Assuming that the 3-D value is finally in xmm6, xmm7 and xmm8, the code is as follows,

Note: there is no sub rsp, xxx and add rsp, xxx in the beginning and end of this function, actually the stack is not used at all during the computation of matrix and vector multiplication.

movss  [rsp-28h],xmm6    ; spill the three result floats to three consecutive
movss  [rsp-24h],xmm7    ; dwords at rsp-28h, rsp-24h and rsp-20h
movss  [rsp-20h],xmm8    ; NOTE(review): these stores are below rsp with no sub rsp -- presumably the SysV red zone; confirm the ABI

mov    eax,[rsp-24h]     ; eax = bit pattern of the float stored from xmm7 (upper half of rax is zeroed)
shl    eax,20h           ; NOTE(review): with a 32-bit operand the shift count is masked to 5 bits, so 20h acts as 0; `shl rax,20h` was presumably meant -- confirm against the real disassembly
mov    ebx,[rsp-28h]     ; ebx = bit pattern of the float stored from xmm6
or     rax,rbx           ; rax |= rbx
mov    [rsp-18h],rax     ; 8-byte integer copy at rsp-18h
movd   rcx,xmm8          ; rcx = low 64 bits of xmm8
mov    [rsp-10h],rcx     ; 8-byte integer copy at rsp-10h

mov    ebx,eax           ; ebx = low dword of rax
shr    rax,20h           ; eax = former high dword of rax
mov    [rsp-28h],eax     ; rewrite the three dwords at rsp-28h.. from the
mov    [rsp-24h],ebx     ; integer registers
mov    [rsp-20h],ecx

movss  xmm1,[rsp-10h]    ; xmm1 = one float, loaded from the rcx copy stored above
movsd  xmm0,[rsp-28h]    ; xmm0 = 8 bytes (two packed floats) starting at rsp-28h

I tried to compile and run this code in a C program with asm(), and found that actually it just copies the xmm6,xmm7 and xmm8 in 3 places, one in [rsp-28h], one in [rsp-18h], and one in xmm1 and xmm0, all are the same floating point values.

I know that xmm0 and xmm1 are most likely the return values, but do [rsp-28h] and [rsp-18h] have any use in returning the value? Why does the program use such a strange method (copy to the stack, then copy to rax and other registers, and then copy back to the stack again) to copy the results?

I am asking this because the caller code doesn't use xmm0 and xmm1. I am not sure whether the caller just discards the return values, or whether the return values are actually somewhere in the caller's stack.

Thanks.


Solution

  • Caveat: This may not be exactly what you're talking about, but ...

    rsp/rbp is used in the return process if a function returns a struct by value

    1. The caller must reserve a [temp] area in its stack frame for the return value.
    2. The first argument to the called function is an implicit/hidden pointer to this area.
    3. The called function will copy the return value to the area pointed to by this pointer
    4. The caller will then copy from the temp area to the final variable.

    Consider the following source:

    #include <stdio.h>
    
    struct obj {                        /* aggregate that fill() returns by value */
        int x[10];                      /* ten ints; only the first three get non-zero values below */
    };
    
    struct obj                          /* fill: print arg, then return a struct obj by value */
    fill(int arg)
    {
        struct obj obj = {              /* local that shadows the file-scope `obj` */
            .x = { 4, 5, 6 }            /* x[3]..x[9] are implicitly zero-initialized */
        };
    
        printf("fill: arg=%d\n",arg);
    
        return obj;                     /* by-value return; see `.result_ptr` in the listing below */
    }
    
    void                                /* objprint: print all ten elements of obj->x on one line */
    objprint(const struct obj *obj)
    {
    
        for (int i = 0;  i < 10;  ++i)  /* each element is preceded by a single space */
            printf(" %d",obj->x[i]);
        printf("\n");                   /* terminate the line */
    }
    
    struct obj obj = {                  /* file-scope object that main() prints and then overwrites */
        .x = { 1, 2, 3 }                /* remaining elements are zero */
    };
    
    int                                 /* main: print obj, replace it with fill(37), print it again */
    main(void)
    {
    
        objprint(&obj);                 /* before: initial contents of the global */
        obj = fill(37);                 /* the by-value struct return under study */
        objprint(&obj);                 /* after: contents produced by fill() */
    
        return 0;
    }
    

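    For clarity, we compile with -m32, but the principle is the same for x86_64 whenever the struct is returned in memory (there the hidden pointer is passed in rdi). Note that under the System V AMD64 ABI a small struct of at most 16 bytes, such as three floats, is returned in registers (xmm0/xmm1) instead, with no hidden pointer. We compile with: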

    cc \
    -fno-inline-small-functions \
    -fno-inline-functions-called-once \
    -fno-inline-functions \
    -fomit-frame-pointer \
    -S -fverbose-asm -O2 -m32 retval2.c
    

    Here is the [redacted] assembler output. There's quite a bit of movement to/from stack frames.

        .file   "retval2.c"
        .text
        .section    .rodata.str1.1,"aMS",@progbits,1
    .LC0:
        .string "fill: arg=%d\n"
        .text
        .p2align 4,,15
        .globl  fill
        .type   fill, @function
    # fill: compiled form of `struct obj fill(int arg)`.
    # On entry (before the pushes below): 4(%esp) = hidden pointer to the
    # caller's result area (.result_ptr), 8(%esp) = arg.
    # On exit: %eax = that same pointer; `ret $4` also pops the pointer slot.
    fill:
    .LFB11:
        .cfi_startproc
        pushl   %ebx    # save %ebx; it carries .result_ptr across the printf call
        .cfi_def_cfa_offset 8
        .cfi_offset 3, -8
        subl    $16, %esp   #,
        .cfi_def_cfa_offset 24
    # retval2.c:9: {
    # 24 = 4 (return address) + 4 (saved %ebx) + 16 (subl): the hidden pointer slot.
        movl    24(%esp), %ebx  # .result_ptr, .result_ptr
    # retval2.c:14:     printf("fill: arg=%d\n",arg);
        pushl   28(%esp)    # arg
        .cfi_def_cfa_offset 28
        pushl   $.LC0   #
        .cfi_def_cfa_offset 32
        call    printf  #
    # retval2.c:16:     return obj;
    # The initializer { 4, 5, 6 } and the zero tail are stored directly through
    # %ebx into the caller-provided area.
        movl    $4, (%ebx)  #, MEM[(struct obj *)&<retval>]
    # retval2.c:17: }
    # The result pointer is also handed back in %eax.
        movl    %ebx, %eax  # .result_ptr,
    # retval2.c:16:     return obj;
        movl    $5, 4(%ebx) #, MEM[(struct obj *)&<retval> + 4B]
        movl    $6, 8(%ebx) #, MEM[(struct obj *)&<retval> + 8B]
        movl    $0, 12(%ebx)    #, MEM[(struct obj *)&<retval> + 12B]
        movl    $0, 16(%ebx)    #, MEM[(struct obj *)&<retval> + 16B]
        movl    $0, 20(%ebx)    #, MEM[(struct obj *)&<retval> + 20B]
        movl    $0, 24(%ebx)    #, MEM[(struct obj *)&<retval> + 24B]
        movl    $0, 28(%ebx)    #, MEM[(struct obj *)&<retval> + 28B]
        movl    $0, 32(%ebx)    #, MEM[(struct obj *)&<retval> + 32B]
        movl    $0, 36(%ebx)    #, MEM[(struct obj *)&<retval> + 36B]
    # retval2.c:17: }
    # 24 = the 16 bytes from subl plus the two 4-byte printf argument pushes.
        addl    $24, %esp   #,
        .cfi_def_cfa_offset 8
        popl    %ebx    #
        .cfi_restore 3
        .cfi_def_cfa_offset 4
    # `ret $4` pops the return address plus 4 more bytes: the hidden pointer slot.
        ret $4      #
        .cfi_endproc
    .LFE11:
        .size   fill, .-fill
        .section    .rodata.str1.1
    .LC1:
        .string " %d"
        .text
        .p2align 4,,15
        .globl  objprint
        .type   objprint, @function
    # objprint: compiled form of `void objprint(const struct obj *obj)`.
    # The index loop has been turned into a pointer walk:
    #   %ebx = cursor into obj->x, %esi = one-past-the-end (obj + 40 bytes).
    # The final printf("\n") is emitted as a tail jump to putchar with 10.
    objprint:
    .LFB12:
        .cfi_startproc
        pushl   %esi    # save %esi (used as the end pointer)
        .cfi_def_cfa_offset 8
        .cfi_offset 6, -8
        pushl   %ebx    # save %ebx (used as the cursor)
        .cfi_def_cfa_offset 12
        .cfi_offset 3, -12
        subl    $4, %esp    #,
        .cfi_def_cfa_offset 16
    # retval2.c:21: {
    # 16 = 4 (return address) + 8 (two saved registers) + 4 (subl): the obj argument.
        movl    16(%esp), %ebx  # obj, ivtmp.15
        leal    40(%ebx), %esi  #, _16
        .p2align 4,,10
        .p2align 3
    .L5:
    # retval2.c:24:         printf(" %d",obj->x[i]);
    # 8 bytes of padding + two 4-byte arguments = the 16 bytes released after the call.
        subl    $8, %esp    #,
        .cfi_def_cfa_offset 24
        pushl   (%ebx)  # MEM[base: _14, offset: 0B]
        .cfi_def_cfa_offset 28
        addl    $4, %ebx    #, ivtmp.15
        pushl   $.LC1   #
        .cfi_def_cfa_offset 32
        call    printf  #
    # retval2.c:23:     for (int i = 0;  i < 10;  ++i)
        addl    $16, %esp   #,
        .cfi_def_cfa_offset 16
    # Loop until the cursor reaches the end pointer.
        cmpl    %esi, %ebx  # _16, ivtmp.15
        jne .L5 #,
    # retval2.c:25:     printf("\n");
    # Overwrite the incoming argument slot (the one obj was loaded from) with
    # 10, the character code of '\n'; it becomes putchar's argument.
        movl    $10, 16(%esp)   #,
    # retval2.c:26: }
        addl    $4, %esp    #,
        .cfi_def_cfa_offset 12
        popl    %ebx    #
        .cfi_restore 3
        .cfi_def_cfa_offset 8
        popl    %esi    #
        .cfi_restore 6
        .cfi_def_cfa_offset 4
    # retval2.c:25:     printf("\n");
    # Tail call: putchar returns directly to objprint's caller.
        jmp putchar #
        .cfi_endproc
    .LFE12:
        .size   objprint, .-objprint
        .section    .text.startup,"ax",@progbits
        .p2align 4,,15
        .globl  main
        .type   main, @function
    # main: compiled form of `int main(void)`.
    # The 40-byte temp area for fill()'s result lives at -56(%ebp)..-17(%ebp)
    # in main's own frame; its address is passed to fill as the hidden first
    # argument, and the result is then copied dword by dword into the global obj.
    main:
    .LFB13:
        .cfi_startproc
    # Keep the incoming stack address in %ecx, then align %esp down to 16 bytes.
        leal    4(%esp), %ecx   #,
        .cfi_def_cfa 1, 0
        andl    $-16, %esp  #,
        pushl   -4(%ecx)    #
        pushl   %ebp    #
        .cfi_escape 0x10,0x5,0x2,0x75,0
        movl    %esp, %ebp  #,
        pushl   %ecx    #
        .cfi_escape 0xf,0x3,0x75,0x7c,0x6
        subl    $64, %esp   #,
    # retval2.c:36:     objprint(&obj);
        pushl   $obj    #
        call    objprint    #
    # retval2.c:37:     obj = fill(37);
    # %eax = address of the temp area in main's frame (not the address of obj).
        leal    -56(%ebp), %eax #, tmp88
    # Release 8 bytes of stack (objprint's argument and 4 more); the popped
    # values are not read.
        popl    %edx    #
        popl    %ecx    #
    # Arguments go on right to left: arg = 37 first, then the hidden pointer,
    # which therefore sits in the first argument slot.
        pushl   $37 #
        pushl   %eax    # tmp88
        call    fill    #
    # The pointer fill returned in %eax is not used; the temp area is addressed
    # via %ebp and copied into obj one dword at a time.
        movl    -56(%ebp), %eax #, tmp91
    # retval2.c:38:     objprint(&obj);
        pushl   $obj    #
    # retval2.c:37:     obj = fill(37);
        movl    %eax, obj   # tmp91, obj
        movl    -52(%ebp), %eax #, tmp93
        movl    %eax, obj+4 # tmp93, obj
        movl    -48(%ebp), %eax #, tmp95
        movl    %eax, obj+8 # tmp95, obj
        movl    -44(%ebp), %eax #, tmp97
        movl    %eax, obj+12    # tmp97, obj
        movl    -40(%ebp), %eax #, tmp99
        movl    %eax, obj+16    # tmp99, obj
        movl    -36(%ebp), %eax #, tmp101
        movl    %eax, obj+20    # tmp101, obj
        movl    -32(%ebp), %eax #, tmp103
        movl    %eax, obj+24    # tmp103, obj
        movl    -28(%ebp), %eax #, tmp105
        movl    %eax, obj+28    # tmp105, obj
        movl    -24(%ebp), %eax #, tmp107
        movl    %eax, obj+32    # tmp107, obj
        movl    -20(%ebp), %eax #, tmp109
        movl    %eax, obj+36    # tmp109, obj
    # retval2.c:38:     objprint(&obj);
        call    objprint    #
    # retval2.c:41: }
    # Reload the stack address saved in the prologue, set the return value to 0,
    # and unwind the realigned frame.
        movl    -4(%ebp), %ecx  #,
        .cfi_def_cfa 1, 0
        addl    $16, %esp   #,
        xorl    %eax, %eax  #
        leave
        .cfi_restore 5
        leal    -4(%ecx), %esp  #,
        .cfi_def_cfa 4, 4
        ret
        .cfi_endproc
    .LFE13:
        .size   main, .-main
        .globl  obj
        .data
        .align 32
        .type   obj, @object
        .size   obj, 40
    # Initialized image of the global `struct obj obj`: x[0..2] = 1, 2, 3,
    # followed by 28 zero bytes for x[3..9] (3*4 + 28 = 40 bytes in total).
    obj:
    # x:
        .long   1
        .long   2
        .long   3
        .zero   28
        .ident  "GCC: (GNU) 8.3.1 20190223 (Red Hat 8.3.1-2)"
        .section    .note.GNU-stack,"",@progbits
    

    To note:

    Even though main did: obj = fill(37);, it passed a pointer to a temp area on its stack frame rather than passing a pointer to the lvalue obj.

    The caller did a manual copy from the temp area to the lvalue.