Search code examples
c++assemblyvisual-c++inline

How are inline functions compiled to assembly?


I have some code C++ that loops to update values, and out of curiousity I wanted to see the assembly that made up the loop of the body. This led me to experiment a little bit with how inlining looks like after compilation (the compiler is MSVC with O2).

However, when I compared the instruction set to how I thought it should look like when it's actually inlined, I was a bit confused by what I found. Here is some context:

template<typename T>
struct ClassWithInline
{
    Values *v;

    ClassWithInline(Values *v) : v{ v } {}
    T inlineMe(T * const c) const
    {
        // some function of *c, using v->some_constants
    }
};

The Values object is just something that contains constants. ClassWithInline is a member of another object, Owner, and owner has a function callTheInline:

struct Owner
{

    ClassWithInline<double> a;
    Values *v;

    Owner(Values *v) : a{ ClassWithInline<double>(v) }, v{ v } {}
    void callTheInline()
    {
        double *ptr = new double[100];
        double *dptr = new double[100];

        size_t the_end = std::floor(1000 + log(100000));

        for (size_t n = 0; n < the_end; ++n)
        {
            dptr[n] = a.inlineMe(ptr + n);
        }

        ClassWithInline<double> b(v);
        for (size_t n = 0; n < the_end; ++n)
        {
            dptr[n] = b.inlineMe(ptr + n);
        }
    }
};

(The wonky end iteration number is so the compiler doesn't know the size of the loop at compile time and introduce some other optimizations.)

Now when I look at the assembly generated for those for loops, they are dramatically different; in fact the one invoking inlineMe from a has twice as many assembly instructions. How do I bridge this disparity?

a.inlineMe(ptr + n);

000000013F642094  mov         rbp,rbx  
000000013F642097  mov         qword ptr [rsp+20h],r15  
000000013F64209C  sub         rbp,rsi  
000000013F64209F  lea         r15,[r9-3]  
000000013F6420A3  mov         r14,rsi  
000000013F6420A6  lea         r10,[rbx+8]  
000000013F6420AA  sub         r14,rbx  
000000013F6420AD  nop         dword ptr [rax]  
000000013F6420B0  mov         rcx,qword ptr [rdi]  
000000013F6420B3  lea         rdx,[r14+r10]  
000000013F6420B7  movsd       xmm0,mmword ptr [r10-8]  
000000013F6420BD  movsd       xmm1,mmword ptr [rdx+rbp-10h]  
000000013F6420C3  addsd       xmm1,mmword ptr [r10]  
000000013F6420C8  movsd       xmm2,mmword ptr [rdi+8]  
000000013F6420CD  lea         rax,[rcx+r8]  
000000013F6420D1  mulsd       xmm0,xmm3  
000000013F6420D5  mulsd       xmm2,xmm2  
000000013F6420D9  addsd       xmm1,mmword ptr [rbx+rax*8]  
000000013F6420DE  mov         rax,r8  
000000013F6420E1  sub         rax,rcx  
000000013F6420E4  addsd       xmm1,mmword ptr [rbx+rax*8]  
000000013F6420E9  subsd       xmm1,xmm0  
000000013F6420ED  divsd       xmm1,xmm2  
000000013F6420F1  movsd       mmword ptr [r14+r10-8],xmm1  
000000013F6420F8  movsd       xmm1,mmword ptr [r10+8]  
000000013F6420FE  addsd       xmm1,mmword ptr [r10-8]  
000000013F642104  mov         rcx,qword ptr [rdi]  
000000013F642107  movsd       xmm0,mmword ptr [r10]  
000000013F64210C  movsd       xmm2,mmword ptr [rdi+8]  
000000013F642111  mulsd       xmm0,xmm3  
000000013F642115  lea         rax,[rcx+r8]  
000000013F642119  mulsd       xmm2,xmm2  
000000013F64211D  addsd       xmm1,mmword ptr [rbx+rax*8+8]  
000000013F642123  mov         rax,r8  
000000013F642126  sub         rax,rcx  
000000013F642129  addsd       xmm1,mmword ptr [rbx+rax*8+8]  
000000013F64212F  subsd       xmm1,xmm0  
000000013F642133  divsd       xmm1,xmm2  
000000013F642137  movsd       mmword ptr [rdx],xmm1  
000000013F64213B  movsd       xmm1,mmword ptr [r10+10h]  
000000013F642141  addsd       xmm1,mmword ptr [r10]  
000000013F642146  mov         rcx,qword ptr [rdi]  
000000013F642149  movsd       xmm0,mmword ptr [r10+8]  
000000013F64214F  movsd       xmm2,mmword ptr [rdi+8]  
000000013F642154  mulsd       xmm0,xmm3  
000000013F642158  lea         rax,[rcx+r8]  
000000013F64215C  mulsd       xmm2,xmm2  
000000013F642160  addsd       xmm1,mmword ptr [rbx+rax*8+10h]  
000000013F642166  mov         rax,r8  
000000013F642169  sub         rax,rcx  
000000013F64216C  addsd       xmm1,mmword ptr [rbx+rax*8+10h]  
000000013F642172  subsd       xmm1,xmm0  
000000013F642176  divsd       xmm1,xmm2  
000000013F64217A  movsd       mmword ptr [r14+r10+8],xmm1  
000000013F642181  movsd       xmm1,mmword ptr [r10+18h]  
000000013F642187  addsd       xmm1,mmword ptr [r10+8]  
000000013F64218D  mov         rcx,qword ptr [rdi]  
000000013F642190  movsd       xmm0,mmword ptr [r10+10h]  
000000013F642196  movsd       xmm2,mmword ptr [rdi+8]  
000000013F64219B  mulsd       xmm0,xmm3  
000000013F64219F  lea         rax,[rcx+r8]  
000000013F6421A3  mulsd       xmm2,xmm2  
000000013F6421A7  addsd       xmm1,mmword ptr [rbx+rax*8+18h]  
000000013F6421AD  mov         rax,r8  
000000013F6421B0  add         r8,4  
000000013F6421B4  sub         rax,rcx  
000000013F6421B7  addsd       xmm1,mmword ptr [rbx+rax*8+18h]  
000000013F6421BD  subsd       xmm1,xmm0  
000000013F6421C1  divsd       xmm1,xmm2  
000000013F6421C5  movsd       mmword ptr [r14+r10+10h],xmm1  
000000013F6421CC  add         r10,20h  
000000013F6421D0  cmp         r8,r15  
000000013F6421D3  jb          Owner::callTheInline+0B0h (013F6420B0h) 

b.inlineMe(ptr + n);

000000013F6422A4  movsd       xmm1,mmword ptr [rcx+r10*8-10h]  
000000013F6422AB  addsd       xmm1,mmword ptr [rdx+rcx]  
000000013F6422B0  movsd       xmm0,mmword ptr [rdx+rcx-8]  
000000013F6422B6  mulsd       xmm0,xmm3  
000000013F6422BA  addsd       xmm1,mmword ptr [rcx+r8*8-8]  
000000013F6422C1  addsd       xmm1,mmword ptr [rcx-8]  
000000013F6422C6  subsd       xmm1,xmm0  
000000013F6422CA  divsd       xmm1,xmm5  
000000013F6422CE  movsd       mmword ptr [rdi+rcx-8],xmm1  
000000013F6422D4  movsd       xmm2,mmword ptr [rdx+rcx-8]  
000000013F6422DA  addsd       xmm2,mmword ptr [rdx+rcx+8]  
000000013F6422E0  movsd       xmm0,mmword ptr [rdx+rcx]  
000000013F6422E5  mulsd       xmm0,xmm3  
000000013F6422E9  addsd       xmm2,mmword ptr [rcx+r8*8]  
000000013F6422EF  addsd       xmm2,mmword ptr [rcx]  
000000013F6422F3  subsd       xmm2,xmm0  
000000013F6422F7  divsd       xmm2,xmm5  
000000013F6422FB  movsd       mmword ptr [rdi+rcx],xmm2  
000000013F642300  movsd       xmm0,mmword ptr [rdx+rcx+8]  
000000013F642306  movsd       xmm1,mmword ptr [rdx+rcx]  
000000013F64230B  addsd       xmm1,mmword ptr [rcx+rbp]  
000000013F642310  mulsd       xmm0,xmm3  
000000013F642314  addsd       xmm1,mmword ptr [rcx+r8*8+8]  
000000013F64231B  addsd       xmm1,mmword ptr [rcx+8]  
000000013F642320  subsd       xmm1,xmm0  
000000013F642324  divsd       xmm1,xmm5  
000000013F642328  movsd       mmword ptr [rdi+rcx+8],xmm1  
000000013F64232E  movsd       xmm2,mmword ptr [rcx+r10*8+18h]  
000000013F642335  addsd       xmm2,mmword ptr [rdx+rcx+8]  
000000013F64233B  movsd       xmm0,mmword ptr [rcx+rbp]  
000000013F642340  mulsd       xmm0,xmm3  
000000013F642344  addsd       xmm2,mmword ptr [rcx+r8*8+10h]  
000000013F64234B  addsd       xmm2,mmword ptr [rcx+10h]  
000000013F642350  subsd       xmm2,xmm0  
000000013F642354  divsd       xmm2,xmm5  
000000013F642358  movsd       mmword ptr [r14+rcx],xmm2  
000000013F64235E  add         rcx,20h  
000000013F642362  sub         rax,1  
000000013F642366  jne         Owner::callTheInline+2A4h (013F6422A4h)  

Solution

  • Inlining of functions has three main effects:

    • It removes the function-call overhead.
    • It allows the compiler to optimize across the boundaries of the function.
    • It allows the compiler to make hard assumtpions about hardcoded parameters passed to functions. This includes the this pointer to member functions.

    Inlining always happens before the C++ code is translated into assembly. The compiler essentially treats an inline function as if the source code of the called function was inserted at the place of the call. Almost. (In reality the compiler usually also compiles the inlined function into a plain normal function and assigns weak linkage to it, but this is then not used in the further inlining process. This is not of interest here.)

    In your example a is a member of Owner and b is a local variable on the stack. Both a and b maintain a state v.

    To address a the compiler needs to address it via the this pointer of Owner. To address b the compiler does not need to use the this pointer of Owner, it is just on the stack. This alone already makes quite a difference in the number of instructions. Actually this also depends whether the compiler was allowed to inline callTheInline() or not and what the compiler knows about the storage of the Owner instance.

    The value of a.v persists beyond the end of function callTheInline(), while b does not persist beyond the end of this function. This potentially allows the compiler to omit certain calculations. But b.v does not persist beyond the end of the function which allows the compiler to omit calculations inlineMe().