Why is this version of strrev faster than mine?

I can't read assembly code, so my assumptions may be completely wrong!

Here's my code:

/*
 * Reverse the NUL-terminated string str in place.
 *
 * The swap works outwards from the middle: iteration i exchanges the
 * character just left of the centre (moved i places further left) with its
 * mirror image on the right.  str must point to a writable, NUL-terminated
 * buffer.
 */
void reverse(char* str)
{
    size_t len = strlen(str);
    size_t size = len / 2;
    /* An odd-length string has a centre character that stays in place, so
       the right half starts one position later than in the even case.
       Without this offset, odd-length input is reversed incorrectly. */
    size_t right = size + (len & 1);
    char tmp;
    /* size_t index: same type as size, so the comparison is not mixed
       signed/unsigned and cannot overflow an int on very long strings. */
    for (size_t i = 0; i < size; ++i)
    {
        tmp = str[size - i - 1];
        str[size - i - 1] = str[right + i];
        str[right + i] = tmp;
    }
}

And here's the asm output:

000000000000073a <reverse>:
 73a:   55                      push   %rbp
 73b:   48 89 e5                mov    %rsp,%rbp
 73e:   48 83 ec 20             sub    $0x20,%rsp
 742:   48 89 7d e8             mov    %rdi,-0x18(%rbp)
 746:   48 8b 45 e8             mov    -0x18(%rbp),%rax
 74a:   48 89 c7                mov    %rax,%rdi
 74d:   e8 9e fe ff ff          callq  5f0 <strlen@plt>
 752:   48 d1 e8                shr    %rax
 755:   48 89 45 f8             mov    %rax,-0x8(%rbp)
 759:   c7 45 f4 00 00 00 00    movl   $0x0,-0xc(%rbp)
 760:   eb 72                   jmp    7d4 <reverse+0x9a>
 762:   8b 45 f4                mov    -0xc(%rbp),%eax
 765:   48 98                   cltq   
 767:   48 8b 55 f8             mov    -0x8(%rbp),%rdx
 76b:   48 29 c2                sub    %rax,%rdx
 76e:   48 89 d0                mov    %rdx,%rax
 771:   48 8d 50 ff             lea    -0x1(%rax),%rdx
 775:   48 8b 45 e8             mov    -0x18(%rbp),%rax
 779:   48 01 d0                add    %rdx,%rax
 77c:   0f b6 00                movzbl (%rax),%eax
 77f:   88 45 f3                mov    %al,-0xd(%rbp)
 782:   8b 45 f4                mov    -0xc(%rbp),%eax
 785:   48 63 d0                movslq %eax,%rdx
 788:   48 8b 45 f8             mov    -0x8(%rbp),%rax
 78c:   48 01 c2                add    %rax,%rdx
 78f:   48 8b 45 e8             mov    -0x18(%rbp),%rax
 793:   48 01 d0                add    %rdx,%rax
 796:   8b 55 f4                mov    -0xc(%rbp),%edx
 799:   48 63 d2                movslq %edx,%rdx
 79c:   48 8b 4d f8             mov    -0x8(%rbp),%rcx
 7a0:   48 29 d1                sub    %rdx,%rcx
 7a3:   48 89 ca                mov    %rcx,%rdx
 7a6:   48 8d 4a ff             lea    -0x1(%rdx),%rcx
 7aa:   48 8b 55 e8             mov    -0x18(%rbp),%rdx
 7ae:   48 01 ca                add    %rcx,%rdx
 7b1:   0f b6 00                movzbl (%rax),%eax
 7b4:   88 02                   mov    %al,(%rdx)
 7b6:   8b 45 f4                mov    -0xc(%rbp),%eax
 7b9:   48 63 d0                movslq %eax,%rdx
 7bc:   48 8b 45 f8             mov    -0x8(%rbp),%rax
 7c0:   48 01 c2                add    %rax,%rdx
 7c3:   48 8b 45 e8             mov    -0x18(%rbp),%rax
 7c7:   48 01 c2                add    %rax,%rdx
 7ca:   0f b6 45 f3             movzbl -0xd(%rbp),%eax
 7ce:   88 02                   mov    %al,(%rdx)
 7d0:   83 45 f4 01             addl   $0x1,-0xc(%rbp)
 7d4:   8b 45 f4                mov    -0xc(%rbp),%eax
 7d7:   48 98                   cltq   
 7d9:   48 39 45 f8             cmp    %rax,-0x8(%rbp)
 7dd:   77 83                   ja     762 <reverse+0x28>
 7df:   90                      nop
 7e0:   c9                      leaveq 
 7e1:   c3                      retq

And here's the other version:

/*
 * Reverse the NUL-terminated string str in place.
 *
 * Two indices walk towards each other: i from the first character, j from
 * the last, swapping as they go until they meet.  str must point to a
 * writable, NUL-terminated buffer.
 */
void strrev2(unsigned char *str)
{
    /* Keep the length in size_t, the type strlen returns, so it is not
       truncated for strings longer than an int can index. */
    size_t len = strlen((const char *)str);

    /* Nothing to swap for empty or one-character strings.  Returning early
       also keeps len - 1 from wrapping around when len is 0. */
    if (len < 2)
    {
        return;
    }

    for (size_t i = 0, j = len - 1; i < j; i++, j--)
    {
        unsigned char a = str[i];
        str[i] = str[j];
        str[j] = a;
    }
}

And the asm:

00000000000007e2 <strrev2>:
 7e2:   55                      push   %rbp
 7e3:   48 89 e5                mov    %rsp,%rbp
 7e6:   48 83 ec 20             sub    $0x20,%rsp
 7ea:   48 89 7d e8             mov    %rdi,-0x18(%rbp)
 7ee:   48 8b 45 e8             mov    -0x18(%rbp),%rax
 7f2:   48 89 c7                mov    %rax,%rdi
 7f5:   e8 f6 fd ff ff          callq  5f0 <strlen@plt>
 7fa:   89 45 fc                mov    %eax,-0x4(%rbp)
 7fd:   c7 45 f4 00 00 00 00    movl   $0x0,-0xc(%rbp)
 804:   8b 45 fc                mov    -0x4(%rbp),%eax
 807:   83 e8 01                sub    $0x1,%eax
 80a:   89 45 f8                mov    %eax,-0x8(%rbp)
 80d:   eb 4d                   jmp    85c <strrev2+0x7a>
 80f:   8b 45 f4                mov    -0xc(%rbp),%eax
 812:   48 63 d0                movslq %eax,%rdx
 815:   48 8b 45 e8             mov    -0x18(%rbp),%rax
 819:   48 01 d0                add    %rdx,%rax
 81c:   0f b6 00                movzbl (%rax),%eax
 81f:   88 45 f3                mov    %al,-0xd(%rbp)
 822:   8b 45 f8                mov    -0x8(%rbp),%eax
 825:   48 63 d0                movslq %eax,%rdx
 828:   48 8b 45 e8             mov    -0x18(%rbp),%rax
 82c:   48 01 d0                add    %rdx,%rax
 82f:   8b 55 f4                mov    -0xc(%rbp),%edx
 832:   48 63 ca                movslq %edx,%rcx
 835:   48 8b 55 e8             mov    -0x18(%rbp),%rdx
 839:   48 01 ca                add    %rcx,%rdx
 83c:   0f b6 00                movzbl (%rax),%eax
 83f:   88 02                   mov    %al,(%rdx)
 841:   8b 45 f8                mov    -0x8(%rbp),%eax
 844:   48 63 d0                movslq %eax,%rdx
 847:   48 8b 45 e8             mov    -0x18(%rbp),%rax
 84b:   48 01 c2                add    %rax,%rdx
 84e:   0f b6 45 f3             movzbl -0xd(%rbp),%eax
 852:   88 02                   mov    %al,(%rdx)
 854:   83 45 f4 01             addl   $0x1,-0xc(%rbp)
 858:   83 6d f8 01             subl   $0x1,-0x8(%rbp)
 85c:   8b 45 f4                mov    -0xc(%rbp),%eax
 85f:   3b 45 f8                cmp    -0x8(%rbp),%eax
 862:   7c ab                   jl     80f <strrev2+0x2d>
 864:   90                      nop
 865:   c9                      leaveq 
 866:   c3                      retq

Why is the second version faster (I assume it is, because there are fewer instructions), and why does objdump produce more assembly instructions for my code?

My code uses less memory, but I thought it would also be faster, because I only increment one variable (i) and I don't cast when using strlen().

Solution

That piece here: size - i - 1

That is ruining the performance for you, as that calculation is actually being performed every single loop iteration.

Your assumption about using "less memory" is wrong. In an optimized build these variables don't end up in memory at all, in either of the algorithms; they are kept purely within registers. (The listings in the question appear to come from an unoptimized build, which is why every local there is loaded from and stored to the stack.) So there was no memory access to eliminate in the first place; the only thing your optimization achieved was to introduce additional arithmetic, which is now slowing down the loop.

The most complex form of addressing the x86 architecture can handle in a single instruction is base + index*scale + constant, i.e. roughly variable[variable + constant]. Anything more complex than that, and the pointer arithmetic has to be performed with multiple instructions instead.

Also, the compiler unrolled the code, correctly estimating the effects of up to 3 iterations in a row. For the code with i and j that means incrementing only once every 3 iterations, and using constant offsets in between. For your code, it meant redoing the address calculation over and over again.