I can't read assembly code, so my assumptions may be completely wrong!
Here's my code :
void reverse(char* str)
{
size_t size = strlen(str) / 2;
char tmp;
for (int i = 0; i < size; ++i)
{
tmp = str[size - i - 1];
str[size - i - 1] = str[size + i];
str[size + i] = tmp;
}
}
And here's the asm output :
000000000000073a <reverse>:
73a: 55 push %rbp
73b: 48 89 e5 mov %rsp,%rbp
73e: 48 83 ec 20 sub $0x20,%rsp
742: 48 89 7d e8 mov %rdi,-0x18(%rbp)
746: 48 8b 45 e8 mov -0x18(%rbp),%rax
74a: 48 89 c7 mov %rax,%rdi
74d: e8 9e fe ff ff callq 5f0 <strlen@plt>
752: 48 d1 e8 shr %rax
755: 48 89 45 f8 mov %rax,-0x8(%rbp)
759: c7 45 f4 00 00 00 00 movl $0x0,-0xc(%rbp)
760: eb 72 jmp 7d4 <reverse+0x9a>
762: 8b 45 f4 mov -0xc(%rbp),%eax
765: 48 98 cltq
767: 48 8b 55 f8 mov -0x8(%rbp),%rdx
76b: 48 29 c2 sub %rax,%rdx
76e: 48 89 d0 mov %rdx,%rax
771: 48 8d 50 ff lea -0x1(%rax),%rdx
775: 48 8b 45 e8 mov -0x18(%rbp),%rax
779: 48 01 d0 add %rdx,%rax
77c: 0f b6 00 movzbl (%rax),%eax
77f: 88 45 f3 mov %al,-0xd(%rbp)
782: 8b 45 f4 mov -0xc(%rbp),%eax
785: 48 63 d0 movslq %eax,%rdx
788: 48 8b 45 f8 mov -0x8(%rbp),%rax
78c: 48 01 c2 add %rax,%rdx
78f: 48 8b 45 e8 mov -0x18(%rbp),%rax
793: 48 01 d0 add %rdx,%rax
796: 8b 55 f4 mov -0xc(%rbp),%edx
799: 48 63 d2 movslq %edx,%rdx
79c: 48 8b 4d f8 mov -0x8(%rbp),%rcx
7a0: 48 29 d1 sub %rdx,%rcx
7a3: 48 89 ca mov %rcx,%rdx
7a6: 48 8d 4a ff lea -0x1(%rdx),%rcx
7aa: 48 8b 55 e8 mov -0x18(%rbp),%rdx
7ae: 48 01 ca add %rcx,%rdx
7b1: 0f b6 00 movzbl (%rax),%eax
7b4: 88 02 mov %al,(%rdx)
7b6: 8b 45 f4 mov -0xc(%rbp),%eax
7b9: 48 63 d0 movslq %eax,%rdx
7bc: 48 8b 45 f8 mov -0x8(%rbp),%rax
7c0: 48 01 c2 add %rax,%rdx
7c3: 48 8b 45 e8 mov -0x18(%rbp),%rax
7c7: 48 01 c2 add %rax,%rdx
7ca: 0f b6 45 f3 movzbl -0xd(%rbp),%eax
7ce: 88 02 mov %al,(%rdx)
7d0: 83 45 f4 01 addl $0x1,-0xc(%rbp)
7d4: 8b 45 f4 mov -0xc(%rbp),%eax
7d7: 48 98 cltq
7d9: 48 39 45 f8 cmp %rax,-0x8(%rbp)
7dd: 77 83 ja 762 <reverse+0x28>
7df: 90 nop
7e0: c9 leaveq
7e1: c3 retq
And here's the other version:
void strrev2(unsigned char *str)
{
int i;
int j;
unsigned char a;
unsigned len = strlen((const char *)str);
for (i = 0, j = len - 1; i < j; i++, j--)
{
a = str[i];
str[i] = str[j];
str[j] = a;
}
}
And the asm:
00000000000007e2 <strrev2>:
7e2: 55 push %rbp
7e3: 48 89 e5 mov %rsp,%rbp
7e6: 48 83 ec 20 sub $0x20,%rsp
7ea: 48 89 7d e8 mov %rdi,-0x18(%rbp)
7ee: 48 8b 45 e8 mov -0x18(%rbp),%rax
7f2: 48 89 c7 mov %rax,%rdi
7f5: e8 f6 fd ff ff callq 5f0 <strlen@plt>
7fa: 89 45 fc mov %eax,-0x4(%rbp)
7fd: c7 45 f4 00 00 00 00 movl $0x0,-0xc(%rbp)
804: 8b 45 fc mov -0x4(%rbp),%eax
807: 83 e8 01 sub $0x1,%eax
80a: 89 45 f8 mov %eax,-0x8(%rbp)
80d: eb 4d jmp 85c <strrev2+0x7a>
80f: 8b 45 f4 mov -0xc(%rbp),%eax
812: 48 63 d0 movslq %eax,%rdx
815: 48 8b 45 e8 mov -0x18(%rbp),%rax
819: 48 01 d0 add %rdx,%rax
81c: 0f b6 00 movzbl (%rax),%eax
81f: 88 45 f3 mov %al,-0xd(%rbp)
822: 8b 45 f8 mov -0x8(%rbp),%eax
825: 48 63 d0 movslq %eax,%rdx
828: 48 8b 45 e8 mov -0x18(%rbp),%rax
82c: 48 01 d0 add %rdx,%rax
82f: 8b 55 f4 mov -0xc(%rbp),%edx
832: 48 63 ca movslq %edx,%rcx
835: 48 8b 55 e8 mov -0x18(%rbp),%rdx
839: 48 01 ca add %rcx,%rdx
83c: 0f b6 00 movzbl (%rax),%eax
83f: 88 02 mov %al,(%rdx)
841: 8b 45 f8 mov -0x8(%rbp),%eax
844: 48 63 d0 movslq %eax,%rdx
847: 48 8b 45 e8 mov -0x18(%rbp),%rax
84b: 48 01 c2 add %rax,%rdx
84e: 0f b6 45 f3 movzbl -0xd(%rbp),%eax
852: 88 02 mov %al,(%rdx)
854: 83 45 f4 01 addl $0x1,-0xc(%rbp)
858: 83 6d f8 01 subl $0x1,-0x8(%rbp)
85c: 8b 45 f4 mov -0xc(%rbp),%eax
85f: 3b 45 f8 cmp -0x8(%rbp),%eax
862: 7c ab jl 80f <strrev2+0x2d>
864: 90 nop
865: c9 leaveq
866: c3 retq
Why is the second version faster (I assume it is, because there are less instructions) and why does objdump
produce more assembly instructions for my code?
My code uses less memory, but I thought it would also be faster, because I only increment one variable (i
) and I don't cast when using strlen()
.
That piece here: size - i - 1
That is ruining the performance for you, as that calculation is actually being performed every single loop iteration.
Your assumption about using "less memory" is wrong. These variables didn't even end up in memory, in neither of the algorithms, but were kept purely within registers. So there was no memory access to eliminate in the first place, the only thing your optimization achieved was to introduce additional arithmetic which is now slowing down the loop.
The most complex form of addressing x86 arch can handle in a single instruction is variable[variable + constant]
. Any more complex than that, and the pointer arithmetic has to be performed with multiple instructions instead.
Also, the compiler unrolled the code, correctly estimating the effects of up to 3 iterations in a row. For the code with i
and j
that means incrementing only once every 3 iterations, and using constant offsets in between. For your code, it meant redoing the address calculation over and over again.