This question is about how we multiply an integer with a constant. So let's look at a simple function:
int f(int x) {
return 10*x;
}
How can that function be optimized best, especially when inlined into a caller?
Approach 1 (produced by most optimizing compilers (e.g. on Godbolt))
lea (%rdi,%rdi,4), %eax
add %eax, %eax
Approach 2 (produced with clang3.6 and earlier, with -O3)
imul $10, %edi, %eax
Approach 3 (produced with g++6.2 without optimization, removing stores/reloads)
mov %edi, %eax
sal $2, %eax
add %edi, %eax
add %eax, %eax
Which version is fastest, and why? Primarily interested in Intel Haswell.
I just did some measurements. We mimic the following code in assembly, using the instructions given in the question:
for (unsigned i = 0; i < (1 << 30); ++i) {
r1 = r2 * 10;
r2 = r1 * 10;
}
Possibly there are some additional registers needed for temporaries.
Note: All measurements are in cycles per one multiplication.
We use clang compiler with -O3, because there we exactly get the assembly we want (gcc sometimes adds very few MOVs inside the loop). We have two parameters: u = #unrolled loops, and i = #ilp. For example for u=4, i=2, we get the following:
401d5b: 0f a2 cpuid
401d5d: 0f 31 rdtsc
401d5f: 89 d6 mov %edx,%esi
401d61: 89 c7 mov %eax,%edi
401d63: 41 89 f0 mov %esi,%r8d
401d66: 89 f8 mov %edi,%eax
401d68: b9 00 00 20 00 mov $0x200000,%ecx
401d6d: 0f 1f 00 nopl (%rax)
401d70: 6b d5 0a imul $0xa,%ebp,%edx
401d73: 41 6b f7 0a imul $0xa,%r15d,%esi
401d77: 6b fa 0a imul $0xa,%edx,%edi
401d7a: 6b d6 0a imul $0xa,%esi,%edx
401d7d: 6b f7 0a imul $0xa,%edi,%esi
401d80: 6b fa 0a imul $0xa,%edx,%edi
401d83: 6b d6 0a imul $0xa,%esi,%edx
401d86: 6b f7 0a imul $0xa,%edi,%esi
401d89: 6b fa 0a imul $0xa,%edx,%edi
401d8c: 6b d6 0a imul $0xa,%esi,%edx
401d8f: 6b f7 0a imul $0xa,%edi,%esi
401d92: 6b fa 0a imul $0xa,%edx,%edi
401d95: 44 6b e6 0a imul $0xa,%esi,%r12d
401d99: 44 6b ef 0a imul $0xa,%edi,%r13d
401d9d: 41 6b ec 0a imul $0xa,%r12d,%ebp
401da1: 45 6b fd 0a imul $0xa,%r13d,%r15d
401da5: ff c9 dec %ecx
401da7: 75 c7 jne 401d70 <_Z7measureIN5timer5rtdscE2V1Li16777216ELi4ELi2EEvv+0x130>
401da9: 49 c1 e0 20 shl $0x20,%r8
401dad: 49 09 c0 or %rax,%r8
401db0: 0f 01 f9 rdtscp
Note that these are not 8, but 16 imul instructions, because this is r2 = r1 * 10; r1 = r2 * 10; I will only post the main loop, i.e.,
401d70: 6b d5 0a imul $0xa,%ebp,%edx
401d73: 41 6b f7 0a imul $0xa,%r15d,%esi
401d77: 6b fa 0a imul $0xa,%edx,%edi
401d7a: 6b d6 0a imul $0xa,%esi,%edx
401d7d: 6b f7 0a imul $0xa,%edi,%esi
401d80: 6b fa 0a imul $0xa,%edx,%edi
401d83: 6b d6 0a imul $0xa,%esi,%edx
401d86: 6b f7 0a imul $0xa,%edi,%esi
401d89: 6b fa 0a imul $0xa,%edx,%edi
401d8c: 6b d6 0a imul $0xa,%esi,%edx
401d8f: 6b f7 0a imul $0xa,%edi,%esi
401d92: 6b fa 0a imul $0xa,%edx,%edi
401d95: 44 6b e6 0a imul $0xa,%esi,%r12d
401d99: 44 6b ef 0a imul $0xa,%edi,%r13d
401d9d: 41 6b ec 0a imul $0xa,%r12d,%ebp
401da1: 45 6b fd 0a imul $0xa,%r13d,%r15d
401da5: ff c9 dec %ecx
401da7: 75 c7 jne 401d70 <_Z7measureIN5timer5rtdscE2V1Li16777216ELi4ELi2EEvv+0x130>
Instead of rtdsc we use perf (PERF_COUNT_HW_REF_CPU_CYCLES = "ref cycles" and PERF_COUNT_HW_CPU_CYCLES = "cpu cycles").
We fix u = 16, and vary i = {1, 2, 4, 8, 16, 32}. We get
name uroll ilp ref cyclescpu cyclesp0 p1 p2 p3 p4 p5 p6 p7
V1 16 1 2.723 3.006 0.000 1.000 0.000 0.000 0.000 0.000 0.031 0.000
V1 16 2 1.349 1.502 0.000 1.000 0.000 0.000 0.000 0.000 0.016 0.000
V1 16 4 0.902 1.002 0.000 1.000 0.000 0.000 0.000 0.000 0.008 0.000
V1 16 8 0.899 1.001 0.000 1.000 0.004 0.006 0.008 0.002 0.006 0.002
V1 16 16 0.898 1.001 0.000 1.000 0.193 0.218 0.279 0.001 0.003 0.134
V1 16 32 0.926 1.008 0.000 1.004 0.453 0.490 0.642 0.001 0.002 0.322
This makes sense. The ref cycles can be ignored.
The columns on the right show the number of microops on the execution ports. We have one instruction on p1 (the imul, of course) and on p6 we have the decrement (1/16 in the first case). Later we can also see that we get other microops due to strong register pressure.
Ok, let's move to the second version, which is now:
402270: 89 ea mov %ebp,%edx
402272: c1 e2 02 shl $0x2,%edx
402275: 01 ea add %ebp,%edx
402277: 01 d2 add %edx,%edx
402279: 44 89 fe mov %r15d,%esi
40227c: c1 e6 02 shl $0x2,%esi
40227f: 44 01 fe add %r15d,%esi
402282: 01 f6 add %esi,%esi
402284: 89 d7 mov %edx,%edi
402286: c1 e7 02 shl $0x2,%edi
402289: 01 d7 add %edx,%edi
40228b: 01 ff add %edi,%edi
40228d: 89 f2 mov %esi,%edx
40228f: c1 e2 02 shl $0x2,%edx
402292: 01 f2 add %esi,%edx
402294: 01 d2 add %edx,%edx
402296: 89 fe mov %edi,%esi
402298: c1 e6 02 shl $0x2,%esi
40229b: 01 fe add %edi,%esi
40229d: 01 f6 add %esi,%esi
40229f: 89 d7 mov %edx,%edi
4022a1: c1 e7 02 shl $0x2,%edi
4022a4: 01 d7 add %edx,%edi
4022a6: 01 ff add %edi,%edi
4022a8: 89 f2 mov %esi,%edx
4022aa: c1 e2 02 shl $0x2,%edx
4022ad: 01 f2 add %esi,%edx
4022af: 01 d2 add %edx,%edx
4022b1: 89 fe mov %edi,%esi
4022b3: c1 e6 02 shl $0x2,%esi
4022b6: 01 fe add %edi,%esi
4022b8: 01 f6 add %esi,%esi
4022ba: 89 d7 mov %edx,%edi
4022bc: c1 e7 02 shl $0x2,%edi
4022bf: 01 d7 add %edx,%edi
4022c1: 01 ff add %edi,%edi
4022c3: 89 f2 mov %esi,%edx
4022c5: c1 e2 02 shl $0x2,%edx
4022c8: 01 f2 add %esi,%edx
4022ca: 01 d2 add %edx,%edx
4022cc: 89 fe mov %edi,%esi
4022ce: c1 e6 02 shl $0x2,%esi
4022d1: 01 fe add %edi,%esi
4022d3: 01 f6 add %esi,%esi
4022d5: 89 d7 mov %edx,%edi
4022d7: c1 e7 02 shl $0x2,%edi
4022da: 01 d7 add %edx,%edi
4022dc: 01 ff add %edi,%edi
4022de: 89 f2 mov %esi,%edx
4022e0: c1 e2 02 shl $0x2,%edx
4022e3: 01 f2 add %esi,%edx
4022e5: 01 d2 add %edx,%edx
4022e7: 89 fe mov %edi,%esi
4022e9: c1 e6 02 shl $0x2,%esi
4022ec: 01 fe add %edi,%esi
4022ee: 01 f6 add %esi,%esi
4022f0: 89 d5 mov %edx,%ebp
4022f2: c1 e5 02 shl $0x2,%ebp
4022f5: 01 d5 add %edx,%ebp
4022f7: 01 ed add %ebp,%ebp
4022f9: 41 89 f7 mov %esi,%r15d
4022fc: 41 c1 e7 02 shl $0x2,%r15d
402300: 41 01 f7 add %esi,%r15d
402303: 45 01 ff add %r15d,%r15d
402306: ff c8 dec %eax
402308: 0f 85 62 ff ff ff jne 402270 <_Z7measureIN5timer5rtdscE2V2Li16777216ELi4ELi2EEvv+0xe0>
Measurements for V2
name uroll ilp ref cyclescpu cyclesp0 p1 p2 p3 p4 p5 p6 p7
V2 16 1 2.696 3.004 0.776 0.714 0.000 0.000 0.000 0.731 0.811 0.000
V2 16 2 1.454 1.620 0.791 0.706 0.000 0.000 0.000 0.719 0.800 0.000
V2 16 4 0.918 1.022 0.836 0.679 0.000 0.000 0.000 0.696 0.795 0.000
V2 16 8 0.914 1.018 0.864 0.647 0.006 0.002 0.012 0.671 0.826 0.008
V2 16 16 1.277 1.414 0.834 0.652 0.237 0.263 0.334 0.685 0.887 0.161
V2 16 32 1.572 1.751 0.962 0.703 0.532 0.560 0.671 0.740 1.003 0.230
This also makes sense, we are slower, and with i=32, we for sure have too large register pressure (shown by the other ports used and in the assembly)... With i=0, we can verify, that p0+p1+p5+p6=~3.001, so no ILP of course. We could expect 4 cpu cycles, but the MOV is for free (register allocation).
With i=4: SHL is executed on p0 or p6, the ADD and MOV are both executed on p0, 1, 5, or 6. So we have 1 op on p0 or p6, and 2+1 ops (add/mov) on p0, p1, p5, or p6. Again, the MOV is for free, so we get 1 cycle per multiplication.
And finally with the optimized version:
402730: 67 8d 7c ad 00 lea 0x0(%ebp,%ebp,4),%edi
402735: 01 ff add %edi,%edi
402737: 67 43 8d 2c bf lea (%r15d,%r15d,4),%ebp
40273c: 01 ed add %ebp,%ebp
40273e: 67 8d 1c bf lea (%edi,%edi,4),%ebx
402742: 01 db add %ebx,%ebx
402744: 67 8d 7c ad 00 lea 0x0(%ebp,%ebp,4),%edi
402749: 01 ff add %edi,%edi
40274b: 67 8d 2c 9b lea (%ebx,%ebx,4),%ebp
40274f: 01 ed add %ebp,%ebp
402751: 67 8d 1c bf lea (%edi,%edi,4),%ebx
402755: 01 db add %ebx,%ebx
402757: 67 8d 7c ad 00 lea 0x0(%ebp,%ebp,4),%edi
40275c: 01 ff add %edi,%edi
40275e: 67 8d 2c 9b lea (%ebx,%ebx,4),%ebp
402762: 01 ed add %ebp,%ebp
402764: 67 8d 1c bf lea (%edi,%edi,4),%ebx
402768: 01 db add %ebx,%ebx
40276a: 67 8d 7c ad 00 lea 0x0(%ebp,%ebp,4),%edi
40276f: 01 ff add %edi,%edi
402771: 67 8d 2c 9b lea (%ebx,%ebx,4),%ebp
402775: 01 ed add %ebp,%ebp
402777: 67 8d 1c bf lea (%edi,%edi,4),%ebx
40277b: 01 db add %ebx,%ebx
40277d: 67 44 8d 64 ad 00 lea 0x0(%ebp,%ebp,4),%r12d
402783: 45 01 e4 add %r12d,%r12d
402786: 67 44 8d 2c 9b lea (%ebx,%ebx,4),%r13d
40278b: 45 01 ed add %r13d,%r13d
40278e: 67 43 8d 2c a4 lea (%r12d,%r12d,4),%ebp
402793: 01 ed add %ebp,%ebp
402795: 67 47 8d 7c ad 00 lea 0x0(%r13d,%r13d,4),%r15d
40279b: 45 01 ff add %r15d,%r15d
40279e: ff c9 dec %ecx
4027a0: 75 8e jne 402730 <_Z7measureIN5timer5rtdscE2V3Li16777216ELi4ELi2EEvv+0x170>
Measurements for V3
name uroll ilp ref cyclescpu cyclesp0 p1 p2 p3 p4 p5 p6 p7
V3 16 1 1.797 2.002 0.447 0.558 0.000 0.000 0.000 0.557 0.469 0.000
V3 16 2 1.273 1.418 0.448 0.587 0.000 0.000 0.000 0.528 0.453 0.000
V3 16 4 0.774 0.835 0.449 0.569 0.000 0.000 0.000 0.548 0.442 0.000
V3 16 8 0.572 0.636 0.440 0.555 0.017 0.021 0.032 0.562 0.456 0.012
V3 16 16 0.753 0.838 0.433 0.630 0.305 0.324 0.399 0.644 0.458 0.165
V3 16 32 0.976 1.087 0.467 0.766 0.514 0.536 0.701 0.737 0.458 0.333
Okay, now we are faster than the imul. 2 cycles for i=0 (1 for both instructions), and for i=8, we are even faster:. The lea can be executed on p1 and p5, the add, as above, on p0, p1, p5, or p6. So if perfectly scheduled, the LEA goes to p1 and p5, the ADD to p0, or p6. Unfortunately, in this case it isn't that perfect (the assembly is fine). I suppose that scheduling is not perfect, and a few ADD land on the p1/p5 ports.
All code can be seen on gcc.godbolt.org (it has quite some template magic, but boils down to extremely simple code, which has been checked many times). Note that I removed the timers and some other stuff, which is not necessary for checking the assembly.