Thanks for all the comments so far. I am sorry that I used a bad example in my original question, one where almost everyone would say: "Oh, you should use memcpy!" But that is not what my question is about.
My question is more generic, about how manual loop unrolling should be done. Consider this example instead, which sums all the elements of an array:
#include <stdlib.h>

double sum (size_t n, double *x) {
    size_t nr = n & 1;           // leftover element if `n` is odd
    double *end = x + (n - nr);  // main loop stops at the even part
    double sum_x = 0.0;
    for (; x < end; x++) sum_x += *x;
    if (nr) sum_x += *x;         // handle the leftover element last
    return sum_x;
}
The compiler-generated assembly exhibits a similar behaviour (to what is shown by the array-copying example in my original question):
sum:
        movq    %rdi, %rcx
        andl    $1, %ecx
        subq    %rcx, %rdi
        leaq    (%rsi,%rdi,8), %rdx
        cmpq    %rdx, %rsi
        jnb     .L5
        movq    %rsi, %rax
        pxor    %xmm0, %xmm0
.L3:
        addsd   (%rax), %xmm0
        addq    $8, %rax
        cmpq    %rax, %rdx
        ja      .L3
        movq    %rsi, %rax
        notq    %rax
        addq    %rax, %rdx
        shrq    $3, %rdx
        leaq    8(%rsi,%rdx,8), %rsi
.L2:
        testq   %rcx, %rcx
        je      .L1
        addsd   (%rsi), %xmm0
.L1:
        ret
.L5:
        pxor    %xmm0, %xmm0
        jmp     .L2
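The "similar behaviour" is the block after `ja .L3`: when the loop exits, %rax already equals `end`, yet the compiler spends five instructions recomputing that pointer from scratch into %rsi. Tracing them by hand (my own working, valid when the loop has run, i.e. n - nr >= 1):

    rax = ~x                       = -x - 1
    rdx = end + (-x - 1)           = 8 * (n - nr) - 1
    rdx = (8 * (n - nr) - 1) >> 3  = (n - nr) - 1
    rsi = x + 8 * (rdx + 1)        = x + 8 * (n - nr)   // i.e. rsi = end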
However, if I now schedule the "fractional" part ahead of the main loop (as I later worked out in an answer I posted), the compiler does a much better job.
#include <stdlib.h>

double sum (size_t n, double *x) {
    size_t nr = n & 1;
    double *end = x + n;
    double sum_x = 0.0;
    if (nr) sum_x += *x;         // handle the leftover element first
    for (x += nr; x < end; x++) sum_x += *x;
    return sum_x;
}
sum:
        leaq    (%rsi,%rdi,8), %rdx
        pxor    %xmm0, %xmm0
        andl    $1, %edi
        je      .L2
        addsd   (%rsi), %xmm0
.L2:
        leaq    (%rsi,%rdi,8), %rax
        cmpq    %rax, %rdx
        jbe     .L1
.L4:
        addsd   (%rax), %xmm0
        addq    $8, %rax
        cmpq    %rax, %rdx
        ja      .L4
.L1:
        ret
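As an aside, the same "remainder first" arrangement seems to extend to deeper unroll depths. Below is a depth-4 sketch of mine (not from the original question, and untested against the compiler; note the fall-through switch peel also changes the order of the floating-point additions):

#include <stdlib.h>

double sum4 (size_t n, double *x) {
    size_t nr = n & 3;              /* 0 to 3 leftover elements */
    double *end = x + n;
    double sum_x = 0.0;
    switch (nr) {                   /* peel the remainder up front */
    case 3: sum_x += x[2];          /* fall through */
    case 2: sum_x += x[1];          /* fall through */
    case 1: sum_x += x[0];
    }
    for (x += nr; x < end; x += 4)  /* main loop runs over a multiple of 4 */
        sum_x += x[0] + x[1] + x[2] + x[3];
    return sum_x;
}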
I have only used the compiler flag -O2. So, as Peter said, the compiler-generated assembly should be close to the C source code. The question, then, is: why does the compiler do better in the latter case?
This is not really a performance-related question. It is just something I stumbled on (and can't explain) when checking the compiler's assembly output for C code from a project I have been writing. Thanks again, and thanks to Peter for proposing a better title for the question.
Original question:
The following small C function copies a, a vector of n entries, to b. Manual loop unrolling of depth 2 is applied.
#include <stddef.h>

void foo (ptrdiff_t n, double *a, double *b) {
    ptrdiff_t i = 0;
    ptrdiff_t nr = n & 1;
    n -= nr;                 // `n` is an even integer
    while (i < n) {
        b[i] = a[i];
        b[i + 1] = a[i + 1];
        i += 2;
    }                        // `i = n` when the loop ends
    if (nr) b[i] = a[i];
}
It gives the following x64 assembly under gcc -O2 (any gcc version 5.4+). However, I find the commented part of the output weird. Why does the compiler generate it at all?
foo:
        movq    %rdi, %rcx
        xorl    %eax, %eax
        andl    $1, %ecx
        subq    %rcx, %rdi
        testq   %rdi, %rdi
        jle     .L11
.L12:
        movsd   (%rsi,%rax,8), %xmm0
        movsd   %xmm0, (%rdx,%rax,8)
        movsd   8(%rsi,%rax,8), %xmm0
        movsd   %xmm0, 8(%rdx,%rax,8)
        addq    $2, %rax
        cmpq    %rax, %rdi          // `i` in %rax, `n` in %rdi
        jg      .L12                // the loop ends, with `i = n`, BELOW IS WEIRD
        subq    $1, %rdi            // n = n - 1;
        shrq    %rdi                // n = n / 2;
        leaq    2(%rdi,%rdi), %rax  // i = 2 * n + 2; (this is just `i = n`, isn't it?)
.L11:
        testq   %rcx, %rcx
        je      .L10
        movsd   (%rsi,%rax,8), %xmm0
        movsd   %xmm0, (%rdx,%rax,8)
.L10:
        ret
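To spell out the commented arithmetic (my own working): the three "weird" instructions are only reached when the loop has run, so at that point `n` is even and at least 2, and

    i = 2 * ((n - 1) >> 1) + 2
      = 2 * ((n - 2) / 2) + 2    // (n - 1) >> 1 == (n - 2) / 2 for even n
      = (n - 2) + 2
      = n

which is exactly the value `i` already has in %rax when the loop exits.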
A similar version using size_t instead of ptrdiff_t gives something similar:
#include <stdlib.h>

void bar (size_t n, double *a, double *b) {
    size_t i = 0;
    size_t nr = n & 1;
    n -= nr;                 // `n` is an even integer
    while (i < n) {
        b[i] = a[i];
        b[i + 1] = a[i + 1];
        i += 2;
    }                        // `i = n` when the loop ends
    if (nr) b[i] = a[i];
}
bar:
        movq    %rdi, %rcx
        andl    $1, %ecx
        subq    %rcx, %rdi
        je      .L20
        xorl    %eax, %eax
.L21:
        movsd   (%rsi,%rax,8), %xmm0
        movsd   %xmm0, (%rdx,%rax,8)
        movsd   8(%rsi,%rax,8), %xmm0
        movsd   %xmm0, 8(%rdx,%rax,8)
        addq    $2, %rax
        cmpq    %rax, %rdi          // `i` in %rax, `n` in %rdi
        ja      .L21                // the loop ends, with `i = n`, BUT BELOW IS WEIRD
        subq    $1, %rdi            // n = n - 1;
        andq    $-2, %rdi           // n = n & (-2);
        addq    $2, %rdi            // n = n + 2; (this is just `i = n`, isn't it?)
.L20:
        testq   %rcx, %rcx
        je      .L19
        movsd   (%rsi,%rdi,8), %xmm0
        movsd   %xmm0, (%rdx,%rdi,8)
.L19:
        ret
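The same hand check works for bar's tail (again my own working, with even n >= 2):

    i = ((n - 1) & -2) + 2
      = (n - 2) + 2              // n - 1 is odd, so `& -2` clears its low bit
      = n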
And here is another equivalent version,
#include <stdlib.h>

void baz (size_t n, double *a, double *b) {
    size_t nr = n & 1;
    n -= nr;
    double *b_end = b + n;
    while (b < b_end) {
        b[0] = a[0];
        b[1] = a[1];
        a += 2;
        b += 2;
    }                        // `b = b_end` when the loop ends
    if (nr) b[0] = a[0];
}
but the following assembly looks even more odd (though produced under -O2). Now n, a and b are all copied, and when the loop ends, it takes 5 instructions just to end up with b_copy = 0?!
baz:                                // initially, `n` in %rdi, `a` in %rsi, `b` in %rdx
        movq    %rdi, %r8           // n_copy = n;
        andl    $1, %r8d            // nr = n_copy & 1;
        subq    %r8, %rdi           // n_copy -= nr;
        leaq    (%rdx,%rdi,8), %rdi // b_end = b + n;
        cmpq    %rdi, %rdx          // if (b >= b_end) jump to .L31
        jnb     .L31
        movq    %rdx, %rax          // b_copy = b;
        movq    %rsi, %rcx          // a_copy = a;
.L32:
        movsd   (%rcx), %xmm0
        addq    $16, %rax
        addq    $16, %rcx
        movsd   %xmm0, -16(%rax)
        movsd   -8(%rcx), %xmm0
        movsd   %xmm0, -8(%rax)
        cmpq    %rax, %rdi          // `b_copy` in %rax, `b_end` in %rdi
        ja      .L32                // the loop ends, with `b_copy = b_end`
        movq    %rdx, %rax          // b_copy = b;
        notq    %rax                // b_copy = ~b_copy;
        addq    %rax, %rdi          // b_end = b_end + b_copy;
        andq    $-16, %rdi          // b_end = b_end & (-16);
        leaq    16(%rdi), %rax      // b_copy = b_end + 16;
        addq    %rax, %rsi          // a += b_copy; (isn't `b_copy` just 0?)
        addq    %rax, %rdx          // b += b_copy;
.L31:
        testq   %r8, %r8            // if (nr == 0) jump to .L30
        je      .L30
        movsd   (%rsi), %xmm0       // xmm0 = a[0];
        movsd   %xmm0, (%rdx)       // b[0] = xmm0;
.L30:
        ret
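If I trace the commented tail by hand (my own working; it is only reached after the loop has run, so `n` here is even and `b_end - b` equals `8 * n` bytes), the value that ends up in %rax is

    rax = ~b                 = -b - 1
    rdi = b_end + (-b - 1)   = 8 * n - 1
    rdi = (8 * n - 1) & -16  = 8 * n - 16   // 8 * n is a multiple of 16
    rax = (8 * n - 16) + 16  = 8 * n

i.e. the byte size of the block the loop just copied, which is then added to the original `a` and `b`. So the oddity is that the compiler recomputes this offset from scratch instead of reusing the pointers it already has at the loop exit.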
Can anyone explain what the compiler has in mind in all three cases?
It looks like if I unroll the loop in the following manner, the compiler can generate neater code.
#include <stdlib.h>
#include <stddef.h>

void foo (ptrdiff_t n, double *a, double *b) {
    ptrdiff_t i = n & 1;
    if (i) b[0] = a[0];
    while (i < n) {
        b[i] = a[i];
        b[i + 1] = a[i + 1];
        i += 2;
    }
}

void bar (size_t n, double *a, double *b) {
    size_t i = n & 1;
    if (i) b[0] = a[0];
    while (i < n) {
        b[i] = a[i];
        b[i + 1] = a[i + 1];
        i += 2;
    }
}

void baz (size_t n, double *a, double *b) {
    size_t nr = n & 1;
    double *b_end = b + n;
    if (nr) b[0] = a[0];
    b += nr;
    while (b < b_end) {
        b[0] = a[0];
        b[1] = a[1];
        a += 2;
        b += 2;
    }
}
foo:
        movq    %rdi, %rax
        andl    $1, %eax
        je      .L9
        movsd   (%rsi), %xmm0
        movsd   %xmm0, (%rdx)
        cmpq    %rax, %rdi
        jle     .L11
.L4:
        movsd   (%rsi,%rax,8), %xmm0
        movsd   %xmm0, (%rdx,%rax,8)
        movsd   8(%rsi,%rax,8), %xmm0
        movsd   %xmm0, 8(%rdx,%rax,8)
        addq    $2, %rax
.L9:
        cmpq    %rax, %rdi
        jg      .L4
.L11:
        ret
bar:
        movq    %rdi, %rax
        andl    $1, %eax
        je      .L20
        movsd   (%rsi), %xmm0
        movsd   %xmm0, (%rdx)
        cmpq    %rax, %rdi
        jbe     .L21
.L15:
        movsd   (%rsi,%rax,8), %xmm0
        movsd   %xmm0, (%rdx,%rax,8)
        movsd   8(%rsi,%rax,8), %xmm0
        movsd   %xmm0, 8(%rdx,%rax,8)
        addq    $2, %rax
.L20:
        cmpq    %rax, %rdi
        ja      .L15
.L21:
        ret
baz:
        leaq    (%rdx,%rdi,8), %rcx
        andl    $1, %edi
        je      .L23
        movsd   (%rsi), %xmm0
        movsd   %xmm0, (%rdx)
.L23:
        leaq    (%rdx,%rdi,8), %rax
        cmpq    %rax, %rcx
        jbe     .L22
.L25:
        movsd   (%rsi), %xmm0
        addq    $16, %rax
        addq    $16, %rsi
        movsd   %xmm0, -16(%rax)
        movsd   -8(%rsi), %xmm0
        movsd   %xmm0, -8(%rax)
        cmpq    %rax, %rcx
        ja      .L25
.L22:
        ret
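For completeness, here is the kind of minimal harness I use to sanity-check that such rewrites still behave like a plain element-by-element copy (the main function and the test sizes are my own, not part of the question):

#include <stdio.h>
#include <stdlib.h>

void bar (size_t n, double *a, double *b);  /* the size_t version above */

int main (void) {
    double a[8], b[8];
    for (size_t i = 0; i < 8; i++) a[i] = (double) (i + 1);
    for (size_t n = 0; n <= 8; n++) {       /* both odd and even lengths */
        for (size_t i = 0; i < 8; i++) b[i] = 0.0;
        bar (n, a, b);
        for (size_t i = 0; i < n; i++)
            if (b[i] != a[i])
                printf ("mismatch: n = %zu, i = %zu\n", n, i);
    }
    puts ("done");
    return 0;
}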