In a fairly large and complex C program where running time is the first priority, I have to decide how I should write code pieces like this:
for (int i=0; i < md->global_grid[ic[0]][ic[1]][ic[2]].parts_num; i++)
{
if (md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.GroupID == RESERVED_GROUP)
{
md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.GroupID = GroupID;
fmd_real_t mass = md->potsys.atomkinds[md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.atomkind].mass;
for (int d=0; d<3; d++)
md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.v[d] -= MomentumSum[d] / (AtomsNum * mass);
}
}
This can be made more readable and compact by using pointers like pc
below:
for (int i=0; i < md->global_grid[ic[0]][ic[1]][ic[2]].parts_num; i++)
{
if (md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.GroupID == RESERVED_GROUP)
{
particle_core_t *pc = &md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core;
pc->GroupID = GroupID;
fmd_real_t mass = md->potsys.atomkinds[pc->atomkind].mass;
for (int d=0; d<3; d++)
pc->v[d] -= MomentumSum[d] / (AtomsNum * mass);
}
}
But doesn't dereferencing pc
take some CPU time? I usually use the first form and sometimes the second form but don't know which is really better. I use -O3
of gcc for optimization.
I know that measuring running times and comparing them may provide an answer, but knowning what experienced and professional programmers think is always very helpful. In particular, merely comparing times doesn't tell why one form is faster.
Take a look at the assembly in jtbandes' godbolt example. Here is gcc's x86-64 assembly for the readable version of the inner loop:
particle_core_t *pc = &md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core;
pc->GroupID = GroupID;
fmd_real_t mass = md->potsys.atomkinds[pc->atomkind].mass;
for (int d=0; d<3; d++)
pc->v[d] -= MomentumSum[d] / (AtomsNum * mass);
gcc is clever enough to see that the loop over d
makes 3 iterations, so it unrolls it. It also sees that the lhs in each iteration is in the same array, so it efficiently stores the array address in rcx
, rather than repeatedly dereferencing pc->v
.
mov rcx, QWORD PTR [rax+8] ; rcx = pc->v.
mov DWORD PTR [rax], ebp
pxor xmm0, xmm0
add rdx, 1
movsx rax, DWORD PTR [rax+4]
mov rsi, QWORD PTR [r11+8]
cvtsi2sd xmm0, r8d
movsd xmm2, QWORD PTR [rbx] ; Load xmm2 = MomentumSum[0].
movsd xmm1, QWORD PTR [rcx] ; Load xmm1 = pc->v[0].
lea rax, [rax+rax*2]
lea rax, [rsi+rax*8]
mulsd xmm0, QWORD PTR [rax+16] ; Compute xmm0 = AtomsNum * mass.
movsx rax, DWORD PTR [r10]
mov rax, QWORD PTR [r12+rax*8]
divsd xmm2, xmm0 ; xmm2 /= xmm0
subsd xmm1, xmm2 ; xmm1 -= xmm2
movsd QWORD PTR [rcx], xmm1 ; Store pc->v[0] = xmm1.
movsd xmm2, QWORD PTR [rbx+8]
movsd xmm1, QWORD PTR [rcx+8]
divsd xmm2, xmm0
subsd xmm1, xmm2
movsd QWORD PTR [rcx+8], xmm1 ; Store pc->v[1] = xmm1.
movsd xmm1, QWORD PTR [rbx+16]
divsd xmm1, xmm0
movsd xmm0, QWORD PTR [rcx+16]
subsd xmm0, xmm1
movsd QWORD PTR [rcx+16], xmm0 ; Store pc->v[2] = xmm1.
movsx rcx, DWORD PTR [r10+4]
mov r9, QWORD PTR [rax+rcx*8]
movsx rcx, DWORD PTR [r10+8]
lea rax, [rcx+rcx*2]
lea rsi, [r9+rax*8]
mov edi, DWORD PTR [rsi]