Increasing code readability without a decrease in performance (for this snippet in C)

In a fairly large and complex C program where running time is the first priority, I have to decide how I should write code pieces like this:

for (int i=0; i < md->global_grid[ic[0]][ic[1]][ic[2]].parts_num; i++)
{
    if (md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.GroupID == RESERVED_GROUP)
    {
        md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.GroupID = GroupID;
        fmd_real_t mass = md->potsys.atomkinds[md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.atomkind].mass;
        for (int d=0; d<3; d++)
            md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.v[d] -= MomentumSum[d] / (AtomsNum * mass);
    }
}

This can be made more readable and compact by using pointers like pc below:

for (int i=0; i < md->global_grid[ic[0]][ic[1]][ic[2]].parts_num; i++)
{
    if (md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core.GroupID == RESERVED_GROUP)
    {
        particle_core_t *pc = &md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core;

        pc->GroupID = GroupID;
        fmd_real_t mass = md->potsys.atomkinds[pc->atomkind].mass;
        for (int d=0; d<3; d++)
            pc->v[d] -= MomentumSum[d] / (AtomsNum * mass);
    }
}

But doesn't dereferencing pc take some CPU time? I usually use the first form and sometimes the second form but don't know which is really better. I use -O3 of gcc for optimization.

I know that measuring running times and comparing them may provide an answer, but knowning what experienced and professional programmers think is always very helpful. In particular, merely comparing times doesn't tell why one form is faster.

Solution

Take a look at the assembly in jtbandes' godbolt example. Here is gcc's x86-64 assembly for the readable version of the inner loop:

particle_core_t *pc = &md->global_grid[ic[0]][ic[1]][ic[2]].parts[i].core;

pc->GroupID = GroupID;
fmd_real_t mass = md->potsys.atomkinds[pc->atomkind].mass;
for (int d=0; d<3; d++)
  pc->v[d] -= MomentumSum[d] / (AtomsNum * mass);

gcc is clever enough to see that the loop over d makes 3 iterations, so it unrolls it. It also sees that the lhs in each iteration is in the same array, so it efficiently stores the array address in rcx, rather than repeatedly dereferencing pc->v.

mov     rcx, QWORD PTR [rax+8]        ; rcx = pc->v.
mov     DWORD PTR [rax], ebp
pxor    xmm0, xmm0
add     rdx, 1
movsx   rax, DWORD PTR [rax+4]
mov     rsi, QWORD PTR [r11+8]
cvtsi2sd        xmm0, r8d
movsd   xmm2, QWORD PTR [rbx]         ; Load xmm2 = MomentumSum[0].
movsd   xmm1, QWORD PTR [rcx]         ; Load xmm1 = pc->v[0].
lea     rax, [rax+rax*2]
lea     rax, [rsi+rax*8]
mulsd   xmm0, QWORD PTR [rax+16]      ; Compute xmm0 = AtomsNum * mass.
movsx   rax, DWORD PTR [r10]
mov     rax, QWORD PTR [r12+rax*8]
divsd   xmm2, xmm0                    ; xmm2 /= xmm0
subsd   xmm1, xmm2                    ; xmm1 -= xmm2
movsd   QWORD PTR [rcx], xmm1         ; Store pc->v[0] = xmm1.
movsd   xmm2, QWORD PTR [rbx+8]
movsd   xmm1, QWORD PTR [rcx+8]
divsd   xmm2, xmm0
subsd   xmm1, xmm2
movsd   QWORD PTR [rcx+8], xmm1       ; Store pc->v[1] = xmm1.
movsd   xmm1, QWORD PTR [rbx+16]
divsd   xmm1, xmm0
movsd   xmm0, QWORD PTR [rcx+16]
subsd   xmm0, xmm1
movsd   QWORD PTR [rcx+16], xmm0      ; Store pc->v[2] = xmm1.
movsx   rcx, DWORD PTR [r10+4]
mov     r9, QWORD PTR [rax+rcx*8]
movsx   rcx, DWORD PTR [r10+8]
lea     rax, [rcx+rcx*2]
lea     rsi, [r9+rax*8]
mov     edi, DWORD PTR [rsi]