I'm trying to solidify my understanding of data contention and have come up with the following minimal test program. It starts a thread that does some data crunching, and the main thread spin-waits on an atomic bool until that work is done.
#include <iostream>
#include <atomic>
#include <thread>
#include <array>

class SideProcessor
{
public:
    SideProcessor()
        : processingRequested(false)
        , dataArr{}
    {
    }

    void start() {
        processingRequested = true;
    }

    bool isStarted() {
        return processingRequested;
    }

    bool isDone() {
        return !processingRequested;
    }

    void finish() {
        processingRequested = false;
    }

    void run(std::atomic<bool>* exitRequested) {
        while(!(*exitRequested)) {
            // Spin on the flag.
            while(!(isStarted()) && !(*exitRequested)) {
            }
            if (*exitRequested) {
                // If we were asked to spin down, break out of the loop.
                break;
            }
            // Process.
            processData();
            // Flag that we're done.
            finish();
        }
    }

private:
    std::atomic<bool> processingRequested;
#ifdef PAD_ALIGNMENT
    std::array<bool, 64> padding;
#endif
    std::array<int, 100> dataArr;

    void processData() {
        // Add 1 to every element a bunch of times.
        std::cout << "Processing data." << std::endl;
        for (unsigned int i = 0; i < 10000000; ++i) {
            for (unsigned int j = 0; j < 100; ++j) {
                dataArr[j] += 1;
            }
        }
        std::cout << "Done processing." << std::endl;
    }
};

int main() {
    std::atomic<bool> exitRequested(false);
    SideProcessor sideProcessor;
    std::thread processThreadObj(&SideProcessor::run, &sideProcessor, &exitRequested);

    // Spin-wait while the thread is doing work.
    std::cout << "Starting processing." << std::endl;
    sideProcessor.start();
    while (!(sideProcessor.isDone())) {
    }

    // Spin the thread down.
    std::cout << "Waiting for thread to spin down." << std::endl;
    exitRequested = true;
    processThreadObj.join();

    std::cout << "Done." << std::endl;
    return 0;
}
When I build with -O3 and don't define PAD_ALIGNMENT, I get the following results from Linux perf:
142,511,066 cache-references (29.15%)
142,364 cache-misses # 0.100 % of all cache refs (39.33%)
33,580,965 L1-dcache-load-misses # 3.40% of all L1-dcache hits (39.90%)
988,605,337 L1-dcache-loads (40.46%)
279,446,259 L1-dcache-store (40.71%)
227,713 L1-icache-load-misses (40.71%)
10,040,733 LLC-loads (40.71%)
5,834 LLC-load-misses # 0.06% of all LL-cache hits (40.32%)
40,070,067 LLC-stores (19.39%)
94 LLC-store-misses (19.22%)
0.708704757 seconds time elapsed
When I build with PAD_ALIGNMENT, I get the following results:
450,713 cache-references (27.83%)
124,281 cache-misses # 27.574 % of all cache refs (39.29%)
104,857 L1-dcache-load-misses # 0.01% of all L1-dcache hits (42.16%)
714,361,767 L1-dcache-loads (45.02%)
281,140,925 L1-dcache-store (45.83%)
90,839 L1-icache-load-misses (43.52%)
11,225 LLC-loads (40.66%)
3,685 LLC-load-misses # 32.83% of all LL-cache hits (37.80%)
1,798 LLC-stores (17.18%)
76 LLC-store-misses (17.18%)
0.140602005 seconds time elapsed
I have 2 questions:
(Edit) Additional information as requested:
g++ version: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
CPU model: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
Kernel version: 4.15.0-106-generic
The assembly for processData()'s loop when compiling with g++ -std=c++14 -pthread -g -O3 -c -fverbose-asm -Wa,-adhln:
Without PAD_ALIGNMENT:
57:main.cpp **** void processData() {
58:main.cpp **** // Add 1 to every element a bunch of times.
59:main.cpp **** std::cout << "Processing data." << std::endl;
60:main.cpp **** for (unsigned int i = 0; i < 10000000; ++i) {
61:main.cpp **** for (unsigned int j = 0; j < 100; ++j) {
62:main.cpp **** dataArr[j] += 1;
409 .loc 4 62 0
410 0109 83450401 addl $1, 4(%rbp) #, MEM[(value_type &)this_8(D) + 4]
411 .LVL32:
412 010d 4183FD01 cmpl $1, %r13d #, prolog_loop_niters.140
413 0111 0F841901 je .L29 #,
413 0000
414 0117 83450801 addl $1, 8(%rbp) #, MEM[(value_type &)this_8(D) + 4]
415 .LVL33:
416 011b 4183FD02 cmpl $2, %r13d #, prolog_loop_niters.140
417 011f 0F842801 je .L30 #,
417 0000
418 0125 83450C01 addl $1, 12(%rbp) #, MEM[(value_type &)this_8(D) + 4]
419 .LVL34:
420 0129 BF610000 movl $97, %edi #, ivtmp_12
420 00
421 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
422 .loc 4 61 0
423 012e 41BA0300 movl $3, %r10d #, j
423 0000
424 .LVL35:
425 .L18:
426 0134 31C0 xorl %eax, %eax # ivtmp.156
427 0136 31D2 xorl %edx, %edx # ivtmp.153
428 0138 0F1F8400 .p2align 4,,10
428 00000000
429 .p2align 3
430 .L20:
431 0140 83C201 addl $1, %edx #, ivtmp.153
432 # main.cpp:62: dataArr[j] += 1;
433 .loc 4 62 0
434 0143 66410F6F movdqa (%r14,%rax), %xmm0 # MEM[base: vectp_this.148_118, index: ivtmp.156_48, offset: 0], vect__2
434 0406
435 0149 660FFEC1 paddd %xmm1, %xmm0 # tmp231, vect__25.150
436 014d 410F2904 movaps %xmm0, (%r14,%rax) # vect__25.150, MEM[base: vectp_this.148_118, index: ivtmp.156_48, offse
436 06
437 0152 4883C010 addq $16, %rax #, ivtmp.156
438 0156 4439FA cmpl %r15d, %edx # bnd.143, ivtmp.153
439 0159 72E5 jb .L20 #,
440 015b 4429E7 subl %r12d, %edi # niters_vector_mult_vf.144, ivtmp_12
441 015e 4439E3 cmpl %r12d, %ebx # niters_vector_mult_vf.144, niters.142
442 0161 438D0422 leal (%r10,%r12), %eax #, tmp.145
443 0165 89FA movl %edi, %edx # ivtmp_12, tmp.146
444 0167 7421 je .L21 #,
445 .LVL36:
446 0169 89C7 movl %eax, %edi # tmp.145, tmp.145
447 016b 8344BD04 addl $1, 4(%rbp,%rdi,4) #, MEM[(value_type &)_129 + 4]
447 01
448 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
449 .loc 4 61 0
450 0170 83FA01 cmpl $1, %edx #, tmp.146
451 0173 8D7801 leal 1(%rax), %edi #,
452 .LVL37:
453 0176 7412 je .L21 #,
454 # main.cpp:62: dataArr[j] += 1;
455 .loc 4 62 0
456 0178 8344BD04 addl $1, 4(%rbp,%rdi,4) #, MEM[(value_type &)_122 + 4]
456 01
457 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
458 .loc 4 61 0
459 017d 83C002 addl $2, %eax #,
460 .LVL38:
461 0180 83FA02 cmpl $2, %edx #, tmp.146
462 0183 7405 je .L21 #,
463 # main.cpp:62: dataArr[j] += 1;
464 .loc 4 62 0
465 0185 83448504 addl $1, 4(%rbp,%rax,4) #, MEM[(value_type &)_160 + 4]
465 01
466 .LVL39:
467 .L21:
468 .LBE930:
469 # main.cpp:60: for (unsigned int i = 0; i < 10000000; ++i) {
60:main.cpp **** for (unsigned int j = 0; j < 100; ++j) {
470 .loc 4 60 0
471 018a 83EE01 subl $1, %esi #, ivtmp_3
472 018d 0F856DFF jne .L22 #,
472 FFFF
With PAD_ALIGNMENT:
57:main.cpp **** void processData() {
58:main.cpp **** // Add 1 to every element a bunch of times.
59:main.cpp **** std::cout << "Processing data." << std::endl;
60:main.cpp **** for (unsigned int i = 0; i < 10000000; ++i) {
61:main.cpp **** for (unsigned int j = 0; j < 100; ++j) {
62:main.cpp **** dataArr[j] += 1;
410 .loc 4 62 0
411 0109 83454401 addl $1, 68(%rbp) #, MEM[(value_type &)this_8(D) + 68]
412 .LVL32:
413 010d 4183FD01 cmpl $1, %r13d #, prolog_loop_niters.140
414 0111 0F841901 je .L29 #,
414 0000
415 0117 83454801 addl $1, 72(%rbp) #, MEM[(value_type &)this_8(D) + 68]
416 .LVL33:
417 011b 4183FD02 cmpl $2, %r13d #, prolog_loop_niters.140
418 011f 0F842801 je .L30 #,
418 0000
419 0125 83454C01 addl $1, 76(%rbp) #, MEM[(value_type &)this_8(D) + 68]
420 .LVL34:
421 0129 BF610000 movl $97, %edi #, ivtmp_12
421 00
422 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
423 .loc 4 61 0
424 012e 41BA0300 movl $3, %r10d #, j
424 0000
425 .LVL35:
426 .L18:
427 0134 31C0 xorl %eax, %eax # ivtmp.156
428 0136 31D2 xorl %edx, %edx # ivtmp.153
429 0138 0F1F8400 .p2align 4,,10
429 00000000
430 .p2align 3
431 .L20:
432 0140 83C201 addl $1, %edx #, ivtmp.153
433 # main.cpp:62: dataArr[j] += 1;
434 .loc 4 62 0
435 0143 66410F6F movdqa (%r14,%rax), %xmm0 # MEM[base: vectp_this.148_118, index: ivtmp.156_48, offset: 0], vect__2
435 0406
436 0149 660FFEC1 paddd %xmm1, %xmm0 # tmp231, vect__25.150
437 014d 410F2904 movaps %xmm0, (%r14,%rax) # vect__25.150, MEM[base: vectp_this.148_118, index: ivtmp.156_48, offse
437 06
438 0152 4883C010 addq $16, %rax #, ivtmp.156
439 0156 4439FA cmpl %r15d, %edx # bnd.143, ivtmp.153
440 0159 72E5 jb .L20 #,
441 015b 4429E7 subl %r12d, %edi # niters_vector_mult_vf.144, ivtmp_12
442 015e 4439E3 cmpl %r12d, %ebx # niters_vector_mult_vf.144, niters.142
443 0161 438D0422 leal (%r10,%r12), %eax #, tmp.145
444 0165 89FA movl %edi, %edx # ivtmp_12, tmp.146
445 0167 7421 je .L21 #,
446 .LVL36:
447 0169 89C7 movl %eax, %edi # tmp.145, tmp.145
448 016b 8344BD44 addl $1, 68(%rbp,%rdi,4) #, MEM[(value_type &)_129 + 68]
448 01
449 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
450 .loc 4 61 0
451 0170 83FA01 cmpl $1, %edx #, tmp.146
452 0173 8D7801 leal 1(%rax), %edi #,
453 .LVL37:
454 0176 7412 je .L21 #,
455 # main.cpp:62: dataArr[j] += 1;
456 .loc 4 62 0
457 0178 8344BD44 addl $1, 68(%rbp,%rdi,4) #, MEM[(value_type &)_122 + 68]
457 01
458 # main.cpp:61: for (unsigned int j = 0; j < 100; ++j) {
61:main.cpp **** dataArr[j] += 1;
459 .loc 4 61 0
460 017d 83C002 addl $2, %eax #,
461 .LVL38:
462 0180 83FA02 cmpl $2, %edx #, tmp.146
463 0183 7405 je .L21 #,
464 # main.cpp:62: dataArr[j] += 1;
465 .loc 4 62 0
466 0185 83448544 addl $1, 68(%rbp,%rax,4) #, MEM[(value_type &)_160 + 68]
466 01
467 .LVL39:
468 .L21:
469 .LBE930:
470 # main.cpp:60: for (unsigned int i = 0; i < 10000000; ++i) {
60:main.cpp **** for (unsigned int j = 0; j < 100; ++j) {
471 .loc 4 60 0
472 018a 83EE01 subl $1, %esi #, ivtmp_3
473 018d 0F856DFF jne .L22 #,
473 FFFF
An instance of type SideProcessor has the following fields:

std::atomic<bool> processingRequested;
#ifdef PAD_ALIGNMENT
std::array<bool, 64> padding;
#endif
std::array<int, 100> dataArr;

The size of processingRequested is probably one byte. Without PAD_ALIGNMENT, the compiler will most likely arrange the fields such that the first few elements of dataArr are in the same 64-byte cache line as processingRequested. However, with PAD_ALIGNMENT, there will be a 64-byte gap between the two fields, so the first element of the array and processingRequested will be in different cache lines.
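
To double-check that layout claim, here is a small sketch of my own (not part of the original program) that prints where dataArr lands relative to the flag. The mirror structs UnpaddedLayout, PaddedLayout and AlignedLayout are made-up names, and the 64 in alignas(64) is the assumed cache-line size; the alignas variant is just an alternative to the manual bool-array padding (std::hardware_destructive_interference_size would be the C++17 way to spell that constant, but it isn't available in libstdc++ 7).

#include <array>
#include <atomic>
#include <cstddef>
#include <iostream>

// Simplified mirrors of SideProcessor's data members, used only to inspect
// where the compiler places dataArr relative to the flag.
struct UnpaddedLayout {
    std::atomic<bool> processingRequested;
    std::array<int, 100> dataArr;
};

struct PaddedLayout {
    std::atomic<bool> processingRequested;
    std::array<bool, 64> padding;
    std::array<int, 100> dataArr;
};

// Alternative: request the separation explicitly instead of padding by hand.
// 64 bytes is an assumption about the cache-line size.
struct AlignedLayout {
    std::atomic<bool> processingRequested;
    alignas(64) std::array<int, 100> dataArr;
};

int main() {
    std::cout << "unpadded dataArr offset: " << offsetof(UnpaddedLayout, dataArr) << '\n'
              << "padded   dataArr offset: " << offsetof(PaddedLayout, dataArr) << '\n'
              << "aligned  dataArr offset: " << offsetof(AlignedLayout, dataArr) << '\n';
    // Offsets below 64 mean the first array elements share the flag's cache line.
    return 0;
}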
Considering the loop in processData in isolation, one would expect all 100 elements of dataArr to easily fit in the L1D cache, so the vast majority of accesses should hit in the L1D. However, the main thread reads processingRequested in while (!(sideProcessor.isDone())) { } concurrently with the processing thread executing the loop in processData. Without PAD_ALIGNMENT, the main thread wants to read from the same cache line that the processing thread wants to both read and write. This results in a false sharing situation where the shared cache line repeatedly bounces between the private caches of the two cores on which the threads are running.
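
The effect can also be reproduced in isolation. The sketch below (mine, not from the question) has one thread spin-reading a flag while the other stores to a neighbouring counter, once with the two fields on the same cache line and once with the counter pushed onto its own line. Fields, runOnce and the 64-byte figure are illustrative assumptions, and the counter is volatile only so the stores aren't optimised away.

#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <thread>

// Two fields that either share a cache line (Separate == false) or live on
// different lines (Separate == true). 64 bytes is the assumed line size.
template <bool Separate>
struct alignas(64) Fields {
    std::atomic<bool> flag{false};
    alignas(Separate ? 64 : 8) volatile std::uint64_t counter = 0;
};

template <bool Separate>
double runOnce() {
    Fields<Separate> fields;
    std::atomic<bool> stop{false};

    // Reader: hammers the flag, like the main thread's isDone() spin.
    std::thread reader([&] {
        while (!stop.load(std::memory_order_relaxed)) {
            (void)fields.flag.load(std::memory_order_relaxed);
        }
    });

    // Writer: hammers the neighbouring counter, like processData()'s stores.
    const auto begin = std::chrono::steady_clock::now();
    for (std::uint64_t i = 0; i < 100000000ULL; ++i) {
        fields.counter = fields.counter + 1;
    }
    const auto end = std::chrono::steady_clock::now();

    stop = true;
    reader.join();
    return std::chrono::duration<double>(end - begin).count();
}

int main() {
    std::cout << "same cache line:     " << runOnce<false>() << " s\n";
    std::cout << "separate cache line: " << runOnce<true>()  << " s\n";
    return 0;
}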
With false sharing between two cores in the same LLC sharing domain, there will be a negligible number of misses by the LLC (it can backstop the requests so they don't go to DRAM), but there will be a lot of read and RFO requests from the two cores. That's why the LLC miss event counts are small.
It appears to me that the compiler has unrolled the loop in processData four times and vectorized it using 16-byte SSE instructions. This would explain why the number of stores is close to a quarter of a billion. Without PAD_ALIGNMENT, the number of loads is about a billion: roughly a quarter of them are from the processing thread and most of the rest are from the main thread. The number of loads executed in while (!(sideProcessor.isDone())) { } depends on the time it takes to complete the execution of processData, so it makes sense that the number of loads is much smaller in the case of no false sharing (with PAD_ALIGNMENT).
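
For what it's worth, the arithmetic behind that store count can be written down as a tiny sketch (the numbers are pulled from the source and the listing above; the real event counts also include the scalar prologue/epilogue stores and the rest of the program, so only rough agreement with the ~280 million L1-dcache-store events is expected):

#include <iostream>

int main() {
    // processData(): 10,000,000 outer iterations over 100 ints.
    constexpr unsigned long long outerIters   = 10000000ULL;
    constexpr unsigned long long elements     = 100ULL;
    // The vectorized body stores 4 ints per 16-byte movaps (see the listing).
    constexpr unsigned long long intsPerStore = 4ULL;

    constexpr unsigned long long scalarStores = outerIters * elements;                // 1,000,000,000
    constexpr unsigned long long vectorStores = outerIters * elements / intsPerStore; // 250,000,000

    std::cout << "stores if scalar:     " << scalarStores << '\n'
              << "stores if vectorized: " << vectorStores << '\n';
    return 0;
}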
In the case without PAD_ALIGNMENT, most of the L1-dcache-load-misses and LLC-loads events are from requests generated by the main thread, while most of the LLC-stores events are from requests generated by the processing thread. All of these requests are to the line containing processingRequested. It makes sense that LLC-stores is much larger than LLC-loads because the main thread accesses the line more rapidly than the processing thread, so it's more likely that the RFOs miss in the private caches of the core on which the processing thread is running. I think most of the L1-dcache-load-misses events also represent loads from the main thread to the shared cache line. It looks like only a third of these loads miss in the private L2 cache, which suggests that the line is being prefetched into the L2. This can be verified by disabling the L2 prefetchers and checking whether L1-dcache-load-misses is approximately equal to LLC-loads.
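
Another way to attribute the events per thread, short of disabling the prefetchers (which needs MSR or BIOS access), is to read the counters programmatically. Below is a rough sketch of mine using the Linux perf_event_open(2) syscall to count L1-dcache load misses for the calling thread only, which could be wrapped around the main thread's spin loop; openL1dLoadMissCounter is a made-up helper name, the event encoding follows the man page, and error handling is minimal.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

// Count L1D read misses for the calling thread only (pid = 0, cpu = -1).
static int openL1dLoadMissCounter() {
    perf_event_attr attr;
    std::memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = PERF_TYPE_HW_CACHE;
    attr.config = PERF_COUNT_HW_CACHE_L1D |
                  (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                  (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
    attr.disabled = 1;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;
    return static_cast<int>(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0));
}

int main() {
    const int fd = openL1dLoadMissCounter();
    if (fd < 0) {
        std::cerr << "perf_event_open failed (check perf_event_paranoid)\n";
        return 1;
    }

    ioctl(fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

    // ... region of interest (e.g. the spin-wait loop) goes here ...

    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    std::uint64_t misses = 0;
    if (read(fd, &misses, sizeof(misses)) != sizeof(misses)) {
        misses = 0;
    }
    std::cout << "L1-dcache-load-misses (this thread): " << misses << '\n';
    close(fd);
    return 0;
}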