c++ x86 cpu-architecture memory-barriers lock-free

x86 memory ordering test shows reordering where Intel's manual says there shouldn't be?

According to Intel's manual, Section 8.2.3.2 ("Neither Loads Nor Stores Are Reordered with Like Operations"), neither loads nor stores are reordered with like operations

(see the document at https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-3a-part-1-manual.html). [image omitted]

However, when I created a simple test case, I found that r1=1 and r2=0 did happen.

#include <thread>
#include <iostream>

using namespace std;

// Shared location: foo() stores 1 to it, fool2() loads it. Reset to 0 by main() each round.
volatile int x;
// Not referenced by any code shown here.
// NOTE(review): presumably padding meant to keep x and y apart in memory — confirm.
int b[500];
// Shared location: foo() stores 1 to it after x, fool2() loads it after x.
volatile int y;
// Start gate: both worker threads spin while this is 0; main() sets it to 1.
volatile int start;

// Outcome tallies incremented by fool2(), where a is the value read from x
// and b is the value read from y:
int s1;   // a==0, b==0
int s2;   // a==1, b==0
int s3;   // a==1, b==1
int s0;   // a==0, b==1
// Writer thread body: waits for the start flag, then stores 1 to x and
// afterwards 1 to y. Always returns 0.
int foo()
{
    while(start==0);   // spin until main() sets start to a non-zero value
    x=1;
    // Empty asm with a "memory" clobber: no instruction is emitted, it only
    // keeps the compiler from moving memory accesses across this point.
    asm volatile("" ::: "memory");
    y=1;
    return 0;
}

// Reader thread body: waits for the start flag, loads x and then y, and
// increments the one counter (s0..s3) that matches the observed pair.
// Always returns 0.
int fool2()
{
    while (start == 0) {
        // spin until main() sets start to a non-zero value
    }

    const int seen_x = x;
    // Compiler-only barrier: keeps the two loads in source order in the
    // generated code without emitting any instruction.
    asm volatile("" ::: "memory");
    const int seen_y = y;

    // The four cases are mutually exclusive, so at most one counter moves.
    if (seen_x == 0 && seen_y == 1) {
        ++s0;
    } else if (seen_x == 0 && seen_y == 0) {
        ++s1;
    } else if (seen_x == 1 && seen_y == 0) {
        ++s2;
    } else if (seen_x == 1 && seen_y == 1) {
        ++s3;
    }
    return 0;
}

// Runs the litmus test forever. Each round resets the shared variables, runs
// foo() and fool2() on two fresh threads, and every 65536 rounds prints the
// running tallies in the order s0 s1 s2 s3.
int main()
{
  // Unsigned: the loop never terminates, so a signed counter would eventually
  // overflow; unsigned wrap-around is well defined and keeps the mask test valid.
  unsigned int i=0;
  while(1)
  {
     // Close the gate again before launching the threads. Without this reset
     // start stays 1 after the first round, neither thread spins, and foo()
     // can run to completion before fool2() has even been created.
     // Safe to write here: both threads of the previous round were joined.
     start = 0;
     x=y=0;
     thread t1(foo);
     thread t2(fool2);
     start = 1;   // release both threads
     t1.join();
     t2.join();
     i++;
     if((i&0xFFFF)==0)
     {
           cout<<s0<<" "<<s1<<" "<<s2<<" "<<s3<<endl;
     }
  }
}

g++ -O2 -pthread e.cpp

gcc version 7.5.0

output:

69 86538 1 19246512

All four cases (every combination of r1 and r2 being 0 or 1) are possible.

Solution

Take a closer look at what Section 8.2.3.2 of the Intel manual says. In your example you are effectively doing:

Processor 1	Processor 2
mov [ _x], 1	mov r2, _x
mov [ _y], 1	mov r1, _y

Instead of what the Intel manual says:

Processor 1	Processor 2
mov [ _x], 1	mov r1, _y
mov [ _y], 1	mov r2, _x

In your example, processor 2 may load _x before processor 1 sets it and then load _y after processor 1 has stored it, thus allowing (r1=1, r2=0):

Instruction	Processor
mov r2, _x	2
mov [ _x], 1	1
mov [ _y], 1	1
mov r1, _y	2

In the Intel example, processor 2 can only load _x after it has loaded _y, and processor 1 only sets _y after it has set _x, so (r1=1, r2=0) is impossible.

Here is some code that demonstrates the Intel behavior:

#include <thread>
#include <iostream>
#include <stdlib.h>

using namespace std;

// Shared locations: foo() stores 1 to x and then to y; fool2() loads both.
// main() resets them to 0 at the start of every round.
volatile int x;
volatile int y;
// Start gate: both worker threads spin while this is 0; main() sets it to 1.
volatile int start;

constexpr bool flipOrdering = true; //Set this to true to see Intel example, false to see your example
constexpr int jitter = 10000;       //Range of random delay inserted between load/stores to make differences more obvious

// Outcome tallies incremented by fool2(), where a is the value read from x
// and b is the value read from y:
int s1;   // a==0, b==0
int s2;   // a==1, b==0
int s3;   // a==1, b==1
int s0;   // a==0, b==1
// Writer thread body: waits for the start flag, then stores 1 to x and later
// 1 to y, with a random-length busy-wait before each step. Always returns 0.
// NOTE(review): rand() is called here and in fool2() from threads that run
// concurrently — confirm the C library's rand() is safe for that use.
int foo() {

    while(start==0);   // spin until main() sets start to a non-zero value

    // Busy-wait for rand()%jitter iterations; the counter is volatile so the
    // otherwise empty loop has to be executed.
    for(volatile int i = rand()%jitter; i; --i);
    x = 1;
    
    for(volatile int i = rand()%jitter; i; --i);
    // Empty asm with a "memory" clobber: no instruction is emitted, it only
    // keeps the compiler from moving memory accesses across this point.
    asm volatile("" ::: "memory");

    for(volatile int i = rand()%jitter; i; --i);
    y = 1;

    return 0;
}

// Reader thread body: waits for the start flag, then loads x and y with a
// random-length busy-wait around each step, and increments the one counter
// (s0..s3) matching the observed pair. With flipOrdering set, y is loaded
// first and x second (the Intel manual's order); otherwise x first, y second.
// Always returns 0.
int fool2() {
    int seen_x;
    int seen_y;

    while (start == 0) {
        // spin until main() sets start to a non-zero value
    }

    // First load, preceded by a random delay.
    for (volatile int spin = rand() % jitter; spin; --spin) {
    }
    if constexpr (flipOrdering) {
        seen_y = y;
    } else {
        seen_x = x;
    }

    for (volatile int spin = rand() % jitter; spin; --spin) {
    }
    // Compiler-only barrier: keeps the two loads in source order in the
    // generated code without emitting any instruction.
    asm volatile("" ::: "memory");

    // Second load, preceded by a random delay.
    for (volatile int spin = rand() % jitter; spin; --spin) {
    }
    if constexpr (flipOrdering) {
        seen_x = x;
    } else {
        seen_y = y;
    }

    // The four cases are mutually exclusive, so at most one counter moves.
    if (seen_x == 0 && seen_y == 1) {
        ++s0;
    } else if (seen_x == 0 && seen_y == 0) {
        ++s1;
    } else if (seen_x == 1 && seen_y == 0) {
        ++s2;
    } else if (seen_x == 1 && seen_y == 1) {
        ++s3;
    }

    return 0;
}

// Runs 1000 rounds of the litmus test. Each round resets the shared
// variables, runs foo() and fool2() on two fresh threads, and every 100
// rounds prints the running tallies in the order s0 s1 s2 s3. Returns 0.
int main() {
    int i=0;
    while(i< 1000) {
        // Close the gate again before launching the threads. Without this
        // reset start stays 1 after the first round, neither thread spins,
        // and foo() can get a head start before fool2() has been created.
        // Safe to write here: both threads of the previous round were joined.
        start = 0;
        x=y=0;
        thread t1(foo);
        thread t2(fool2);
        start = 1;   // release both threads
        t1.join();
        t2.join();
        i++;

        if((i%100)==0) {
            cout<<s0<<" "<<s1<<" "<<s2<<" "<<s3<<endl;
        }
    }

    return 0;
}

And here's a link to the same code running in Compiler Explorer.