@Peter Cordes is right. I rewrote the atomic version (using 60 bytes of padding on a machine with a 64-byte cache line, so that r1 and r2 end up on different cache lines):
#include <atomic>
#include <cstdio>
#include <thread>

std::atomic<int> x;
std::atomic<int> y;

struct Result {
    std::atomic<int> r1;
    char padding[60];   // 4 + 60 = 64 bytes, so r2 starts on the next cache line
                        // (assuming res itself begins at a cache-line boundary)
    std::atomic<int> r2;
} res;

void thread1_func() {
    res.r1.store(x.load(std::memory_order_relaxed), std::memory_order_relaxed);
    if (res.r1.load(std::memory_order_relaxed)) {
        res.r2.store(y.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }
}

void thread2_func() {
    y.store(42, std::memory_order_relaxed);
    x.store(1, std::memory_order_release);
}

void thread3_func() {
    if (res.r2.load(std::memory_order_relaxed) == 0) {
        return;
    }
    // r2 is non-zero here, but with relaxed ordering r1 may still be observed as 0
    if (res.r1.load(std::memory_order_relaxed) == 0) {
        printf("r1: %d, r2: %d\n", res.r1.load(std::memory_order_relaxed),
               res.r2.load(std::memory_order_relaxed));
    }
}

int main() {
    while (1) {
        x = 0;
        y = 0;
        res.r1 = 0;
        res.r2 = 0;
        std::thread t1(thread1_func);
        std::thread t2(thread2_func);
        std::thread t3(thread3_func);
        t1.join();
        t2.join();
        t3.join();
    }
    return 0;
}
and now the program can enter the printf branch: on weakly ordered hardware, the two relaxed stores in thread1_func (or the two relaxed loads in thread3_func) may be observed out of order, so thread3 can see r2 == 42 while still seeing r1 == 0.
If we'd like thread3 to never enter the printf branch, we can use 'release' ordering on res.r2.store (paired with an 'acquire' load of res.r2 in thread3_func, so the two form a synchronizes-with relationship):
void thread1_func() {
    res.r1.store(x.load(std::memory_order_relaxed), std::memory_order_relaxed);
    if (res.r1.load(std::memory_order_relaxed)) {
        // release: the store to res.r1 above cannot become visible after this store
        res.r2.store(y.load(std::memory_order_relaxed), std::memory_order_release);
    }
}
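For the release store to give thread3 any guarantee under the C++ memory model, the matching load of res.r2 must be an acquire. A minimal sketch of thread3_func with that acquire load, assuming the rest of the program stays as above:

void thread3_func() {
    // acquire pairs with the release store in thread1_func: if this load returns 42,
    // thread1's earlier store of 1 into res.r1 is guaranteed to be visible here too
    if (res.r2.load(std::memory_order_acquire) == 0) {
        return;
    }
    if (res.r1.load(std::memory_order_relaxed) == 0) {
        // never reached: the acquire/release pair forbids r2 == 42 with r1 == 0
        printf("r1: %d, r2: %d\n", res.r1.load(std::memory_order_relaxed),
               res.r2.load(std::memory_order_relaxed));
    }
}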