matu3ba · April 24, 2022 18:01
diff --git a/mem_consistency.cpp b/mem_consistency.cpp
 // Memory concistency and atomicity
 #include <atomic>
 #include <cassert>
 #include <iostream>
 #include <mutex>
 #include <thread>

 // assume: init of data = 0, flag != SET
 // hint: L1 and B1 can repeat many times

 // core 1                 | core 2
 // S1: STORE data = NEW   |
 // S2: STORE flag = SET   | L1: LOAD r1 = flag
 //                        | B1: if (r1 != SET) goto L1
 //                        | L2: LOAD r2 = data

 // memory operations can be reordered during compilation and execution
 // source code --compiler reordering--> machine code --processor reordering--> memory
 // => r2 = 0 is valid outcome of execution (core2 loads first data into r2)
 // example:
 // memory operations are reordered only with local view on each core,
 // accesses frmo other cores are not considered
 // core 1                 | core 2
 // S2: STORE flag = SET   |
 // S1: STORE data = NEW   | L1: LOAD r1 = flag
 //                        | B1: if (r1 != SET) goto L1
 //                        | L2: LOAD r2 = data

 // other example
 // init: x=0, y=0
 // core 1                 | core 2
 // S1: STORE x = NEW      | S2: STORE r1 = flag
 // L1: LOAD r1 = y        | L2: LOAD r2 = x
 // possible outcomes for tuple (r1, r2) ?
 // possible outcomes (0,0), (0,NEW), (NEW,0), (NEW,NEW)
 // for execution (L1,L2..), (..), (..), (..)
 // x86 allows this kind of reordering

 // 1. lock-based codes
 // - use lock to avoid concurrent accesses to same memory addresses
 //   * program can deadlock or livelock
 // 2. lock-free codes
 // - use atomics to coordinate concurrent accesses to same memory addresses
 //   * program can never lock up
 // - problem: memory operation reordering can influence correctness of algorithms
 // - clear reasoning on memory consistency required

 // memory consistency models (short: memory models)
 // sequential progams: no problems (except pointers, bruh)
 // parallel programs: complicated (and not reliable per se on architecutre)
 // - specification of allowed behavior of multithreaded programs executing with shared memory
 // - consists of rules defining allowed (re-)ordering Loads and Stores in memory
 // - avoids unintended effects of memory operations reordering
 //
 // memory modesl separate multithreaded executions in valid (following rules)
 // and invalid (violating rules) executions

 // Kind of reorderings:
 // 1. LoadLoad reorderings
 // 2. LoadStore reordering
 // 3. StoreLoad reordering
 // example possible in x86, ARM, POWER..
 // S1: x = NEW
 // L1: r1 = y
 // =>
 // L1: r1 = y
 // S1: x = NEW (
 // 4. StoreStore reordering

 // sequential consistency (SC): enforce all orderings of memory operations
 // - forbids all 4 types of reorderings for any memory operation
 // Definition
 // A multiprocessor is sequentially consistent, if
 // the result of any execution is the same as if the operations of all processors
 // were executed in some sequential order,
 // and the operations of each individual processor appear in this sequence in
 // the order specified by its program
 // (intuitive expected behavior)

 // previous example
 // core 1                 | core 2
 // S1: STORE x = NEW      | S2: STORE r1 = flag
 // L1: LOAD r1 = y        | L2: LOAD r2 = x
 // (0, 0) not possible, because it violates SC (store must happen before load)

 // SC disadvantages
 // - not used anymore
 //   * out-of-order execution (better pipeline utilization)
 //   * STORE instructions buffered if memory update takes very long due to cache miss
 // - enforced orderings of SC prevents many optimizations
 // - most expensive: enforced StoreLoad ordering
 //   * prevents reordering Stores after Loads that come later in program order
 //   * store before a load (in program order) must be finished
 //   (ie visible to all other processors) before load operations takes place

 // Relaxing SC: Total Store Order (TSO)
 // - allows StoreLoad reordering
 // - remaining enforced orderings
 //   * LoadLoad
 //   * LoadStore
 //   * StoreStore
 // - x86 memory model follows TSO
 // TSO model is weaker than SC model and SC model is stronger than TSO model

 // previous example
 // core 1                 | core 2
 // S1: STORE x = NEW      | S2: STORE r1 = flag
 // L1: LOAD r1 = y        | L2: LOAD r2 = x
 // (0, 0) for execution L1, S2, L2, S1 under TSO (StoreLoad reordering allowed)

 // avoid StoreLoad reordering in TSO:
 // * memor yfence
 //   1. full memory fence
 //   2. StoreLoad fence
 //   3. LoadLoad, StoreStore, LoadStore similar

 // previous example
 // core 1                 | core 2
 // S1: STORE x = NEW      | S2: STORE r1 = flag
 //     │                           │
 //     ▼-- direction of reordering ▼ -- direction of reordering
 // F1: Store_Load_Fence() | F2: Store_Load_Fence() <<-- StoreLoad/LoadLoad,StoreStore Fence etc
 //     ▲-- direction of reordering ▲ -- direction of reordering
 //     │                           │
 // L1: LOAD r1 = y        | L2: LOAD r2 = x
 // StoreLoad fence avoids (0, 0) as results

 // Memory Models OVerview
 // - Hardware memory model: Memory ordering behavior at runtime
 // - Software memory model: Putting another memory model on top of a (typically)
 //   weaker hardware memory model
 //   * emulates stronger memory model on weaker hardware model by issuing the
 //   corresponding memory fences
 // really weak -> weak with       -> usually strong -> sequentially consistent
 //                data dependency
 //                ordering
 // DEC Alpha      ARM                x86/64            dual 386
 //                PowerPC a          SPARC TSO         Java volatile
 // C/C++ 11                                            C/C++11 default atomics
 // low-level                                           or run on single core
 // atomics                                             without optimizations

 // Software Memory Models in C++
 // memory_order | fences
 // relaxed      | None
 // consume      | LoadLoad, LoadStore (fences only for vars that are data-dependent on that atomic)
 // acquire      | LoadLoad, LoadStore
 // release      | LoadStore, StoreStore
 // acq_rel      | LoadLoad, LoadStore, StoreStore
 // seq_cst      | All (default order)

 // int main()
 //{
 //     // is_lock_fre()
 //     std::atomic<int> atom_int;
 //     if (atom_int.is_lock_free())
 //         std::cout << "lockfree\n";
 //     // std::atomic_flag is lock-free
 // }

 // default memory order forall operations: memory_order_seq_cst
 // supported memory orders for different operations
 //                   relaxed consume acquire release acq_rel seq_cst
 // load              yes     yes     yes     no      no      yes
 // store             yes     no      no      yes             yes
 // read-modify-write yes     yes     yes     no      yes     yes

 // notable ones
 // - memory_order_seq_csv
 //   * prevents all reorderings
 // - memory_order_acq_rel
 //   * only allows StoreLoad reorderings (TSO)
 // - memory_order_acquire / consume
 //   * allows StoreLoad and StoreStore reorderings
 // - memory_order_release
 //   * allows LoadLoad and StoreLoad reorderings

 std::atomic<bool> flag(false);
 int data;

 void thread1()
 {
    data = 1337;
    // full fence (data = 1337 made visible to all threads)
    // flag.store(true, std::memory_order_seq_cst); // most strict
    // flag.store(true, std::memory_order_acq_rel); // more strict
    flag.store(true, std::memory_order_release); // strict enough
    // breaks flag.store(true, std::memory_order_relaxed); // not strict enough
 }

 void thread2()
 {
    while (!flag.load(std::memory_order_acquire))
        ; // strict enough
    // full fence (load of data in assertion cannot be reordered)
    assert(data == 1337); // will never fail
 }

 int main()
 {
    std::thread t1(thread1);
    std::thread t2(thread2);
    t1.join();
    t2.join();
 }
	// Memory concistency and atomicity
	#include <atomic>
	#include <cassert>
	#include <iostream>
	#include <mutex>
	#include <thread>

	// assume: init of data = 0, flag != SET
	// hint: L1 and B1 can repeat many times

	// core 1 \| core 2
	// S1: STORE data = NEW \|
	// S2: STORE flag = SET \| L1: LOAD r1 = flag
	// \| B1: if (r1 != SET) goto L1
	// \| L2: LOAD r2 = data

	// memory operations can be reordered during compilation and execution
	// source code --compiler reordering--> machine code --processor reordering--> memory
	// => r2 = 0 is valid outcome of execution (core2 loads first data into r2)
	// example:
	// memory operations are reordered only with local view on each core,
	// accesses frmo other cores are not considered
	// core 1 \| core 2
	// S2: STORE flag = SET \|
	// S1: STORE data = NEW \| L1: LOAD r1 = flag
	// \| B1: if (r1 != SET) goto L1
	// \| L2: LOAD r2 = data

	// other example
	// init: x=0, y=0
	// core 1 \| core 2
	// S1: STORE x = NEW \| S2: STORE r1 = flag
	// L1: LOAD r1 = y \| L2: LOAD r2 = x
	// possible outcomes for tuple (r1, r2) ?
	// possible outcomes (0,0), (0,NEW), (NEW,0), (NEW,NEW)
	// for execution (L1,L2..), (..), (..), (..)
	// x86 allows this kind of reordering

	// 1. lock-based codes
	// - use lock to avoid concurrent accesses to same memory addresses
	// * program can deadlock or livelock
	// 2. lock-free codes
	// - use atomics to coordinate concurrent accesses to same memory addresses
	// * program can never lock up
	// - problem: memory operation reordering can influence correctness of algorithms
	// - clear reasoning on memory consistency required

	// memory consistency models (short: memory models)
	// sequential progams: no problems (except pointers, bruh)
	// parallel programs: complicated (and not reliable per se on architecutre)
	// - specification of allowed behavior of multithreaded programs executing with shared memory
	// - consists of rules defining allowed (re-)ordering Loads and Stores in memory
	// - avoids unintended effects of memory operations reordering
	//
	// memory modesl separate multithreaded executions in valid (following rules)
	// and invalid (violating rules) executions

	// Kind of reorderings:
	// 1. LoadLoad reorderings
	// 2. LoadStore reordering
	// 3. StoreLoad reordering
	// example possible in x86, ARM, POWER..
	// S1: x = NEW
	// L1: r1 = y
	// =>
	// L1: r1 = y
	// S1: x = NEW (
	// 4. StoreStore reordering

	// sequential consistency (SC): enforce all orderings of memory operations
	// - forbids all 4 types of reorderings for any memory operation
	// Definition
	// A multiprocessor is sequentially consistent, if
	// the result of any execution is the same as if the operations of all processors
	// were executed in some sequential order,
	// and the operations of each individual processor appear in this sequence in
	// the order specified by its program
	// (intuitive expected behavior)

	// previous example
	// core 1 \| core 2
	// S1: STORE x = NEW \| S2: STORE r1 = flag
	// L1: LOAD r1 = y \| L2: LOAD r2 = x
	// (0, 0) not possible, because it violates SC (store must happen before load)

	// SC disadvantages
	// - not used anymore
	// * out-of-order execution (better pipeline utilization)
	// * STORE instructions buffered if memory update takes very long due to cache miss
	// - enforced orderings of SC prevents many optimizations
	// - most expensive: enforced StoreLoad ordering
	// * prevents reordering Stores after Loads that come later in program order
	// * store before a load (in program order) must be finished
	// (ie visible to all other processors) before load operations takes place

	// Relaxing SC: Total Store Order (TSO)
	// - allows StoreLoad reordering
	// - remaining enforced orderings
	// * LoadLoad
	// * LoadStore
	// * StoreStore
	// - x86 memory model follows TSO
	// TSO model is weaker than SC model and SC model is stronger than TSO model

	// previous example
	// core 1 \| core 2
	// S1: STORE x = NEW \| S2: STORE r1 = flag
	// L1: LOAD r1 = y \| L2: LOAD r2 = x
	// (0, 0) for execution L1, S2, L2, S1 under TSO (StoreLoad reordering allowed)

	// avoid StoreLoad reordering in TSO:
	// * memor yfence
	// 1. full memory fence
	// 2. StoreLoad fence
	// 3. LoadLoad, StoreStore, LoadStore similar

	// previous example
	// core 1 \| core 2
	// S1: STORE x = NEW \| S2: STORE r1 = flag
	// │ │
	// ▼-- direction of reordering ▼ -- direction of reordering
	// F1: Store_Load_Fence() \| F2: Store_Load_Fence() <<-- StoreLoad/LoadLoad,StoreStore Fence etc
	// ▲-- direction of reordering ▲ -- direction of reordering
	// │ │
	// L1: LOAD r1 = y \| L2: LOAD r2 = x
	// StoreLoad fence avoids (0, 0) as results

	// Memory Models OVerview
	// - Hardware memory model: Memory ordering behavior at runtime
	// - Software memory model: Putting another memory model on top of a (typically)
	// weaker hardware memory model
	// * emulates stronger memory model on weaker hardware model by issuing the
	// corresponding memory fences
	// really weak -> weak with -> usually strong -> sequentially consistent
	// data dependency
	// ordering
	// DEC Alpha ARM x86/64 dual 386
	// PowerPC a SPARC TSO Java volatile
	// C/C++ 11 C/C++11 default atomics
	// low-level or run on single core
	// atomics without optimizations

	// Software Memory Models in C++
	// memory_order \| fences
	// relaxed \| None
	// consume \| LoadLoad, LoadStore (fences only for vars that are data-dependent on that atomic)
	// acquire \| LoadLoad, LoadStore
	// release \| LoadStore, StoreStore
	// acq_rel \| LoadLoad, LoadStore, StoreStore
	// seq_cst \| All (default order)

	// int main()
	//{
	// // is_lock_fre()
	// std::atomic<int> atom_int;
	// if (atom_int.is_lock_free())
	// std::cout << "lockfree\n";
	// // std::atomic_flag is lock-free
	// }

	// default memory order forall operations: memory_order_seq_cst
	// supported memory orders for different operations
	// relaxed consume acquire release acq_rel seq_cst
	// load yes yes yes no no yes
	// store yes no no yes yes
	// read-modify-write yes yes yes no yes yes

	// notable ones
	// - memory_order_seq_csv
	// * prevents all reorderings
	// - memory_order_acq_rel
	// * only allows StoreLoad reorderings (TSO)
	// - memory_order_acquire / consume
	// * allows StoreLoad and StoreStore reorderings
	// - memory_order_release
	// * allows LoadLoad and StoreLoad reorderings

	std::atomic<bool> flag(false);
	int data;

	void thread1()
	{
	data = 1337;
	// full fence (data = 1337 made visible to all threads)
	// flag.store(true, std::memory_order_seq_cst); // most strict
	// flag.store(true, std::memory_order_acq_rel); // more strict
	flag.store(true, std::memory_order_release); // strict enough
	// breaks flag.store(true, std::memory_order_relaxed); // not strict enough
	}

	void thread2()
	{
	while (!flag.load(std::memory_order_acquire))
	; // strict enough
	// full fence (load of data in assertion cannot be reordered)
	assert(data == 1337); // will never fail
	}

	int main()
	{
	std::thread t1(thread1);
	std::thread t2(thread2);
	t1.join();
	t2.join();
	}