Created
April 24, 2022 18:01
-
-
Save matu3ba/1a777c478a77fefb36181135a44bc47a to your computer and use it in GitHub Desktop.
notes on memory consistency
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Memory concistency and atomicity | |
#include <atomic> | |
#include <cassert> | |
#include <iostream> | |
#include <mutex> | |
#include <thread> | |
// assume: init of data = 0, flag != SET | |
// hint: L1 and B1 can repeat many times | |
// core 1 | core 2 | |
// S1: STORE data = NEW | | |
// S2: STORE flag = SET | L1: LOAD r1 = flag | |
// | B1: if (r1 != SET) goto L1 | |
// | L2: LOAD r2 = data | |
// memory operations can be reordered during compilation and execution | |
// source code --compiler reordering--> machine code --processor reordering--> memory | |
// => r2 = 0 is valid outcome of execution (core2 loads first data into r2) | |
// example: | |
// memory operations are reordered only with local view on each core, | |
// accesses frmo other cores are not considered | |
// core 1 | core 2 | |
// S2: STORE flag = SET | | |
// S1: STORE data = NEW | L1: LOAD r1 = flag | |
// | B1: if (r1 != SET) goto L1 | |
// | L2: LOAD r2 = data | |
// other example | |
// init: x=0, y=0 | |
// core 1 | core 2 | |
// S1: STORE x = NEW | S2: STORE r1 = flag | |
// L1: LOAD r1 = y | L2: LOAD r2 = x | |
// possible outcomes for tuple (r1, r2) ? | |
// possible outcomes (0,0), (0,NEW), (NEW,0), (NEW,NEW) | |
// for execution (L1,L2..), (..), (..), (..) | |
// x86 allows this kind of reordering | |
// 1. lock-based codes | |
// - use lock to avoid concurrent accesses to same memory addresses | |
// * program can deadlock or livelock | |
// 2. lock-free codes | |
// - use atomics to coordinate concurrent accesses to same memory addresses | |
// * program can never lock up | |
// - problem: memory operation reordering can influence correctness of algorithms | |
// - clear reasoning on memory consistency required | |
// memory consistency models (short: memory models) | |
// sequential progams: no problems (except pointers, bruh) | |
// parallel programs: complicated (and not reliable per se on architecutre) | |
// - specification of allowed behavior of multithreaded programs executing with shared memory | |
// - consists of rules defining allowed (re-)ordering Loads and Stores in memory | |
// - avoids unintended effects of memory operations reordering | |
// | |
// memory modesl separate multithreaded executions in valid (following rules) | |
// and invalid (violating rules) executions | |
// Kind of reorderings: | |
// 1. LoadLoad reorderings | |
// 2. LoadStore reordering | |
// 3. StoreLoad reordering | |
// example possible in x86, ARM, POWER.. | |
// S1: x = NEW | |
// L1: r1 = y | |
// => | |
// L1: r1 = y | |
// S1: x = NEW ( | |
// 4. StoreStore reordering | |
// sequential consistency (SC): enforce all orderings of memory operations | |
// - forbids all 4 types of reorderings for any memory operation | |
// Definition | |
// A multiprocessor is sequentially consistent, if | |
// the result of any execution is the same as if the operations of all processors | |
// were executed in some sequential order, | |
// and the operations of each individual processor appear in this sequence in | |
// the order specified by its program | |
// (intuitive expected behavior) | |
// previous example | |
// core 1 | core 2 | |
// S1: STORE x = NEW | S2: STORE r1 = flag | |
// L1: LOAD r1 = y | L2: LOAD r2 = x | |
// (0, 0) not possible, because it violates SC (store must happen before load) | |
// SC disadvantages | |
// - not used anymore | |
// * out-of-order execution (better pipeline utilization) | |
// * STORE instructions buffered if memory update takes very long due to cache miss | |
// - enforced orderings of SC prevents many optimizations | |
// - most expensive: enforced StoreLoad ordering | |
// * prevents reordering Stores after Loads that come later in program order | |
// * store before a load (in program order) must be finished | |
// (ie visible to all other processors) before load operations takes place | |
// Relaxing SC: Total Store Order (TSO) | |
// - allows StoreLoad reordering | |
// - remaining enforced orderings | |
// * LoadLoad | |
// * LoadStore | |
// * StoreStore | |
// - x86 memory model follows TSO | |
// TSO model is weaker than SC model and SC model is stronger than TSO model | |
// previous example | |
// core 1 | core 2 | |
// S1: STORE x = NEW | S2: STORE r1 = flag | |
// L1: LOAD r1 = y | L2: LOAD r2 = x | |
// (0, 0) for execution L1, S2, L2, S1 under TSO (StoreLoad reordering allowed) | |
// avoid StoreLoad reordering in TSO: | |
// * memor yfence | |
// 1. full memory fence | |
// 2. StoreLoad fence | |
// 3. LoadLoad, StoreStore, LoadStore similar | |
// previous example | |
// core 1 | core 2 | |
// S1: STORE x = NEW | S2: STORE r1 = flag | |
// │ │ | |
// ▼-- direction of reordering ▼ -- direction of reordering | |
// F1: Store_Load_Fence() | F2: Store_Load_Fence() <<-- StoreLoad/LoadLoad,StoreStore Fence etc | |
// ▲-- direction of reordering ▲ -- direction of reordering | |
// │ │ | |
// L1: LOAD r1 = y | L2: LOAD r2 = x | |
// StoreLoad fence avoids (0, 0) as results | |
// Memory Models OVerview | |
// - Hardware memory model: Memory ordering behavior at runtime | |
// - Software memory model: Putting another memory model on top of a (typically) | |
// weaker hardware memory model | |
// * emulates stronger memory model on weaker hardware model by issuing the | |
// corresponding memory fences | |
// really weak -> weak with -> usually strong -> sequentially consistent | |
// data dependency | |
// ordering | |
// DEC Alpha ARM x86/64 dual 386 | |
// PowerPC a SPARC TSO Java volatile | |
// C/C++ 11 C/C++11 default atomics | |
// low-level or run on single core | |
// atomics without optimizations | |
// Software Memory Models in C++ | |
// memory_order | fences | |
// relaxed | None | |
// consume | LoadLoad, LoadStore (fences only for vars that are data-dependent on that atomic) | |
// acquire | LoadLoad, LoadStore | |
// release | LoadStore, StoreStore | |
// acq_rel | LoadLoad, LoadStore, StoreStore | |
// seq_cst | All (default order) | |
// int main() | |
//{ | |
// // is_lock_fre() | |
// std::atomic<int> atom_int; | |
// if (atom_int.is_lock_free()) | |
// std::cout << "lockfree\n"; | |
// // std::atomic_flag is lock-free | |
// } | |
// default memory order forall operations: memory_order_seq_cst | |
// supported memory orders for different operations | |
// relaxed consume acquire release acq_rel seq_cst | |
// load yes yes yes no no yes | |
// store yes no no yes yes | |
// read-modify-write yes yes yes no yes yes | |
// notable ones | |
// - memory_order_seq_csv | |
// * prevents all reorderings | |
// - memory_order_acq_rel | |
// * only allows StoreLoad reorderings (TSO) | |
// - memory_order_acquire / consume | |
// * allows StoreLoad and StoreStore reorderings | |
// - memory_order_release | |
// * allows LoadLoad and StoreLoad reorderings | |
std::atomic<bool> flag(false); | |
int data; | |
void thread1() | |
{ | |
data = 1337; | |
// full fence (data = 1337 made visible to all threads) | |
// flag.store(true, std::memory_order_seq_cst); // most strict | |
// flag.store(true, std::memory_order_acq_rel); // more strict | |
flag.store(true, std::memory_order_release); // strict enough | |
// breaks flag.store(true, std::memory_order_relaxed); // not strict enough | |
} | |
void thread2() | |
{ | |
while (!flag.load(std::memory_order_acquire)) | |
; // strict enough | |
// full fence (load of data in assertion cannot be reordered) | |
assert(data == 1337); // will never fail | |
} | |
int main() | |
{ | |
std::thread t1(thread1); | |
std::thread t2(thread2); | |
t1.join(); | |
t2.join(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment