How can one understand and predict the performance of multicore algorithms?
As a first step, I'd recommend acquiring a basic understanding of the MSI protocol. Modern systems use somewhat more complex protocols, but a basic understanding of MSI should be enough to predict the most important phenomena.
Below is an even further simplified model of shared memory based on the MSI protocol. While the model is not precise and makes several simplifications not grounded in actual hardware, it should be accurate enough to predict many phenomena, including the false-sharing performance pitfall.
The memory is divided into locations called cache lines. Each core has its own copy of the whole memory (all cache lines) and additionally considers each of its own cache lines to be in one of the following three states:
- Modified
- Shared
- Invalid
State changes do not happen spontaneously. Some core needs to perform a read from or a write to a cache line.
When a core holds a cache line in the Modified state, both reads and writes are cheap. The cache line must be in the Invalid state in all the other cores, and neither a read nor a write requires changing the state of the cache line in those other cores.
Reading a cache line in the Shared state is cheap. Writing to a cache line in the Shared state is more expensive: the write changes the state of the cache line in the writing core to Modified and, additionally, changes the state of the corresponding cache line in all the other cores to Invalid.
When a core wants to read a cache line in the Invalid state, it needs another core that has that cache line in either the Modified or Shared state to send a copy. If the sender was in the Modified state, it must change its state to Shared. The reader also sets its state to Shared. Writing to an Invalid cache line likewise requires another core that has the cache line in the Modified or Shared state to send a copy; then all the cores that have the cache line in the Modified or Shared state must change their state to Invalid, and the writer sets its state to Modified.
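As a concrete illustration, here is a minimal sketch (my own, not part of the gist) of this simplified model in C: the per-core states of a single cache line and the transitions caused by a read or a write from one core. The core count and function names are just illustrative.

```c
#include <stdio.h>

/* The three per-core states of a cache line in the simplified model. */
enum state { INVALID, SHARED, MODIFIED };

#define NCORES 4

/* A read by `core`: an Invalid reader gets a copy from a core holding the
   line in the Modified or Shared state; a Modified sender downgrades to
   Shared. Reads from Modified or Shared change no state. */
static void msi_read(enum state line[NCORES], int core) {
    if (line[core] == INVALID) {
        for (int i = 0; i < NCORES; i++)
            if (line[i] == MODIFIED)
                line[i] = SHARED;
        line[core] = SHARED;
    }
}

/* A write by `core`: every other copy becomes Invalid and the writer's
   copy becomes Modified, regardless of its previous state. */
static void msi_write(enum state line[NCORES], int core) {
    for (int i = 0; i < NCORES; i++)
        line[i] = (i == core) ? MODIFIED : INVALID;
}

int main(void) {
    enum state line[NCORES] = { MODIFIED, INVALID, INVALID, INVALID };
    msi_read(line, 1);   /* core 1 reads: core 0 downgrades to Shared   */
    msi_write(line, 2);  /* core 2 writes: cores 0 and 1 become Invalid */
    for (int i = 0; i < NCORES; i++)
        printf("core %d: %d\n", i, line[i]);
    return 0;
}
```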
The various state changes and transmissions of cache line contents have
potentially different costs. Transitions from the Invalid state to other
states tend to be expensive.
While drastically simplified, this model should be accurate enough to help predict the performance of many multicore algorithms. To predict the performance of one algorithm relative to another, determine and count the different state changes and cache line transfers for both.
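For example, false sharing falls straight out of this counting exercise: if two threads keep writing to different variables that happen to live in the same cache line, every write finds the line Invalid or Shared in the writer's core and pays the expensive transitions above, even though the threads never touch each other's data. Below is a hedged C sketch of the two layouts; the 64-byte line size, names, and iteration count are my assumptions, not taken from the gist.

```c
#include <pthread.h>
#include <stdint.h>

/* Layout 1: both counters share one cache line, so each increment by one
   thread invalidates the other thread's copy and the next increment pays
   a write-to-Invalid (or write-to-Shared) transition. */
struct shared_line {
    uint64_t a;
    uint64_t b;
};

/* Layout 2: padding pushes the counters onto separate cache lines
   (assuming 64-byte lines), so each core keeps its own line in the
   Modified state and every write is cheap. */
struct padded {
    uint64_t a;
    char pad[56];
    uint64_t b;
};

static struct padded counters;   /* change the type to struct shared_line
                                    to observe the false-sharing slowdown */

static void *bump_a(void *arg) {
    (void)arg;
    for (long i = 0; i < 100000000; i++)
        __atomic_fetch_add(&counters.a, 1, __ATOMIC_RELAXED);
    return NULL;
}

static void *bump_b(void *arg) {
    (void)arg;
    for (long i = 0; i < 100000000; i++)
        __atomic_fetch_add(&counters.b, 1, __ATOMIC_RELAXED);
    return NULL;
}

int main(void) {
    pthread_t t1, t2;
    pthread_create(&t1, NULL, bump_a, NULL);
    pthread_create(&t2, NULL, bump_b, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}
```

Counting transitions in the model already predicts the ordering: with the shared layout almost every increment is an expensive write to a Shared or Invalid line, while with the padded layout each increment is a cheap write to a line the core already has in the Modified state.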
This gist also includes a program written for Apple M1 to try to quantify the relative costs of various state changes. On my Apple M1 laptop I get roughly the following output from the program:
Relative to fastest:
Read M: 1.00 *
Write M: 2.58 ***
Read S: 1.04 *
Write S: 85.40 *************************************************************************************
Read Is: 2.36 **
Write Is: 31.34 *******************************
Read Im: 5.02 *****
Write Im: 32.61 *********************************
Note that the program uses constants (related to the L1 cache line size and cache size) that are specific to the Apple M1.
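For reference, the general shape of such a measurement might look like the sketch below. This is my own illustration, not the gist's program: one thread keeps a cache line in the Shared state by reading it in a loop, while the main thread times writes both to that line and to a line nobody else touches. Alignment, iteration counts, and the lack of thread pinning are simplifying assumptions.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define ITERS 10000000

static _Alignas(64) _Atomic long contended;    /* reader keeps this Shared     */
static _Alignas(64) _Atomic long private_line; /* only the writer touches this */
static atomic_int stop;
static volatile long sink_out;                 /* keeps the reader loop alive  */

static void *reader(void *arg) {
    long sink = 0;
    (void)arg;
    while (!atomic_load_explicit(&stop, memory_order_relaxed))
        sink += atomic_load_explicit(&contended, memory_order_relaxed);
    sink_out = sink;
    return NULL;
}

/* Time ITERS writes to one cache line and return the elapsed nanoseconds. */
static double time_writes(_Atomic long *line) {
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (long i = 0; i < ITERS; i++)
        atomic_store_explicit(line, i, memory_order_relaxed);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    return (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, reader, NULL);

    double write_m = time_writes(&private_line); /* roughly "Write M" */
    double write_s = time_writes(&contended);    /* roughly "Write S" */

    atomic_store(&stop, 1);
    pthread_join(t, NULL);

    printf("ns per write, line kept Modified: %.2f\n", write_m / ITERS);
    printf("ns per write, line kept Shared:   %.2f\n", write_s / ITERS);
    return 0;
}
```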