Skip to content

Instantly share code, notes, and snippets.

@patrickdet
Created September 2, 2025 09:53
Show Gist options
  • Save patrickdet/45ea8051163d48f755cc1a6d8cb10240 to your computer and use it in GitHub Desktop.
#!/usr/bin/env elixir
# Clean, focused benchmark for comparing main vs tokio branch
#
# Dependencies are installed ad hoc so the script is self-contained:
#   - wasmex:  the library under test, taken from the current checkout
#   - benchee: micro-benchmarking harness used for Tests 1 and 2
Mix.install([
  # local checkout — run this script from the wasmex repository root
  {:wasmex, path: "."},
  {:benchee, "~> 1.3"}
])
# WebAssembly text-format (WAT) module for the CPU-bound tests (Tests 1-3):
# exports a no-op function and a trivial i32 add (~3 WASM instructions),
# so measurements reflect call overhead rather than guest computation.
wat_compute = """
(module
(func $nop (export "nop"))
(func $add (export "add") (param $a i32) (param $b i32) (result i32)
local.get $a
local.get $b
i32.add
)
)
"""
# WAT module for the I/O-bound test (Test 4): imports a host-provided
# `sleep_ms` function (wired up to Process.sleep/1 later in this script) so
# guest calls block on the host, exposing async-vs-thread-blocking behavior.
wat_io = """
(module
(import "env" "sleep_ms" (func $sleep (param i32)))
(func $io_operation (export "io_operation") (param $delay i32)
local.get $delay
call $sleep
)
(func $mixed_operation (export "mixed_operation") (param $delay i32) (result i32)
;; Do some computation
i32.const 42
i32.const 58
i32.add
drop ;; Drop the computation result
;; Then sleep
local.get $delay
call $sleep
;; Return result
i32.const 100
)
)
"""
# Print the banner, the current git branch (so results can be attributed to a
# branch when diffing runs), and an overview of the methodology.
IO.puts("\nWASMEX PERFORMANCE BENCHMARK")
IO.puts("=" <> String.duplicate("=", 60))
# The {_, 0} match asserts the git command exited successfully.
{branch, 0} = System.cmd("git", ["branch", "--show-current"])
IO.puts("Branch: #{String.trim(branch)}")
IO.puts("")
IO.puts("METHODOLOGY")
IO.puts("-" <> String.duplicate("-", 60))
# BUGFIX: the original said "three scenarios" but four are listed below.
IO.puts("This benchmark measures WebAssembly execution performance in four scenarios:")
IO.puts("")
IO.puts("1. CONCURRENT CALLS: Multiple async tasks calling a single instance")
IO.puts(" - Tests how well the runtime handles concurrent access to one instance")
IO.puts(" - Measures coordination overhead and potential lock contention")
IO.puts("")
IO.puts("2. PARALLEL INSTANCES: Multiple pre-created instances executing in parallel")
IO.puts(" - Tests how well the runtime scales across multiple instances")
IO.puts(" - Each instance receives one call per round")
IO.puts("")
IO.puts("3. SUSTAINED THROUGHPUT: Continuous execution over fixed time period")
IO.puts(" - Tests maximum sustained operations per second")
IO.puts(" - Removes coordination overhead to show raw performance")
IO.puts("")
IO.puts("4. I/O-BOUND WORKLOAD: Operations with host-provided sleep")
IO.puts(" - Tests how runtime handles blocking/waiting operations")
IO.puts(" - Reveals async (Tokio) vs thread-blocking (OS threads) behavior")
IO.puts(" - Shows scaling limits when operations wait for I/O")
IO.puts("")
IO.puts("WASM FUNCTIONS:")
IO.puts("- Tests 1-3: Simple integer addition (CPU-bound, ~3 WASM instructions)")
IO.puts("- Test 4: Host sleep import (I/O-bound, tests async handling)")
IO.puts("")
# Test 1: a single Wasmex instance serving many concurrent callers.
IO.puts("TEST 1: Single Instance Concurrent Calls")
IO.puts("-" <> String.duplicate("-", 40))
{:ok, pid} = Wasmex.start_link(%{bytes: wat_compute})
# Warm the instance before any measurement.
Enum.each(1..100, fn _ -> Wasmex.call_function(pid, :add, [1, 2]) end)

# Fire `n` async calls at the shared instance and wait for all of them.
# 5_000 ms is Task.await_many/2's default timeout, passed explicitly here.
run_concurrent = fn n, timeout ->
  1..n
  |> Enum.map(fn _ ->
    Task.async(fn -> Wasmex.call_function(pid, :add, [1, 2]) end)
  end)
  |> Task.await_many(timeout)
end

suite1 =
  Benchee.run(
    %{
      "1 call" => fn -> Wasmex.call_function(pid, :add, [1, 2]) end,
      "10 concurrent" => fn -> run_concurrent.(10, 5_000) end,
      "100 concurrent" => fn -> run_concurrent.(100, 5_000) end,
      "1000 concurrent" => fn -> run_concurrent.(1000, 5_000) end,
      # 10k tasks can legitimately take a while — allow a 30s await.
      "10000 concurrent" => fn -> run_concurrent.(10000, 30_000) end
    },
    time: 3,
    warmup: 1,
    formatters: [{Benchee.Formatters.Console, comparison: false}],
    print: [benchmarking: false, configuration: false]
  )
# Calculate and display ops/sec for each Test 1 scenario.
#
# BUGFIX: Benchee (>= 1.0) reports run-time statistics in NANOSECONDS. The
# original divided the average by 1_000_000 (treating it as microseconds),
# which inflated every reported ops/sec figure by 1000x. Convert from ns.
IO.puts("\nOperations per second:")
for scenario <- suite1.scenarios do
  # Number of Wasmex calls performed per benchmark iteration; the case is
  # intentionally exhaustive so an unknown scenario name crashes loudly.
  count =
    case scenario.name do
      "1 call" -> 1
      "10 concurrent" -> 10
      "100 concurrent" -> 100
      "1000 concurrent" -> 1000
      "10000 concurrent" -> 10000
    end

  avg_time_ns = scenario.run_time_data.statistics.average
  ops_per_sec = count / (avg_time_ns / 1_000_000_000)
  name = String.pad_trailing(scenario.name, 20)
  IO.puts(" #{name} #{round(ops_per_sec)} ops/sec")
end

GenServer.stop(pid)
# Test 2: Multiple pre-created instances, one parallel call each per round.
IO.puts("\nTEST 2: Multiple Instance Parallel Execution")
IO.puts("-" <> String.duplicate("-", 40))
# Pre-create instances; elem/2 unwraps the {:ok, pid} tuple from start_link.
instances_10 = for _ <- 1..10, do: elem(Wasmex.start_link(%{bytes: wat_compute}), 1)
instances_100 = for _ <- 1..100, do: elem(Wasmex.start_link(%{bytes: wat_compute}), 1)
# BUGFIX: the original warmed only instances_10, so the 100-instance scenario
# paid first-call costs inside the measured region. Warm both pools.
for pid <- instances_10, do: Wasmex.call_function(pid, :add, [1, 2])
for pid <- instances_100, do: Wasmex.call_function(pid, :add, [1, 2])
suite2 = Benchee.run(
%{
"10 instances" => fn ->
tasks = for pid <- instances_10 do
Task.async(fn -> Wasmex.call_function(pid, :add, [1, 2]) end)
end
Task.await_many(tasks)
end,
"100 instances" => fn ->
tasks = for pid <- instances_100 do
Task.async(fn -> Wasmex.call_function(pid, :add, [1, 2]) end)
end
Task.await_many(tasks)
end
},
time: 3,
warmup: 1,
formatters: [{Benchee.Formatters.Console, comparison: false}],
print: [benchmarking: false, configuration: false]
)
# Calculate and display ops/sec for each Test 2 scenario.
#
# BUGFIX: Benchee (>= 1.0) reports run-time statistics in NANOSECONDS; the
# original divided by 1_000_000 as if the average were microseconds,
# overstating throughput by 1000x. Convert from ns.
IO.puts("\nOperations per second:")
for scenario <- suite2.scenarios do
  # One call per instance per iteration.
  count =
    case scenario.name do
      "10 instances" -> 10
      "100 instances" -> 100
    end

  avg_time_ns = scenario.run_time_data.statistics.average
  ops_per_sec = count / (avg_time_ns / 1_000_000_000)
  name = String.pad_trailing(scenario.name, 20)
  IO.puts(" #{name} #{round(ops_per_sec)} ops/sec total")
end

# Cleanup
for pid <- instances_10, do: GenServer.stop(pid)
for pid <- instances_100, do: GenServer.stop(pid)
# Test 3: sustained throughput — each instance is hammered continuously for
# 3 seconds and we report aggregate (and per-instance) ops/sec.
IO.puts("\nTEST 3: Sustained Throughput (3 seconds)")
IO.puts("-" <> String.duplicate("-", 40))
Enum.each([1, 10, 50, 100], fn instance_count ->
  pids =
    Enum.map(1..instance_count, fn _ ->
      elem(Wasmex.start_link(%{bytes: wat_compute}), 1)
    end)

  # All workers share one wall-clock deadline computed up front.
  started_at = System.monotonic_time(:millisecond)
  deadline = started_at + 3000

  # One task per instance, each counting its completed calls until the
  # deadline passes.
  workers =
    Enum.map(pids, fn pid ->
      Task.async(fn ->
        Stream.repeatedly(fn -> :tick end)
        |> Enum.reduce_while(0, fn _, done ->
          if System.monotonic_time(:millisecond) < deadline do
            # Assertive match: any call failure should crash the worker.
            {:ok, _} = Wasmex.call_function(pid, :add, [1, 2])
            {:cont, done + 1}
          else
            {:halt, done}
          end
        end)
      end)
    end)

  call_counts = Task.await_many(workers, 10_000)
  total_ops = Enum.sum(call_counts)
  elapsed = System.monotonic_time(:millisecond) - started_at
  total_ops_per_sec = total_ops * 1000 / elapsed
  per_instance_ops = total_ops_per_sec / instance_count

  instance_str = String.pad_trailing("#{instance_count} instance(s)", 15)
  total_str = String.pad_leading("#{round(total_ops_per_sec)}", 8)

  per_str =
    if instance_count > 1 do
      " (#{round(per_instance_ops)} per instance)"
    else
      ""
    end

  IO.puts(" #{instance_str} #{total_str} ops/sec#{per_str}")

  # Cleanup
  Enum.each(pids, &GenServer.stop/1)
end)
# Test 4: I/O-bound workload — each guest call sleeps 10ms in the host, so a
# perfectly async runtime finishes N concurrent calls in ~10ms wall time.
IO.puts("\nTEST 4: I/O-Bound Workload (with 10ms sleep)")
IO.puts("-" <> String.duplicate("-", 40))
# Host import backing the WAT `env.sleep_ms` function: blocks the calling
# BEAM process for the requested number of milliseconds, returns no values.
imports = %{
env: %{
sleep_ms: {:fn, [:i32], [], fn _context, milliseconds ->
Process.sleep(milliseconds)
{:ok, []}
end}
}
}
IO.puts("Testing concurrent sleeping operations...")
# Test with different concurrency levels
for concurrent_count <- [10, 50, 100, 200] do
# Create instance with imports
{:ok, pid} = Wasmex.start_link(%{bytes: wat_io, imports: imports})
start_time = System.monotonic_time(:millisecond)
# Start concurrent sleeping operations
tasks = for _ <- 1..concurrent_count do
Task.async(fn ->
# Each operation sleeps for 10ms
Wasmex.call_function(pid, :io_operation, [10])
end)
end
# Wait for all to complete
Task.await_many(tasks, 30_000)
elapsed = System.monotonic_time(:millisecond) - start_time
ops_per_sec = concurrent_count * 1000 / elapsed
concurrent_str = String.pad_trailing("#{concurrent_count} concurrent", 15)
time_str = String.pad_leading("#{elapsed}ms", 8)
ops_str = String.pad_leading("#{round(ops_per_sec)}", 6)
# With perfect async handling, any number of concurrent 10ms sleeps should
# complete in ~10ms wall time, so the ideal elapsed time is a constant.
# BUGFIX: the original wrote `if concurrent_count <= 10, do: 10, else: 10`
# — a dead conditional whose branches were identical.
expected_time = 10
blocking_factor = elapsed / expected_time
IO.puts(" #{concurrent_str} #{time_str} (#{ops_str} ops/sec, #{Float.round(blocking_factor, 1)}x expected)")
GenServer.stop(pid)
end
# Closing notes: how to read the blocking factor, and which metrics are the
# most meaningful to diff between the main and tokio branches.
Enum.each(
  [
    "",
    "Note: Lower blocking factor = better async handling",
    " 1x = perfect async, higher = thread blocking",
    "\nSUMMARY",
    "-" <> String.duplicate("-", 40),
    "Key metrics to compare between branches:",
    "- Single instance handling 1000 concurrent calls",
    "- Sustained throughput with 100 instances",
    "- I/O-bound: 200 concurrent sleeping operations",
    "- Scaling efficiency from 1 to 100 instances"
  ],
  &IO.puts/1
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment