sithumonline · December 6, 2025 07:02
diff --git a/how_to_compile.sh b/how_to_compile.sh
 # Compile the Metal shader:
 #   1. Shaders.metal -> Shaders.air  (compile only, -c)
 #   2. Shaders.air   -> default.metallib  (library loaded by the Swift program)
 xcrun -sdk macosx metal -c Shaders.metal -o Shaders.air
 xcrun -sdk macosx metallib Shaders.air -o default.metallib

 # Compile and run Swift
 # The binary is named metal_fib and links the Metal and Foundation frameworks.
 swiftc main.swift -o metal_fib -framework Metal -framework Foundation
 ./metal_fib

 # Build the Go program as mprocessing-go.
 # -ldflags="-s -w" asks the linker to omit the symbol table and DWARF debug info.
 go build -ldflags="-s -w" -o mprocessing-go main.go
diff --git a/main.go b/main.go
 package main

 import (
 	"log"
 	"sync"
 	"time"
 )

 // simulateWork returns the n-th Fibonacci number, computed iteratively.
 // Any n that is 1 or smaller (negative values included) is returned unchanged.
 func simulateWork(n int) int {
 	if n < 2 {
 		return n
 	}
 	prev, curr := 0, 1
 	// n-1 steps advance the pair (F(0), F(1)) to (F(n-1), F(n)).
 	for remaining := n - 1; remaining > 0; remaining-- {
 		prev, curr = curr, prev+curr
 	}
 	return curr
 }

 // worker drains its dedicated jobs channel until that channel is closed.
 // For every job it receives it logs the start, computes simulateWork(45),
 // logs the outcome, and sends the computed value on results. wg.Done is
 // called when the worker returns.
 func worker(id int, jobs <-chan int, results chan<- int, wg *sync.WaitGroup) {
 	defer wg.Done()
 	log.Printf("Worker %d: starting\n", id)
 	for {
 		job, open := <-jobs
 		if !open {
 			// Channel closed and fully drained: nothing left to do.
 			return
 		}
 		log.Printf("Worker %d: started job %d\n", id, job)
 		// The job value is only used for logging; every job computes the
 		// 45th Fibonacci number.
 		fib := simulateWork(45)
 		log.Printf("Worker %d: finished job %d with result %d\n", id, job, fib)
 		results <- fib
 	}
 }

 // main runs the benchmark: it starts numWorkers worker goroutines,
 // distributes numJobs jobs round-robin over per-worker channels, waits for
 // every worker to finish, drains the results, and logs the total wall-clock
 // time.
 func main() {
 	start := time.Now() // Start timer

 	const numJobs = 1_000_000 // 5 * 100
 	const numWorkers = 3 * 100

 	// Round-robin dispatch hands each worker at most ceil(numJobs/numWorkers)
 	// jobs, so that is all the buffering a worker channel needs for sends to
 	// never block. Sizing every channel for numJobs would reserve
 	// numJobs*numWorkers buffer slots in total.
 	const jobsPerWorker = (numJobs + numWorkers - 1) / numWorkers

 	// Create a dedicated jobs channel for each worker
 	workerJobs := make([]chan int, numWorkers)
 	for i := range workerJobs {
 		workerJobs[i] = make(chan int, jobsPerWorker)
 	}

 	// results must be able to hold every result: it is only drained after
 	// wg.Wait() below, so a smaller buffer would block the workers forever.
 	results := make(chan int, numJobs)

 	var wg sync.WaitGroup

 	// Start the worker goroutines
 	for w := 1; w <= numWorkers; w++ {
 		wg.Add(1)
 		go worker(w, workerJobs[w-1], results, &wg)
 	}

 	log.Println("Main: All workers started")

 	// Send jobs to specific workers
 	for j := 1; j <= numJobs; j++ {
 		workerID := (j - 1) % numWorkers // Assign jobs in a round-robin manner
 		workerJobs[workerID] <- j
 		log.Printf("Main: sent job %d to worker %d\n", j, workerID+1)
 	}

 	// Close all worker job channels so each worker's receive loop terminates
 	for _, ch := range workerJobs {
 		close(ch)
 	}

 	// Wait for all workers to finish, then close results
 	wg.Wait()
 	close(results)

 	// Collect and print results
 	for res := range results {
 		log.Printf("Result received: %d\n", res)
 	}

 	elapsed := time.Since(start) // End timer
 	log.Printf("Execution time: %s\n", elapsed)
 }
diff --git a/main.swift b/main.swift
 import Metal
 import Foundation

 /// Runs the GPU benchmark: sets up Metal, dispatches one `fibonacci_kernel`
 /// thread per job, waits for the GPU to finish, then prints the first few
 /// results and the total elapsed time.
 ///
 /// Any setup or execution failure aborts the process via `fatalError`.
 func main() {
    let start = CFAbsoluteTimeGetCurrent()

    // Configuration
    let numJobs = 1_000_000 // 500
    let fibNumber: Int32 = 45

    // Get Metal device
    guard let device = MTLCreateSystemDefaultDevice() else {
        fatalError("Metal is not supported on this device")
    }
    print("Using device: \(device.name)")

    // Load the shader library
    guard let library = device.makeDefaultLibrary() else {
        fatalError("Failed to load Metal library")
    }

    guard let function = library.makeFunction(name: "fibonacci_kernel") else {
        fatalError("Failed to find fibonacci_kernel function")
    }

    // Create compute pipeline
    let pipeline: MTLComputePipelineState
    do {
        pipeline = try device.makeComputePipelineState(function: function)
    } catch {
        fatalError("Failed to create pipeline: \(error)")
    }

    // Create command queue
    guard let commandQueue = device.makeCommandQueue() else {
        fatalError("Failed to create command queue")
    }

    // Prepare input data (all jobs compute the same Fibonacci number)
    var jobs = [Int32](repeating: fibNumber, count: numJobs)

    // Create buffers: one Int32 per job as input, one UInt64 per job as output
    guard let jobsBuffer = device.makeBuffer(
        bytes: &jobs,
        length: MemoryLayout<Int32>.stride * numJobs,
        options: .storageModeShared
    ) else {
        fatalError("Failed to create jobs buffer")
    }

    guard let resultsBuffer = device.makeBuffer(
        length: MemoryLayout<UInt64>.stride * numJobs,
        options: .storageModeShared
    ) else {
        fatalError("Failed to create results buffer")
    }

    // Create command buffer and encoder
    guard let commandBuffer = commandQueue.makeCommandBuffer(),
          let encoder = commandBuffer.makeComputeCommandEncoder() else {
        fatalError("Failed to create command buffer/encoder")
    }

    // Buffer indices 0 and 1 are the two buffer arguments bound for the kernel
    encoder.setComputePipelineState(pipeline)
    encoder.setBuffer(jobsBuffer, offset: 0, index: 0)
    encoder.setBuffer(resultsBuffer, offset: 0, index: 1)

    // Configure thread groups: one thread per job, grouped as large as the
    // pipeline allows
    let threadsPerGrid = MTLSize(width: numJobs, height: 1, depth: 1)
    let maxThreadsPerGroup = pipeline.maxTotalThreadsPerThreadgroup
    print("Max threads per group: \(maxThreadsPerGroup)")
    let threadsPerGroup = MTLSize(width: min(numJobs, maxThreadsPerGroup), height: 1, depth: 1)

    encoder.dispatchThreads(threadsPerGrid, threadsPerThreadgroup: threadsPerGroup)
    encoder.endEncoding()

    // Execute and wait
    commandBuffer.commit()
    commandBuffer.waitUntilCompleted()

    // If the GPU reported an error the kernel may not have written its
    // results, so stop instead of printing the buffer as if it were valid.
    if let error = commandBuffer.error {
        fatalError("GPU execution failed: \(error)")
    }

    // Read results back through the buffer's CPU-visible contents pointer
    let resultsPointer = resultsBuffer.contents().bindMemory(to: UInt64.self, capacity: numJobs)
    let results = Array(UnsafeBufferPointer(start: resultsPointer, count: numJobs))

    // Print some results
    print("First 5 results:")
    for i in 0..<min(5, numJobs) {
        print("  Job \(i): fib(\(fibNumber)) = \(results[i])")
    }

    let elapsed = CFAbsoluteTimeGetCurrent() - start
    print("Total jobs: \(numJobs)")
    print("Execution time: \(String(format: "%.4f", elapsed))s")
 }

 main()
diff --git a/results.json b/results.json
 {
  "results": [
    {
      "command": "./metal_fib",
      "mean": 0.12367221734,
      "stddev": 0.16942545033319764,
      "median": 0.06980722564,
      "user": 0.00964524,
      "system": 0.01644698,
      "min": 0.06056543414,
      "max": 0.60559114214,
      "times": [
        0.60559114214,
        0.07662976814000001,
        0.07422935114000001,
        0.06877580914,
        0.07083864214,
        0.06056543414,
        0.06554097514,
        0.08020380914000001,
        0.06646126714,
        0.06788597514
      ],
      "exit_codes": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0
      ]
    },
    {
      "command": "./mprocessing-go",
      "mean": 3.58517456314,
      "stddev": 0.11821375404667352,
      "median": 3.56436251714,
      "user": 3.55224154,
      "system": 2.4159574800000003,
      "min": 3.46224505914,
      "max": 3.81418480914,
      "times": [
        3.81418480914,
        3.46932664214,
        3.46224505914,
        3.57611510014,
        3.72334918414,
        3.64247955914,
        3.63194905914,
        3.55260993414,
        3.46886797514,
        3.51061830914
      ],
      "exit_codes": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0
      ]
    }
  ]
 }
diff --git a/Shaders.metal b/Shaders.metal
 #include <metal_stdlib>
 using namespace metal;

 // Computes the n-th Fibonacci number with a loop (recursion does not work
 // well on the GPU). For n <= 1 the input itself is returned, converted to
 // uint64_t.
 uint64_t fibonacci(int n) {
    if (n < 2) {
        return n;
    }

    uint64_t prev = 0;
    uint64_t curr = 1;
    // n-1 steps advance the pair (F(0), F(1)) to (F(n-1), F(n)).
    for (int remaining = n - 1; remaining > 0; --remaining) {
        const uint64_t next = prev + curr;
        prev = curr;
        curr = next;
    }
    return curr;
 }

 // Compute kernel: each thread reads the job at its grid position from
 // `jobs` (buffer 0) and stores fibonacci(job) at the same index of
 // `results` (buffer 1).
 kernel void fibonacci_kernel(
    device const int* jobs [[buffer(0)]],
    device uint64_t* results [[buffer(1)]],
    uint id [[thread_position_in_grid]]
 ) {
    const int n = jobs[id];
    results[id] = fibonacci(n);
 }
	# Compile the Metal shader:
	#   1. Shaders.metal -> Shaders.air  (compile only, -c)
	#   2. Shaders.air   -> default.metallib  (library loaded by the Swift program)
	xcrun -sdk macosx metal -c Shaders.metal -o Shaders.air
	xcrun -sdk macosx metallib Shaders.air -o default.metallib

	# Compile and run Swift
	# The binary is named metal_fib and links the Metal and Foundation frameworks.
	swiftc main.swift -o metal_fib -framework Metal -framework Foundation
	./metal_fib

	# Build the Go program as mprocessing-go.
	# -ldflags="-s -w" asks the linker to omit the symbol table and DWARF debug info.
	go build -ldflags="-s -w" -o mprocessing-go main.go
	package main

	import (
	"log"
	"sync"
	"time"
	)

	// simulateWork returns the n-th Fibonacci number, computed iteratively.
	// Any n that is 1 or smaller (negative values included) is returned unchanged.
	func simulateWork(n int) int {
		if n < 2 {
			return n
		}
		prev, curr := 0, 1
		// n-1 steps advance the pair (F(0), F(1)) to (F(n-1), F(n)).
		for remaining := n - 1; remaining > 0; remaining-- {
			prev, curr = curr, prev+curr
		}
		return curr
	}

	// worker drains its dedicated jobs channel until that channel is closed.
	// For every job it receives it logs the start, computes simulateWork(45),
	// logs the outcome, and sends the computed value on results. wg.Done is
	// called when the worker returns.
	func worker(id int, jobs <-chan int, results chan<- int, wg *sync.WaitGroup) {
		defer wg.Done()
		log.Printf("Worker %d: starting\n", id)
		for {
			job, open := <-jobs
			if !open {
				// Channel closed and fully drained: nothing left to do.
				return
			}
			log.Printf("Worker %d: started job %d\n", id, job)
			// The job value is only used for logging; every job computes the
			// 45th Fibonacci number.
			fib := simulateWork(45)
			log.Printf("Worker %d: finished job %d with result %d\n", id, job, fib)
			results <- fib
		}
	}

	// main runs the benchmark: it starts numWorkers worker goroutines,
	// distributes numJobs jobs round-robin over per-worker channels, waits for
	// every worker to finish, drains the results, and logs the total wall-clock
	// time.
	func main() {
		start := time.Now() // Start timer

		const numJobs = 1_000_000 // 5 * 100
		const numWorkers = 3 * 100

		// Round-robin dispatch hands each worker at most ceil(numJobs/numWorkers)
		// jobs, so that is all the buffering a worker channel needs for sends to
		// never block. Sizing every channel for numJobs would reserve
		// numJobs*numWorkers buffer slots in total.
		const jobsPerWorker = (numJobs + numWorkers - 1) / numWorkers

		// Create a dedicated jobs channel for each worker
		workerJobs := make([]chan int, numWorkers)
		for i := range workerJobs {
			workerJobs[i] = make(chan int, jobsPerWorker)
		}

		// results must be able to hold every result: it is only drained after
		// wg.Wait() below, so a smaller buffer would block the workers forever.
		results := make(chan int, numJobs)

		var wg sync.WaitGroup

		// Start the worker goroutines
		for w := 1; w <= numWorkers; w++ {
			wg.Add(1)
			go worker(w, workerJobs[w-1], results, &wg)
		}

		log.Println("Main: All workers started")

		// Send jobs to specific workers
		for j := 1; j <= numJobs; j++ {
			workerID := (j - 1) % numWorkers // Assign jobs in a round-robin manner
			workerJobs[workerID] <- j
			log.Printf("Main: sent job %d to worker %d\n", j, workerID+1)
		}

		// Close all worker job channels so each worker's receive loop terminates
		for _, ch := range workerJobs {
			close(ch)
		}

		// Wait for all workers to finish, then close results
		wg.Wait()
		close(results)

		// Collect and print results
		for res := range results {
			log.Printf("Result received: %d\n", res)
		}

		elapsed := time.Since(start) // End timer
		log.Printf("Execution time: %s\n", elapsed)
	}
	import Metal
	import Foundation

	/// Runs the GPU benchmark: sets up Metal, dispatches one `fibonacci_kernel`
	/// thread per job, waits for the GPU to finish, then prints the first few
	/// results and the total elapsed time.
	///
	/// Any setup or execution failure aborts the process via `fatalError`.
	func main() {
	    let start = CFAbsoluteTimeGetCurrent()

	    // Configuration
	    let numJobs = 1_000_000 // 500
	    let fibNumber: Int32 = 45

	    // Get Metal device
	    guard let device = MTLCreateSystemDefaultDevice() else {
	        fatalError("Metal is not supported on this device")
	    }
	    print("Using device: \(device.name)")

	    // Load the shader library
	    guard let library = device.makeDefaultLibrary() else {
	        fatalError("Failed to load Metal library")
	    }

	    guard let function = library.makeFunction(name: "fibonacci_kernel") else {
	        fatalError("Failed to find fibonacci_kernel function")
	    }

	    // Create compute pipeline
	    let pipeline: MTLComputePipelineState
	    do {
	        pipeline = try device.makeComputePipelineState(function: function)
	    } catch {
	        fatalError("Failed to create pipeline: \(error)")
	    }

	    // Create command queue
	    guard let commandQueue = device.makeCommandQueue() else {
	        fatalError("Failed to create command queue")
	    }

	    // Prepare input data (all jobs compute the same Fibonacci number)
	    var jobs = [Int32](repeating: fibNumber, count: numJobs)

	    // Create buffers: one Int32 per job as input, one UInt64 per job as output
	    guard let jobsBuffer = device.makeBuffer(
	        bytes: &jobs,
	        length: MemoryLayout<Int32>.stride * numJobs,
	        options: .storageModeShared
	    ) else {
	        fatalError("Failed to create jobs buffer")
	    }

	    guard let resultsBuffer = device.makeBuffer(
	        length: MemoryLayout<UInt64>.stride * numJobs,
	        options: .storageModeShared
	    ) else {
	        fatalError("Failed to create results buffer")
	    }

	    // Create command buffer and encoder
	    guard let commandBuffer = commandQueue.makeCommandBuffer(),
	          let encoder = commandBuffer.makeComputeCommandEncoder() else {
	        fatalError("Failed to create command buffer/encoder")
	    }

	    // Buffer indices 0 and 1 are the two buffer arguments bound for the kernel
	    encoder.setComputePipelineState(pipeline)
	    encoder.setBuffer(jobsBuffer, offset: 0, index: 0)
	    encoder.setBuffer(resultsBuffer, offset: 0, index: 1)

	    // Configure thread groups: one thread per job, grouped as large as the
	    // pipeline allows
	    let threadsPerGrid = MTLSize(width: numJobs, height: 1, depth: 1)
	    let maxThreadsPerGroup = pipeline.maxTotalThreadsPerThreadgroup
	    print("Max threads per group: \(maxThreadsPerGroup)")
	    let threadsPerGroup = MTLSize(width: min(numJobs, maxThreadsPerGroup), height: 1, depth: 1)

	    encoder.dispatchThreads(threadsPerGrid, threadsPerThreadgroup: threadsPerGroup)
	    encoder.endEncoding()

	    // Execute and wait
	    commandBuffer.commit()
	    commandBuffer.waitUntilCompleted()

	    // If the GPU reported an error the kernel may not have written its
	    // results, so stop instead of printing the buffer as if it were valid.
	    if let error = commandBuffer.error {
	        fatalError("GPU execution failed: \(error)")
	    }

	    // Read results back through the buffer's CPU-visible contents pointer
	    let resultsPointer = resultsBuffer.contents().bindMemory(to: UInt64.self, capacity: numJobs)
	    let results = Array(UnsafeBufferPointer(start: resultsPointer, count: numJobs))

	    // Print some results
	    print("First 5 results:")
	    for i in 0..<min(5, numJobs) {
	        print(" Job \(i): fib(\(fibNumber)) = \(results[i])")
	    }

	    let elapsed = CFAbsoluteTimeGetCurrent() - start
	    print("Total jobs: \(numJobs)")
	    print("Execution time: \(String(format: "%.4f", elapsed))s")
	}

	main()
	{
	"results": [
	{
	"command": "./metal_fib",
	"mean": 0.12367221734,
	"stddev": 0.16942545033319764,
	"median": 0.06980722564,
	"user": 0.00964524,
	"system": 0.01644698,
	"min": 0.06056543414,
	"max": 0.60559114214,
	"times": [
	0.60559114214,
	0.07662976814000001,
	0.07422935114000001,
	0.06877580914,
	0.07083864214,
	0.06056543414,
	0.06554097514,
	0.08020380914000001,
	0.06646126714,
	0.06788597514
	],
	"exit_codes": [
	0,
	0,
	0,
	0,
	0,
	0,
	0,
	0,
	0,
	0
	]
	},
	{
	"command": "./mprocessing-go",
	"mean": 3.58517456314,
	"stddev": 0.11821375404667352,
	"median": 3.56436251714,
	"user": 3.55224154,
	"system": 2.4159574800000003,
	"min": 3.46224505914,
	"max": 3.81418480914,
	"times": [
	3.81418480914,
	3.46932664214,
	3.46224505914,
	3.57611510014,
	3.72334918414,
	3.64247955914,
	3.63194905914,
	3.55260993414,
	3.46886797514,
	3.51061830914
	],
	"exit_codes": [
	0,
	0,
	0,
	0,
	0,
	0,
	0,
	0,
	0,
	0
	]
	}
	]
	}
	#include <metal_stdlib>
	using namespace metal;

	// Computes the n-th Fibonacci number with a loop (recursion does not work
	// well on the GPU). For n <= 1 the input itself is returned, converted to
	// uint64_t.
	uint64_t fibonacci(int n) {
	    if (n < 2) {
	        return n;
	    }

	    uint64_t prev = 0;
	    uint64_t curr = 1;
	    // n-1 steps advance the pair (F(0), F(1)) to (F(n-1), F(n)).
	    for (int remaining = n - 1; remaining > 0; --remaining) {
	        const uint64_t next = prev + curr;
	        prev = curr;
	        curr = next;
	    }
	    return curr;
	}

	// Compute kernel: each thread reads the job at its grid position from
	// `jobs` (buffer 0) and stores fibonacci(job) at the same index of
	// `results` (buffer 1).
	kernel void fibonacci_kernel(
	    device const int* jobs [[buffer(0)]],
	    device uint64_t* results [[buffer(1)]],
	    uint id [[thread_position_in_grid]]
	) {
	    const int n = jobs[id];
	    results[id] = fibonacci(n);
	}