Skip to content

Instantly share code, notes, and snippets.

@sithumonline
Created December 6, 2025 07:02
Show Gist options
  • Select an option

  • Save sithumonline/93d7b0cf9b013f632de29bd4d019fb88 to your computer and use it in GitHub Desktop.

Select an option

Save sithumonline/93d7b0cf9b013f632de29bd4d019fb88 to your computer and use it in GitHub Desktop.
GPU vs CPU: When Metal Obliterates Go in Parallel Workloads
# Compile the Metal shader
# Step 1: compile Shaders.metal into the intermediate object Shaders.air.
xcrun -sdk macosx metal -c Shaders.metal -o Shaders.air
# Step 2: package Shaders.air into default.metallib, the library the Swift
# host program loads through device.makeDefaultLibrary().
xcrun -sdk macosx metallib Shaders.air -o default.metallib
# Compile and run Swift
# Links against the Metal and Foundation frameworks and produces ./metal_fib.
swiftc main.swift -o metal_fib -framework Metal -framework Foundation
./metal_fib
# Build the Go (CPU) benchmark binary. The -s -w linker flags omit the symbol
# table and DWARF debug information from the output binary.
go build -ldflags="-s -w" -o mprocessing-go main.go
package main
import (
"log"
"sync"
"time"
)
// simulateWork returns the n-th Fibonacci number, computed iteratively.
// Any n below 2 (including negative values) is returned unchanged.
func simulateWork(n int) int {
	if n < 2 {
		return n
	}
	prev, curr := 0, 1
	// n-1 advances take the pair from (fib(0), fib(1)) to (fib(n-1), fib(n)).
	for remaining := n - 1; remaining > 0; remaining-- {
		prev, curr = curr, prev+curr
	}
	return curr
}
// worker receives jobs from its own jobs channel until that channel is
// closed. For every job it computes simulateWork(45), logs the start and
// finish, and sends the computed value on results. wg.Done is called when
// the worker returns.
func worker(id int, jobs <-chan int, results chan<- int, wg *sync.WaitGroup) {
	defer wg.Done()
	log.Printf("Worker %d: starting\n", id)
	for {
		job, open := <-jobs
		if !open {
			// The jobs channel was closed and fully drained.
			return
		}
		log.Printf("Worker %d: started job %d\n", id, job)
		// The job value only identifies the job; the workload is always fib(45).
		fib := simulateWork(45)
		log.Printf("Worker %d: finished job %d with result %d\n", id, job, fib)
		results <- fib
	}
}
// main runs the CPU side of the benchmark: it starts numWorkers goroutines,
// deals numJobs jobs to them round-robin over per-worker channels, waits for
// all workers to finish, drains the results, and logs the elapsed time.
func main() {
	start := time.Now() // Start timer

	const numJobs = 1_000_000 // 5 * 100
	const numWorkers = 3 * 100
	// Jobs are dealt round-robin, so no worker ever receives more than
	// ceil(numJobs/numWorkers) of them. Sizing each per-worker buffer to that
	// bound still guarantees the dispatch loop never blocks, without
	// allocating numJobs slots for every one of the numWorkers channels
	// (300 x 1,000,000 ints, roughly 2.4 GB on a 64-bit platform).
	const jobsPerWorker = (numJobs + numWorkers - 1) / numWorkers

	// Create a dedicated jobs channel for each worker
	workerJobs := make([]chan int, numWorkers)
	for i := range workerJobs {
		workerJobs[i] = make(chan int, jobsPerWorker)
	}

	// results must be able to hold every result: it is only drained after
	// wg.Wait(), so a smaller buffer would leave workers blocked on send.
	results := make(chan int, numJobs)

	var wg sync.WaitGroup

	// Start the worker goroutines
	for w := 1; w <= numWorkers; w++ {
		wg.Add(1)
		go worker(w, workerJobs[w-1], results, &wg)
	}
	log.Println("Main: All workers started")

	// Send jobs to specific workers
	for j := 1; j <= numJobs; j++ {
		workerID := (j - 1) % numWorkers // Assign jobs in a round-robin manner
		workerJobs[workerID] <- j
		log.Printf("Main: sent job %d to worker %d\n", j, workerID+1)
	}

	// Close all worker job channels so each worker's receive loop terminates
	for _, ch := range workerJobs {
		close(ch)
	}

	// Wait for all workers to finish, then close results
	wg.Wait()
	close(results)

	// Collect and print results
	for res := range results {
		log.Printf("Result received: %d\n", res)
	}

	elapsed := time.Since(start) // End timer
	log.Printf("Execution time: %s\n", elapsed)
}
import Metal
import Foundation
/// Runs the GPU side of the benchmark: dispatches `numJobs` threads of the
/// `fibonacci_kernel` compute function, waits for the GPU to finish, prints
/// the first few results and the total elapsed wall-clock time.
///
/// Any setup failure (no Metal device, missing library or kernel, buffer or
/// encoder creation failure) or GPU execution error terminates the process
/// via `fatalError`.
func main() {
    let start = CFAbsoluteTimeGetCurrent()

    // Configuration
    let numJobs = 1_000_000 // 500
    let fibNumber: Int32 = 45

    // Get Metal device
    guard let device = MTLCreateSystemDefaultDevice() else {
        fatalError("Metal is not supported on this device")
    }
    print("Using device: \(device.name)")

    // Load the shader library
    guard let library = device.makeDefaultLibrary() else {
        fatalError("Failed to load Metal library")
    }
    guard let function = library.makeFunction(name: "fibonacci_kernel") else {
        fatalError("Failed to find fibonacci_kernel function")
    }

    // Create compute pipeline
    let pipeline: MTLComputePipelineState
    do {
        pipeline = try device.makeComputePipelineState(function: function)
    } catch {
        fatalError("Failed to create pipeline: \(error)")
    }

    // Create command queue
    guard let commandQueue = device.makeCommandQueue() else {
        fatalError("Failed to create command queue")
    }

    // Prepare input data (all jobs compute the same Fibonacci number)
    var jobs = [Int32](repeating: fibNumber, count: numJobs)

    // Create buffers: one Int32 input and one UInt64 output slot per job.
    guard let jobsBuffer = device.makeBuffer(
        bytes: &jobs,
        length: MemoryLayout<Int32>.stride * numJobs,
        options: .storageModeShared
    ) else {
        fatalError("Failed to create jobs buffer")
    }
    guard let resultsBuffer = device.makeBuffer(
        length: MemoryLayout<UInt64>.stride * numJobs,
        options: .storageModeShared
    ) else {
        fatalError("Failed to create results buffer")
    }

    // Create command buffer and encoder
    guard let commandBuffer = commandQueue.makeCommandBuffer(),
          let encoder = commandBuffer.makeComputeCommandEncoder() else {
        fatalError("Failed to create command buffer/encoder")
    }
    encoder.setComputePipelineState(pipeline)
    // Buffer indices match the kernel's [[buffer(0)]] / [[buffer(1)]] arguments.
    encoder.setBuffer(jobsBuffer, offset: 0, index: 0)
    encoder.setBuffer(resultsBuffer, offset: 0, index: 1)

    // Configure thread groups: one thread per job, grouped as widely as the
    // pipeline allows.
    let threadsPerGrid = MTLSize(width: numJobs, height: 1, depth: 1)
    let maxThreadsPerGroup = pipeline.maxTotalThreadsPerThreadgroup
    print("Max threads per group: \(maxThreadsPerGroup)")
    let threadsPerGroup = MTLSize(width: min(numJobs, maxThreadsPerGroup), height: 1, depth: 1)
    encoder.dispatchThreads(threadsPerGrid, threadsPerThreadgroup: threadsPerGroup)
    encoder.endEncoding()

    // Execute and wait
    commandBuffer.commit()
    commandBuffer.waitUntilCompleted()

    // A failed command buffer leaves the results buffer without valid output;
    // stop here rather than reporting its contents as results.
    if let gpuError = commandBuffer.error {
        fatalError("GPU execution failed: \(gpuError)")
    }

    // Read results
    let resultsPointer = resultsBuffer.contents().bindMemory(to: UInt64.self, capacity: numJobs)
    let results = Array(UnsafeBufferPointer(start: resultsPointer, count: numJobs))

    // Print some results
    print("First 5 results:")
    for i in 0..<min(5, numJobs) {
        print(" Job \(i): fib(\(fibNumber)) = \(results[i])")
    }

    let elapsed = CFAbsoluteTimeGetCurrent() - start
    print("Total jobs: \(numJobs)")
    print("Execution time: \(String(format: "%.4f", elapsed))s")
}

main()
{
"results": [
{
"command": "./metal_fib",
"mean": 0.12367221734,
"stddev": 0.16942545033319764,
"median": 0.06980722564,
"user": 0.00964524,
"system": 0.01644698,
"min": 0.06056543414,
"max": 0.60559114214,
"times": [
0.60559114214,
0.07662976814000001,
0.07422935114000001,
0.06877580914,
0.07083864214,
0.06056543414,
0.06554097514,
0.08020380914000001,
0.06646126714,
0.06788597514
],
"exit_codes": [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
]
},
{
"command": "./mprocessing-go",
"mean": 3.58517456314,
"stddev": 0.11821375404667352,
"median": 3.56436251714,
"user": 3.55224154,
"system": 2.4159574800000003,
"min": 3.46224505914,
"max": 3.81418480914,
"times": [
3.81418480914,
3.46932664214,
3.46224505914,
3.57611510014,
3.72334918414,
3.64247955914,
3.63194905914,
3.55260993414,
3.46886797514,
3.51061830914
],
"exit_codes": [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
]
}
]
}
#include <metal_stdlib>
using namespace metal;
// Returns the n-th Fibonacci number as a 64-bit unsigned value.
// Written as a loop rather than recursively (the original author notes that
// recursion does not work well on the GPU). Any n <= 1 is returned as-is,
// converted to uint64_t.
uint64_t fibonacci(int n) {
    if (n <= 1) return n;
    uint64_t prev = 0;
    uint64_t curr = 1;
    // n-1 advances take the pair from (fib(0), fib(1)) to (fib(n-1), fib(n)).
    for (int remaining = n - 1; remaining > 0; --remaining) {
        const uint64_t next = prev + curr;
        prev = curr;
        curr = next;
    }
    return curr;
}
// Compute kernel: each thread reads the Fibonacci index stored at its grid
// position in `jobs` and writes the corresponding Fibonacci number to the
// same position in `results`.
kernel void fibonacci_kernel(
    device const int* jobs [[buffer(0)]],
    device uint64_t* results [[buffer(1)]],
    uint id [[thread_position_in_grid]]
) {
    const int n = jobs[id];
    results[id] = fibonacci(n);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment