raphlinus · November 6, 2022 17:04
diff --git a/barrier_test.swift b/barrier_test.swift
 import Foundation
 import Metal

 // This is translated by naga from tile_alloc.wgsl
 // naga --buffer-bounds-check-policy ReadZeroSkipWrite tile_alloc.wgsl tile_alloc.metal
 let metalProgram = """
 // language: metal2.0
 #include <metal_stdlib>
 #include <simd/simd.h>

 using metal::uint;

 struct DefaultConstructible {
    template<typename T>
    operator T() && {
        return T {};
    }
 };
 struct _mslBufferSizes {
    uint size1;
 };

 constexpr constant unsigned WG_SIZE = 256u;
 typedef uint type_2[1];

 struct main_Input {
 };
 kernel void main_(
  metal::uint3 global_id [[thread_position_in_grid]]
 , metal::uint3 local_id [[thread_position_in_threadgroup]]
 , device metal::atomic_uint& bump [[user(fake0)]]
 , device type_2& paths [[user(fake0)]]
 , threadgroup uint& sh_tile_offset
 , constant _mslBufferSizes& _buffer_sizes [[user(fake0)]]
 ) {
    uint drawobj_ix = global_id.x;
    uint tile_count_in = local_id.x + 1u;
    if (local_id.x == (WG_SIZE - 1u)) {
        uint _e15 = metal::atomic_fetch_add_explicit(&bump, tile_count_in, metal::memory_order_relaxed);
        sh_tile_offset = 1u + _e15;
    }
    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    uint tile_offset = sh_tile_offset;
    if (drawobj_ix < 3u) {
        if (uint(drawobj_ix) < 1 + (_buffer_sizes.size1 - 0 - 4) / 4) {
            paths[drawobj_ix] = tile_offset;
        }
        return;
    } else {
        return;
    }
 }
 """

 func barrierTest() {
    do {
        let device = MTLCreateSystemDefaultDevice()!
        let queue = device.makeCommandQueue()!
        let cmdbuf = queue.makeCommandBuffer()!
        let library = try device.makeLibrary(source: metalProgram, options:
                                            nil)
        let function = library.makeFunction(name: "main_")!
        let pipeline = try device.makeComputePipelineState(function: function)
        let sharedBuf = MTLResourceOptions.storageModeShared
        let buf0 = device.makeBuffer(length: 1024, options: sharedBuf)!
        let buf1 = device.makeBuffer(length: 1024, options: sharedBuf)!
        let encoder = cmdbuf.makeComputeCommandEncoder()!
        encoder.setComputePipelineState(pipeline)
        encoder.setBuffer(buf0, offset: 0, index: 0)
        encoder.setBuffer(buf1, offset: 0, index: 1)
        let sizes: [UInt32] = [1024];
        encoder.setBytes(sizes, length: 4, index: 2)
        encoder.setThreadgroupMemoryLength(16, index: 0)
        encoder.dispatchThreadgroups(MTLSizeMake(1, 1, 1), threadsPerThreadgroup: MTLSizeMake(256, 1, 1))
        encoder.endEncoding()
        cmdbuf.commit()
        cmdbuf.waitUntilCompleted()
        let contents = buf1.contents()
        for i in 0..<4 {
            print(i, contents.load(fromByteOffset: i * 4, as: UInt32.self))
        }
    } catch {
        print("got an error")
    }
 }

 barrierTest()
	import Foundation
	import Metal

	// This is translated by naga from tile_alloc.wgsl
	// naga --buffer-bounds-check-policy ReadZeroSkipWrite tile_alloc.wgsl tile_alloc.metal
	let metalProgram = """
	// language: metal2.0
	#include <metal_stdlib>
	#include <simd/simd.h>

	using metal::uint;

	struct DefaultConstructible {
	template<typename T>
	operator T() && {
	return T {};
	}
	};
	struct _mslBufferSizes {
	uint size1;
	};

	constexpr constant unsigned WG_SIZE = 256u;
	typedef uint type_2[1];

	struct main_Input {
	};
	kernel void main_(
	metal::uint3 global_id [[thread_position_in_grid]]
	, metal::uint3 local_id [[thread_position_in_threadgroup]]
	, device metal::atomic_uint& bump [[user(fake0)]]
	, device type_2& paths [[user(fake0)]]
	, threadgroup uint& sh_tile_offset
	, constant _mslBufferSizes& _buffer_sizes [[user(fake0)]]
	) {
	uint drawobj_ix = global_id.x;
	uint tile_count_in = local_id.x + 1u;
	if (local_id.x == (WG_SIZE - 1u)) {
	uint _e15 = metal::atomic_fetch_add_explicit(&bump, tile_count_in, metal::memory_order_relaxed);
	sh_tile_offset = 1u + _e15;
	}
	metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
	uint tile_offset = sh_tile_offset;
	if (drawobj_ix < 3u) {
	if (uint(drawobj_ix) < 1 + (_buffer_sizes.size1 - 0 - 4) / 4) {
	paths[drawobj_ix] = tile_offset;
	}
	return;
	} else {
	return;
	}
	}
	"""

	func barrierTest() {
	do {
	let device = MTLCreateSystemDefaultDevice()!
	let queue = device.makeCommandQueue()!
	let cmdbuf = queue.makeCommandBuffer()!
	let library = try device.makeLibrary(source: metalProgram, options:
	nil)
	let function = library.makeFunction(name: "main_")!
	let pipeline = try device.makeComputePipelineState(function: function)
	let sharedBuf = MTLResourceOptions.storageModeShared
	let buf0 = device.makeBuffer(length: 1024, options: sharedBuf)!
	let buf1 = device.makeBuffer(length: 1024, options: sharedBuf)!
	let encoder = cmdbuf.makeComputeCommandEncoder()!
	encoder.setComputePipelineState(pipeline)
	encoder.setBuffer(buf0, offset: 0, index: 0)
	encoder.setBuffer(buf1, offset: 0, index: 1)
	let sizes: [UInt32] = [1024];
	encoder.setBytes(sizes, length: 4, index: 2)
	encoder.setThreadgroupMemoryLength(16, index: 0)
	encoder.dispatchThreadgroups(MTLSizeMake(1, 1, 1), threadsPerThreadgroup: MTLSizeMake(256, 1, 1))
	encoder.endEncoding()
	cmdbuf.commit()
	cmdbuf.waitUntilCompleted()
	let contents = buf1.contents()
	for i in 0..<4 {
	print(i, contents.load(fromByteOffset: i * 4, as: UInt32.self))
	}
	} catch {
	print("got an error")
	}
	}

	barrierTest()