@AmosLewis
Created January 30, 2025 18:24
hal.executable public @prefill_bs1$async_dispatch_0 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) {
    hal.executable.export public @prefill_bs1$async_dispatch_0_elementwise_broadcast_Dx4096_i64xbf16 ordinal(0) layout(#hal.pipeline.layout<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device, %arg1: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @prefill_bs1$async_dispatch_0_elementwise_broadcast_Dx4096_i64xbf16() {
        %c67108864 = arith.constant 67108864 : index
        %c0 = arith.constant 0 : index
        // Push constant 0 carries the dynamic token count D, asserted below to be a multiple of 32 in [32, 131040].
        %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
        %1 = arith.index_castui %0 : i32 to index
        %2 = util.assume.int %1<umin = 32, umax = 131040, udiv = 32> : index
        // Binding 0: the 128256x4096 bf16 table (read-only).
        %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128256x4096xbf16>>
        %4 = flow.dispatch.workload.ordinal %2, 0 : index
        // Binding 1: D i64 indices; binding 2: the Dx4096 bf16 result, written at byte offset 67108864.
        %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?xi64>>{%4}
        %6 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c67108864) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x4096xbf16>>{%4}
        %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128256, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128256x4096xbf16>> -> tensor<128256x4096xbf16>
        %8 = flow.dispatch.tensor.load %5, offsets = [0], sizes = [%4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xi64>>{%4} -> tensor<?xi64>
        %9 = tensor.empty(%4) : tensor<?x4096xbf16>
        // Gather: for each i64 index in %8, copy the matching 4096-wide bf16 row of %7 into the output.
        %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?xi64>) outs(%9 : tensor<?x4096xbf16>) {
        ^bb0(%in: i64, %out: bf16):
          %11 = arith.index_cast %in : i64 to index
          %12 = linalg.index 1 : index
          %extracted = tensor.extract %7[%11, %12] : tensor<128256x4096xbf16>
          linalg.yield %extracted : bf16
        } -> tensor<?x4096xbf16>
        flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%4, 4096], strides = [1, 1] : tensor<?x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xbf16>>{%4}
        return
      }
    }
  }
}
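
Functionally, this dispatch is a row gather (an embedding-table lookup at the start of prefill): binding 0 holds a 128256x4096 bf16 table, binding 1 holds D i64 indices whose dynamic count arrives through push constant 0, and binding 2 receives the D x 4096 bf16 rows selected by those indices. Below is a minimal NumPy sketch of the equivalent computation; the function name, the float32 stand-in for bf16, and the tiny shapes in the usage example are illustrative and not taken from the gist.

# NumPy sketch of what the dispatch computes: out[d, j] = table[ids[d], j].
import numpy as np

def embedding_gather(table: np.ndarray, ids: np.ndarray) -> np.ndarray:
    """table: (128256, 4096) bf16 in the IR; ids: (D,) int64 -> (D, 4096)."""
    # Same access pattern as the linalg.generic body: tensor.extract %7[ids[d], j]
    # for every output element (d, j); advanced indexing does the whole gather at once.
    return table[ids, :]

# Stand-in shapes; the real dispatch uses a 128256x4096 bf16 table and a dynamic
# token count D that util.assume.int constrains to a multiple of 32 in [32, 131040].
table = np.arange(12, dtype=np.float32).reshape(6, 2)
ids = np.array([5, 0, 3], dtype=np.int64)
out = embedding_gather(table, ids)
assert out.shape == (3, 2) and out[0, 0] == table[5, 0]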