Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Last active February 24, 2025 18:04
Show Gist options
  • Save AmosLewis/964b405594349c1ca1b44b104b8f399f to your computer and use it in GitHub Desktop.
kv16
This file has been truncated, but you can view the full file.
iree-base-compiler 3.3.0rc20250223
iree-base-runtime 3.3.0rc20250223
iree-turbine 3.3.0rc20250223
wget https://sharkpublic.blob.core.windows.net/sharkpublic/chi/llama/fp8_32_kv16.mlir
iree-compile /sharedfile/32/fp8_32_kv16.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/32/fp8_32_kv16.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
failed to translate executables
/sharedfile/32/fp8_32_kv16.mlir:2887:12: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%990 = torch.aten.index_put %979, %989, %988, %false_228 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:2887:12: note: see current operation: "vector.transfer_write"(%59, %42, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:2887:12: error: 'func.func' op failed on workgroup distribution verification
%990 = torch.aten.index_put %979, %989, %988, %false_228 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:2887:12: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_12_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<64> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "arith.extui"(%10) : (i32) -> i64
%19 = "arith.extui"(%11) : (i32) -> i64
%20 = "arith.shli"(%19, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%21 = "arith.ori"(%18, %20) : (i64, i64) -> i64
%22 = "arith.index_castui"(%21) : (i64) -> index
%23 = "arith.extui"(%12) : (i32) -> i64
%24 = "arith.extui"(%13) : (i32) -> i64
%25 = "arith.shli"(%24, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%26 = "arith.ori"(%23, %25) : (i64, i64) -> i64
%27 = "arith.index_castui"(%26) : (i64) -> index
%28 = "arith.index_castui"(%14) : (i32) -> index
%29 = "arith.extui"(%15) : (i32) -> i64
%30 = "arith.extui"(%16) : (i32) -> i64
%31 = "arith.shli"(%30, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%32 = "arith.ori"(%29, %31) : (i64, i64) -> i64
%33 = "arith.index_castui"(%32) : (i64) -> index
%34 = "arith.index_castui"(%17) : (i32) -> index
%35:5 = "util.assume.int"(%22, %27, %28, %33, %34) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index) -> (index, index, index, index, index)
%36 = "flow.dispatch.workload.ordinal"(%35#2) <{ordinal = 1 : index}> : (index) -> index
%37 = "flow.dispatch.workload.ordinal"(%35#3) <{ordinal = 2 : index}> : (index) -> index
%38 = "flow.dispatch.workload.ordinal"(%35#4) <{ordinal = 3 : index}> : (index) -> index
%39 = "hal.interface.binding.subspan"(%35#0, %36) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%39) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%40 = "hal.interface.binding.subspan"(%4, %38) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%40) <{alignment = 64 : i32}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>) -> ()
%41 = "hal.interface.binding.subspan"(%4, %37) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%41) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%42 = "hal.interface.binding.subspan"(%35#1, %38) {alignment = 64 : index, binding = 3 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%42) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%43 = "memref.alloca"(%38) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%44 = "affine.apply"(%38) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %44, %5) ({
^bb0(%arg7: index):
%55 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%56 = "affine.min"(%arg7, %38) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %56, %2) ({
^bb0(%arg8: index):
%57 = "arith.addi"(%arg8, %55) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%58 = "vector.transfer_read"(%40, %4, %57, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%59 = "arith.muli"(%58, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%60 = "arith.trunci"(%59) : (vector<1xi64>) -> vector<1xi32>
%61 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%59, %42, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%60, %43, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%45:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%36) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%46 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%52 = "vector.transfer_read"(%40, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%53 = "arith.muli"(%52, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%54 = "arith.trunci"(%53) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%54, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%47 = "affine.apply"(%arg2, %45#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%48 = "affine.apply"(%arg3, %45#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%49 = "memref.subview"(%41, %arg1, %47, %48, %37) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%50 = "vector.transfer_read"(%39, %4, %arg0, %arg1, %47, %48, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%51 = "arith.extf"(%50) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%51, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %46, %49) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:2887:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%990 = torch.aten.index_put %979, %989, %988, %false_228 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:2887:12: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%62:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%62#0, %62#1, %62#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_12_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_12_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<64> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "arith.extui"(%10) : (i32) -> i64
%19 = "arith.extui"(%11) : (i32) -> i64
%20 = "arith.shli"(%19, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%21 = "arith.ori"(%18, %20) : (i64, i64) -> i64
%22 = "arith.index_castui"(%21) : (i64) -> index
%23 = "arith.extui"(%12) : (i32) -> i64
%24 = "arith.extui"(%13) : (i32) -> i64
%25 = "arith.shli"(%24, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%26 = "arith.ori"(%23, %25) : (i64, i64) -> i64
%27 = "arith.index_castui"(%26) : (i64) -> index
%28 = "arith.index_castui"(%14) : (i32) -> index
%29 = "arith.extui"(%15) : (i32) -> i64
%30 = "arith.extui"(%16) : (i32) -> i64
%31 = "arith.shli"(%30, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%32 = "arith.ori"(%29, %31) : (i64, i64) -> i64
%33 = "arith.index_castui"(%32) : (i64) -> index
%34 = "arith.index_castui"(%17) : (i32) -> index
%35:5 = "util.assume.int"(%22, %27, %28, %33, %34) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index) -> (index, index, index, index, index)
%36 = "flow.dispatch.workload.ordinal"(%35#2) <{ordinal = 1 : index}> : (index) -> index
%37 = "flow.dispatch.workload.ordinal"(%35#3) <{ordinal = 2 : index}> : (index) -> index
%38 = "flow.dispatch.workload.ordinal"(%35#4) <{ordinal = 3 : index}> : (index) -> index
%39 = "hal.interface.binding.subspan"(%35#0, %36) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%39) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%40 = "hal.interface.binding.subspan"(%4, %38) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%40) <{alignment = 64 : i32}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>) -> ()
%41 = "hal.interface.binding.subspan"(%4, %37) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%41) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%42 = "hal.interface.binding.subspan"(%35#1, %38) {alignment = 64 : index, binding = 3 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%42) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%43 = "memref.alloca"(%38) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%44 = "affine.apply"(%38) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %44, %5) ({
^bb0(%arg7: index):
%55 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%56 = "affine.min"(%arg7, %38) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %56, %2) ({
^bb0(%arg8: index):
%57 = "arith.addi"(%arg8, %55) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%58 = "vector.transfer_read"(%40, %4, %57, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%59 = "arith.muli"(%58, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%60 = "arith.trunci"(%59) : (vector<1xi64>) -> vector<1xi32>
%61 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%59, %42, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%60, %43, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%45:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%36) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%46 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%52 = "vector.transfer_read"(%40, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%53 = "arith.muli"(%52, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%54 = "arith.trunci"(%53) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%54, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%47 = "affine.apply"(%arg2, %45#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%48 = "affine.apply"(%arg3, %45#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%49 = "memref.subview"(%41, %arg1, %47, %48, %37) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%50 = "vector.transfer_read"(%39, %4, %arg0, %arg1, %47, %48, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%51 = "arith.extf"(%50) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%51, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %46, %49) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:3763:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%1327 = torch.aten.index_put %1325, %1326, %1321, %false_597 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:3763:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:3763:13: error: 'func.func' op failed on workgroup distribution verification
%1327 = torch.aten.index_put %1325, %1326, %1321, %false_597 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:3763:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_32_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<2> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:3763:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%1327 = torch.aten.index_put %1325, %1326, %1321, %false_597 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:3763:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_32_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_32_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<2> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:4639:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%1664 = torch.aten.index_put %1662, %1663, %1658, %false_970 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:4639:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:4639:13: error: 'func.func' op failed on workgroup distribution verification
%1664 = torch.aten.index_put %1662, %1663, %1658, %false_970 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:4639:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_52_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<4> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:4639:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%1664 = torch.aten.index_put %1662, %1663, %1658, %false_970 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:4639:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_52_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_52_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<4> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:5515:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%2001 = torch.aten.index_put %1999, %2000, %1995, %false_1343 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:5515:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:5515:13: error: 'func.func' op failed on workgroup distribution verification
%2001 = torch.aten.index_put %1999, %2000, %1995, %false_1343 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:5515:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_72_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<6> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:5515:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%2001 = torch.aten.index_put %1999, %2000, %1995, %false_1343 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:5515:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_72_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_72_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<6> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:6391:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%2338 = torch.aten.index_put %2336, %2337, %2332, %false_1716 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:6391:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:6391:13: error: 'func.func' op failed on workgroup distribution verification
%2338 = torch.aten.index_put %2336, %2337, %2332, %false_1716 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:6391:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_92_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<8> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:6391:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%2338 = torch.aten.index_put %2336, %2337, %2332, %false_1716 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:6391:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_92_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_92_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<8> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:7267:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%2675 = torch.aten.index_put %2673, %2674, %2669, %false_2088 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:7267:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:7267:13: error: 'func.func' op failed on workgroup distribution verification
%2675 = torch.aten.index_put %2673, %2674, %2669, %false_2088 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:7267:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_112_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<10> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:7267:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%2675 = torch.aten.index_put %2673, %2674, %2669, %false_2088 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:7267:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_112_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_112_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<10> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:8143:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%3012 = torch.aten.index_put %3010, %3011, %3006, %false_2460 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:8143:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:8143:13: error: 'func.func' op failed on workgroup distribution verification
%3012 = torch.aten.index_put %3010, %3011, %3006, %false_2460 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:8143:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_132_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<12> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:8143:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%3012 = torch.aten.index_put %3010, %3011, %3006, %false_2460 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:8143:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_132_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_132_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<12> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:9019:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%3349 = torch.aten.index_put %3347, %3348, %3343, %false_2832 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:9019:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:9019:13: error: 'func.func' op failed on workgroup distribution verification
%3349 = torch.aten.index_put %3347, %3348, %3343, %false_2832 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:9019:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_152_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<14> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:9019:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%3349 = torch.aten.index_put %3347, %3348, %3343, %false_2832 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:9019:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_152_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_152_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<14> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:9895:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%3686 = torch.aten.index_put %3684, %3685, %3680, %false_3204 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:9895:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:9895:13: error: 'func.func' op failed on workgroup distribution verification
%3686 = torch.aten.index_put %3684, %3685, %3680, %false_3204 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:9895:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_172_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<16> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:9895:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%3686 = torch.aten.index_put %3684, %3685, %3680, %false_3204 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:9895:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_172_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_172_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<16> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:10771:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%4023 = torch.aten.index_put %4021, %4022, %4017, %false_3576 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:10771:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:10771:13: error: 'func.func' op failed on workgroup distribution verification
%4023 = torch.aten.index_put %4021, %4022, %4017, %false_3576 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:10771:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_192_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<18> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:10771:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%4023 = torch.aten.index_put %4021, %4022, %4017, %false_3576 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:10771:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_192_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_192_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<18> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:11647:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%4360 = torch.aten.index_put %4358, %4359, %4354, %false_3948 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:11647:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:11647:13: error: 'func.func' op failed on workgroup distribution verification
%4360 = torch.aten.index_put %4358, %4359, %4354, %false_3948 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:11647:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_212_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<20> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:11647:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%4360 = torch.aten.index_put %4358, %4359, %4354, %false_3948 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:11647:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_212_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_212_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<20> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:12523:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%4697 = torch.aten.index_put %4695, %4696, %4691, %false_4320 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:12523:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:12523:13: error: 'func.func' op failed on workgroup distribution verification
%4697 = torch.aten.index_put %4695, %4696, %4691, %false_4320 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:12523:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_232_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<22> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:12523:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%4697 = torch.aten.index_put %4695, %4696, %4691, %false_4320 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:12523:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_232_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_232_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<22> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:13399:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%5034 = torch.aten.index_put %5032, %5033, %5028, %false_4692 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:13399:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:13399:13: error: 'func.func' op failed on workgroup distribution verification
%5034 = torch.aten.index_put %5032, %5033, %5028, %false_4692 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:13399:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_252_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<24> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:13399:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%5034 = torch.aten.index_put %5032, %5033, %5028, %false_4692 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:13399:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_252_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_252_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<24> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:14275:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%5371 = torch.aten.index_put %5369, %5370, %5365, %false_5065 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:14275:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:14275:13: error: 'func.func' op failed on workgroup distribution verification
%5371 = torch.aten.index_put %5369, %5370, %5365, %false_5065 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:14275:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_272_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<26> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:14275:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%5371 = torch.aten.index_put %5369, %5370, %5365, %false_5065 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:14275:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_272_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_272_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<26> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:15151:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%5708 = torch.aten.index_put %5706, %5707, %5702, %false_5437 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:15151:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:15151:13: error: 'func.func' op failed on workgroup distribution verification
%5708 = torch.aten.index_put %5706, %5707, %5702, %false_5437 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:15151:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_292_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<28> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:15151:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%5708 = torch.aten.index_put %5706, %5707, %5702, %false_5437 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:15151:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_292_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_292_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<28> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:16027:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%6045 = torch.aten.index_put %6043, %6044, %6039, %false_5809 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:16027:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:16027:13: error: 'func.func' op failed on workgroup distribution verification
%6045 = torch.aten.index_put %6043, %6044, %6039, %false_5809 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:16027:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_312_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<30> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:16027:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%6045 = torch.aten.index_put %6043, %6044, %6039, %false_5809 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:16027:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_312_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_312_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<30> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:16903:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%6382 = torch.aten.index_put %6380, %6381, %6376, %false_6182 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:16903:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:16903:13: error: 'func.func' op failed on workgroup distribution verification
%6382 = torch.aten.index_put %6380, %6381, %6376, %false_6182 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:16903:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_332_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<32> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:16903:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%6382 = torch.aten.index_put %6380, %6381, %6376, %false_6182 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:16903:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_332_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_332_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<32> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:17779:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%6719 = torch.aten.index_put %6717, %6718, %6713, %false_6554 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:17779:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:17779:13: error: 'func.func' op failed on workgroup distribution verification
%6719 = torch.aten.index_put %6717, %6718, %6713, %false_6554 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:17779:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_352_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<34> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:17779:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%6719 = torch.aten.index_put %6717, %6718, %6713, %false_6554 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:17779:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_352_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_352_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<34> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:18655:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%7056 = torch.aten.index_put %7054, %7055, %7050, %false_6926 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:18655:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:18655:13: error: 'func.func' op failed on workgroup distribution verification
%7056 = torch.aten.index_put %7054, %7055, %7050, %false_6926 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:18655:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_372_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<36> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:18655:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%7056 = torch.aten.index_put %7054, %7055, %7050, %false_6926 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:18655:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_372_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_372_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<36> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:19531:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%7393 = torch.aten.index_put %7391, %7392, %7387, %false_7298 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:19531:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:19531:13: error: 'func.func' op failed on workgroup distribution verification
%7393 = torch.aten.index_put %7391, %7392, %7387, %false_7298 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:19531:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_392_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<38> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:19531:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%7393 = torch.aten.index_put %7391, %7392, %7387, %false_7298 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:19531:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_392_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_392_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<38> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:20407:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%7730 = torch.aten.index_put %7728, %7729, %7724, %false_7670 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:20407:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:20407:13: error: 'func.func' op failed on workgroup distribution verification
%7730 = torch.aten.index_put %7728, %7729, %7724, %false_7670 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:20407:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_412_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<40> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:20407:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%7730 = torch.aten.index_put %7728, %7729, %7724, %false_7670 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:20407:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_412_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_412_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<40> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:21283:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%8067 = torch.aten.index_put %8065, %8066, %8061, %false_8042 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:21283:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:21283:13: error: 'func.func' op failed on workgroup distribution verification
%8067 = torch.aten.index_put %8065, %8066, %8061, %false_8042 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:21283:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_432_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<42> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:21283:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%8067 = torch.aten.index_put %8065, %8066, %8061, %false_8042 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:21283:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_432_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_432_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<42> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:22159:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%8404 = torch.aten.index_put %8402, %8403, %8398, %false_8414 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:22159:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:22159:13: error: 'func.func' op failed on workgroup distribution verification
%8404 = torch.aten.index_put %8402, %8403, %8398, %false_8414 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:22159:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_452_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<44> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:22159:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%8404 = torch.aten.index_put %8402, %8403, %8398, %false_8414 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:22159:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_452_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_452_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<44> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:23035:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%8741 = torch.aten.index_put %8739, %8740, %8735, %false_8786 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:23035:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:23035:13: error: 'func.func' op failed on workgroup distribution verification
%8741 = torch.aten.index_put %8739, %8740, %8735, %false_8786 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:23035:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_472_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<46> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:23035:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%8741 = torch.aten.index_put %8739, %8740, %8735, %false_8786 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:23035:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_472_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_472_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<46> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:23911:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%9078 = torch.aten.index_put %9076, %9077, %9072, %false_9158 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:23911:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:23911:13: error: 'func.func' op failed on workgroup distribution verification
%9078 = torch.aten.index_put %9076, %9077, %9072, %false_9158 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:23911:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_492_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<48> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:23911:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%9078 = torch.aten.index_put %9076, %9077, %9072, %false_9158 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:23911:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_492_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_492_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<48> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:24787:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%9415 = torch.aten.index_put %9413, %9414, %9409, %false_9530 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:24787:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:24787:13: error: 'func.func' op failed on workgroup distribution verification
%9415 = torch.aten.index_put %9413, %9414, %9409, %false_9530 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:24787:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_512_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<50> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:24787:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%9415 = torch.aten.index_put %9413, %9414, %9409, %false_9530 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:24787:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_512_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_512_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<50> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:25663:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
%9752 = torch.aten.index_put %9750, %9751, %9746, %false_9902 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:25663:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
/sharedfile/32/fp8_32_kv16.mlir:25663:13: error: 'func.func' op failed on workgroup distribution verification
%9752 = torch.aten.index_put %9750, %9751, %9746, %false_9902 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:25663:13: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_532_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<52> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
%40 = "arith.index_castui"(%39) : (i64) -> index
%41 = "arith.index_castui"(%19) : (i32) -> index
%42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
%43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
%44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
%45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
%46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
%49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
"memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
%50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
%51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
"gpu.barrier"() : () -> ()
"scf.for"(%9, %51, %5) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
%63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
"scf.for"(%4, %63, %2) ({
^bb0(%arg8: index):
%64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
%68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
"vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
"vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
"scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
%53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
"gpu.barrier"() : () -> ()
"scf.for"(%9, %2, %5) ({
^bb0(%arg6: index):
%59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
%60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
%61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
"vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"gpu.barrier"() : () -> ()
%54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
%55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
%56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
%58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
"vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
"iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
^bb0(%arg4: bf16, %arg5: bf16):
"iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
}) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
"gpu.barrier"() : () -> ()
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : (index) -> ()
"memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
/sharedfile/32/fp8_32_kv16.mlir:25663:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
%9752 = torch.aten.index_put %9750, %9751, %9746, %false_9902 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
^
/sharedfile/32/fp8_32_kv16.mlir:25663:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
%69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
"hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_532_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_532_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
%3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%4 = "arith.constant"() <{value = 0 : index}> : () -> index
%5 = "arith.constant"() <{value = 64 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<52> : vector<1xi64>}> : () -> vector<1xi64>
%7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
%8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
%9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
%10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
%20 = "arith.extui"(%10) : (i32) -> i64
%21 = "arith.extui"(%11) : (i32) -> i64
%22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
%24 = "arith.index_castui"(%23) : (i64) -> index
%25 = "arith.extui"(%12) : (i32) -> i64
%26 = "arith.extui"(%13) : (i32) -> i64
%27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%14) : (i32) -> i64
%31 = "arith.extui"(%15) : (i32) -> i64
%32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.index_castui"(%16) : (i32) -> index
%36 = "arith.extui"(%17) : (i32) -> i64
%37 = "arith.extui"(%18) : (i32) -> i64
%38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment