AmosLewis · February 24, 2025 18:04
diff --git a/llama_fp8_compile_bug.txt b/llama_fp8_compile_bug.txt
 iree-base-compiler       3.3.0rc20250223
 iree-base-runtime        3.3.0rc20250223
 iree-turbine             3.3.0rc20250223

 wget -P /sharedfile/32 https://sharkpublic.blob.core.windows.net/sharkpublic/chi/llama/fp8_32_kv16.mlir

 iree-compile /sharedfile/32/fp8_32_kv16.mlir \
  --iree-hip-target=gfx942 \
  -o=/sharedfile/32/fp8_32_kv16.vmfb \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions
  
  failed to translate executables
 /sharedfile/32/fp8_32_kv16.mlir:2887:12: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %990 = torch.aten.index_put %979, %989, %988, %false_228 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
           ^
 /sharedfile/32/fp8_32_kv16.mlir:2887:12: note: see current operation: "vector.transfer_write"(%59, %42, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:2887:12: error: 'func.func' op failed on workgroup distribution verification
    %990 = torch.aten.index_put %979, %989, %988, %false_228 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
           ^
 /sharedfile/32/fp8_32_kv16.mlir:2887:12: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_12_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<64> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "arith.extui"(%10) : (i32) -> i64
  %19 = "arith.extui"(%11) : (i32) -> i64
  %20 = "arith.shli"(%19, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %21 = "arith.ori"(%18, %20) : (i64, i64) -> i64
  %22 = "arith.index_castui"(%21) : (i64) -> index
  %23 = "arith.extui"(%12) : (i32) -> i64
  %24 = "arith.extui"(%13) : (i32) -> i64
  %25 = "arith.shli"(%24, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %26 = "arith.ori"(%23, %25) : (i64, i64) -> i64
  %27 = "arith.index_castui"(%26) : (i64) -> index
  %28 = "arith.index_castui"(%14) : (i32) -> index
  %29 = "arith.extui"(%15) : (i32) -> i64
  %30 = "arith.extui"(%16) : (i32) -> i64
  %31 = "arith.shli"(%30, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %32 = "arith.ori"(%29, %31) : (i64, i64) -> i64
  %33 = "arith.index_castui"(%32) : (i64) -> index
  %34 = "arith.index_castui"(%17) : (i32) -> index
  %35:5 = "util.assume.int"(%22, %27, %28, %33, %34) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index) -> (index, index, index, index, index)
  %36 = "flow.dispatch.workload.ordinal"(%35#2) <{ordinal = 1 : index}> : (index) -> index
  %37 = "flow.dispatch.workload.ordinal"(%35#3) <{ordinal = 2 : index}> : (index) -> index
  %38 = "flow.dispatch.workload.ordinal"(%35#4) <{ordinal = 3 : index}> : (index) -> index
  %39 = "hal.interface.binding.subspan"(%35#0, %36) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%39) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %40 = "hal.interface.binding.subspan"(%4, %38) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%40) <{alignment = 64 : i32}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>) -> ()
  %41 = "hal.interface.binding.subspan"(%4, %37) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%41) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %42 = "hal.interface.binding.subspan"(%35#1, %38) {alignment = 64 : index, binding = 3 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%42) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %43 = "memref.alloca"(%38) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %44 = "affine.apply"(%38) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %44, %5) ({
  ^bb0(%arg7: index):
    %55 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %56 = "affine.min"(%arg7, %38) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %56, %2) ({
    ^bb0(%arg8: index):
      %57 = "arith.addi"(%arg8, %55) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %58 = "vector.transfer_read"(%40, %4, %57, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %59 = "arith.muli"(%58, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %60 = "arith.trunci"(%59) : (vector<1xi64>) -> vector<1xi32>
      %61 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%59, %42, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%60, %43, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %45:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%36) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %46 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %52 = "vector.transfer_read"(%40, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %53 = "arith.muli"(%52, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %54 = "arith.trunci"(%53) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%54, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %47 = "affine.apply"(%arg2, %45#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %48 = "affine.apply"(%arg3, %45#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %49 = "memref.subview"(%41, %arg1, %47, %48, %37) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %50 = "vector.transfer_read"(%39, %4, %arg0, %arg1, %47, %48, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %51 = "arith.extf"(%50) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%51, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %46, %49) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:2887:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %990 = torch.aten.index_put %979, %989, %988, %false_228 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
           ^
 /sharedfile/32/fp8_32_kv16.mlir:2887:12: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %62:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%62#0, %62#1, %62#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_12_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_12_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<64> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "arith.extui"(%10) : (i32) -> i64
      %19 = "arith.extui"(%11) : (i32) -> i64
      %20 = "arith.shli"(%19, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %21 = "arith.ori"(%18, %20) : (i64, i64) -> i64
      %22 = "arith.index_castui"(%21) : (i64) -> index
      %23 = "arith.extui"(%12) : (i32) -> i64
      %24 = "arith.extui"(%13) : (i32) -> i64
      %25 = "arith.shli"(%24, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %26 = "arith.ori"(%23, %25) : (i64, i64) -> i64
      %27 = "arith.index_castui"(%26) : (i64) -> index
      %28 = "arith.index_castui"(%14) : (i32) -> index
      %29 = "arith.extui"(%15) : (i32) -> i64
      %30 = "arith.extui"(%16) : (i32) -> i64
      %31 = "arith.shli"(%30, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %32 = "arith.ori"(%29, %31) : (i64, i64) -> i64
      %33 = "arith.index_castui"(%32) : (i64) -> index
      %34 = "arith.index_castui"(%17) : (i32) -> index
      %35:5 = "util.assume.int"(%22, %27, %28, %33, %34) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index) -> (index, index, index, index, index)
      %36 = "flow.dispatch.workload.ordinal"(%35#2) <{ordinal = 1 : index}> : (index) -> index
      %37 = "flow.dispatch.workload.ordinal"(%35#3) <{ordinal = 2 : index}> : (index) -> index
      %38 = "flow.dispatch.workload.ordinal"(%35#4) <{ordinal = 3 : index}> : (index) -> index
      %39 = "hal.interface.binding.subspan"(%35#0, %36) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%39) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %40 = "hal.interface.binding.subspan"(%4, %38) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%40) <{alignment = 64 : i32}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>) -> ()
      %41 = "hal.interface.binding.subspan"(%4, %37) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%41) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %42 = "hal.interface.binding.subspan"(%35#1, %38) {alignment = 64 : index, binding = 3 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 8, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%42) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %43 = "memref.alloca"(%38) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %44 = "affine.apply"(%38) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %44, %5) ({
      ^bb0(%arg7: index):
        %55 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %56 = "affine.min"(%arg7, %38) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %56, %2) ({
        ^bb0(%arg8: index):
          %57 = "arith.addi"(%arg8, %55) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %58 = "vector.transfer_read"(%40, %4, %57, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %59 = "arith.muli"(%58, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %60 = "arith.trunci"(%59) : (vector<1xi64>) -> vector<1xi32>
          %61 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%59, %42, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%60, %43, %4, %61) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %45:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%36) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %46 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %52 = "vector.transfer_read"(%40, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %53 = "arith.muli"(%52, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %54 = "arith.trunci"(%53) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%54, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %47 = "affine.apply"(%arg2, %45#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %48 = "affine.apply"(%arg3, %45#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %49 = "memref.subview"(%41, %arg1, %47, %48, %37) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %50 = "vector.transfer_read"(%39, %4, %arg0, %arg1, %47, %48, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %51 = "arith.extf"(%50) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%51, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %46, %49) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:3763:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %1327 = torch.aten.index_put %1325, %1326, %1321, %false_597 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:3763:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:3763:13: error: 'func.func' op failed on workgroup distribution verification
    %1327 = torch.aten.index_put %1325, %1326, %1321, %false_597 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:3763:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_32_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<2> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:3763:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %1327 = torch.aten.index_put %1325, %1326, %1321, %false_597 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:3763:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_32_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_32_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<2> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:4639:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %1664 = torch.aten.index_put %1662, %1663, %1658, %false_970 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:4639:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:4639:13: error: 'func.func' op failed on workgroup distribution verification
    %1664 = torch.aten.index_put %1662, %1663, %1658, %false_970 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:4639:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_52_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<4> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:4639:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %1664 = torch.aten.index_put %1662, %1663, %1658, %false_970 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:4639:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_52_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_52_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<4> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:5515:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %2001 = torch.aten.index_put %1999, %2000, %1995, %false_1343 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:5515:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:5515:13: error: 'func.func' op failed on workgroup distribution verification
    %2001 = torch.aten.index_put %1999, %2000, %1995, %false_1343 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:5515:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_72_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<6> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:5515:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %2001 = torch.aten.index_put %1999, %2000, %1995, %false_1343 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:5515:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_72_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_72_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<6> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:6391:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %2338 = torch.aten.index_put %2336, %2337, %2332, %false_1716 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:6391:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:6391:13: error: 'func.func' op failed on workgroup distribution verification
    %2338 = torch.aten.index_put %2336, %2337, %2332, %false_1716 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:6391:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_92_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<8> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:6391:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %2338 = torch.aten.index_put %2336, %2337, %2332, %false_1716 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:6391:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_92_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_92_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<8> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:7267:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %2675 = torch.aten.index_put %2673, %2674, %2669, %false_2088 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:7267:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:7267:13: error: 'func.func' op failed on workgroup distribution verification
    %2675 = torch.aten.index_put %2673, %2674, %2669, %false_2088 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:7267:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_112_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<10> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:7267:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %2675 = torch.aten.index_put %2673, %2674, %2669, %false_2088 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:7267:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_112_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_112_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<10> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:8143:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %3012 = torch.aten.index_put %3010, %3011, %3006, %false_2460 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:8143:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:8143:13: error: 'func.func' op failed on workgroup distribution verification
    %3012 = torch.aten.index_put %3010, %3011, %3006, %false_2460 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:8143:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_132_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<12> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:8143:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %3012 = torch.aten.index_put %3010, %3011, %3006, %false_2460 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:8143:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_132_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_132_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<12> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:9019:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %3349 = torch.aten.index_put %3347, %3348, %3343, %false_2832 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:9019:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:9019:13: error: 'func.func' op failed on workgroup distribution verification
    %3349 = torch.aten.index_put %3347, %3348, %3343, %false_2832 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:9019:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_152_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<14> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:9019:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %3349 = torch.aten.index_put %3347, %3348, %3343, %false_2832 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:9019:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_152_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_152_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<14> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:9895:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %3686 = torch.aten.index_put %3684, %3685, %3680, %false_3204 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:9895:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:9895:13: error: 'func.func' op failed on workgroup distribution verification
    %3686 = torch.aten.index_put %3684, %3685, %3680, %false_3204 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:9895:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_172_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<16> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:9895:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %3686 = torch.aten.index_put %3684, %3685, %3680, %false_3204 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:9895:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_172_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_172_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<16> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:10771:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %4023 = torch.aten.index_put %4021, %4022, %4017, %false_3576 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:10771:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:10771:13: error: 'func.func' op failed on workgroup distribution verification
    %4023 = torch.aten.index_put %4021, %4022, %4017, %false_3576 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:10771:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_192_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<18> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:10771:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %4023 = torch.aten.index_put %4021, %4022, %4017, %false_3576 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:10771:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_192_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_192_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<18> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:11647:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %4360 = torch.aten.index_put %4358, %4359, %4354, %false_3948 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:11647:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:11647:13: error: 'func.func' op failed on workgroup distribution verification
    %4360 = torch.aten.index_put %4358, %4359, %4354, %false_3948 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:11647:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_212_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<20> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:11647:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %4360 = torch.aten.index_put %4358, %4359, %4354, %false_3948 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:11647:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_212_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_212_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<20> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:12523:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %4697 = torch.aten.index_put %4695, %4696, %4691, %false_4320 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:12523:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:12523:13: error: 'func.func' op failed on workgroup distribution verification
    %4697 = torch.aten.index_put %4695, %4696, %4691, %false_4320 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:12523:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_232_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<22> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:12523:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %4697 = torch.aten.index_put %4695, %4696, %4691, %false_4320 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:12523:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_232_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_232_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<22> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:13399:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %5034 = torch.aten.index_put %5032, %5033, %5028, %false_4692 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:13399:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:13399:13: error: 'func.func' op failed on workgroup distribution verification
    %5034 = torch.aten.index_put %5032, %5033, %5028, %false_4692 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:13399:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_252_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<24> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:13399:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %5034 = torch.aten.index_put %5032, %5033, %5028, %false_4692 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:13399:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_252_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_252_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<24> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:14275:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %5371 = torch.aten.index_put %5369, %5370, %5365, %false_5065 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:14275:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:14275:13: error: 'func.func' op failed on workgroup distribution verification
    %5371 = torch.aten.index_put %5369, %5370, %5365, %false_5065 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:14275:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_272_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<26> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:14275:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %5371 = torch.aten.index_put %5369, %5370, %5365, %false_5065 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:14275:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_272_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_272_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<26> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:15151:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %5708 = torch.aten.index_put %5706, %5707, %5702, %false_5437 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:15151:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:15151:13: error: 'func.func' op failed on workgroup distribution verification
    %5708 = torch.aten.index_put %5706, %5707, %5702, %false_5437 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:15151:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_292_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<28> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:15151:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %5708 = torch.aten.index_put %5706, %5707, %5702, %false_5437 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:15151:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_292_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_292_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<28> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:16027:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %6045 = torch.aten.index_put %6043, %6044, %6039, %false_5809 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:16027:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:16027:13: error: 'func.func' op failed on workgroup distribution verification
    %6045 = torch.aten.index_put %6043, %6044, %6039, %false_5809 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:16027:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_312_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<30> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:16027:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %6045 = torch.aten.index_put %6043, %6044, %6039, %false_5809 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:16027:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_312_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_312_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<30> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:16903:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %6382 = torch.aten.index_put %6380, %6381, %6376, %false_6182 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:16903:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:16903:13: error: 'func.func' op failed on workgroup distribution verification
    %6382 = torch.aten.index_put %6380, %6381, %6376, %false_6182 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:16903:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_332_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<32> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:16903:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %6382 = torch.aten.index_put %6380, %6381, %6376, %false_6182 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:16903:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_332_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_332_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<32> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:17779:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %6719 = torch.aten.index_put %6717, %6718, %6713, %false_6554 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:17779:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:17779:13: error: 'func.func' op failed on workgroup distribution verification
    %6719 = torch.aten.index_put %6717, %6718, %6713, %false_6554 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:17779:13: note: see current operation: 
 // Dump (generic MLIR op form) of the dispatch function that failed workgroup
 // distribution verification. Reading guide:
 //   1. constants, scratch buffers, thread id
 //   2. push constants decoded into index values
 //   3. buffer views on bindings 0, 1 and 2
 //   4. a per-thread loop that runs BEFORE the workgroup scf.forall and stores into
 //      binding 2 -- this store is the op quoted by the 'vector.transfer_write' error
 //   5. the workgroup-mapped scf.forall that performs the scatter
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_352_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  // Constants reused below: %0/%1 are the padding values of the transfer_reads,
  // %2 = index 1, %3 = shift amount 32, %4 = index 0, %5 = index 64,
  // %6 = splat 34 that is added to every i64 value read from %47.
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<34> : vector<1xi64>}> : () -> vector<1xi64>
  // Scratch buffers: %7 is a 1x3xi32 allocation in the workgroup address space
  // (deallocated at the end of the function), %8 is a 4-element bf16 buffer in the
  // private address space.
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  // Thread id along x, upper bound 64 (same as workgroup_size = [64, 1, 1] in translation_info).
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  // Ten i32 push constants, ordinals 0 through 9.
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  // Pairs of i32 constants are zero-extended and combined as lo | (hi << 32), then cast
  // to index: ordinals 0/1 -> %24, 2/3 -> %29, 4/5 -> %34, 7/8 -> %40.
  // Ordinals 6 and 9 are cast to index directly (%35, %41).
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  // Attaches umin/umax range assumptions to the six decoded values
  // (the fifth, %42#4, additionally carries udiv = 64).
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  // %42#3, %42#4 and %42#5 are tagged as workload ordinals 1, 2 and 3; they supply the
  // dynamic dimensions of the buffer views below.
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  // Buffer views (all #hal.descriptor_type<storage_buffer>):
  //   %46: binding 0, 1x?x32x8x128 f8E4M3FNUZ, first operand %42#0
  //   %47: binding 0, 1x?xi64,                 first operand %42#1
  //   %48: binding 1, ?x32x8x128 bf16,         first operand %4 (constant 0)
  //   %49: binding 2, 1x?xi64,                 first operand %42#2
  // NOTE(review): the first subspan operand is presumably the byte offset into the
  // binding -- confirm against the HAL op definition. Binding 0 is the only one the
  // pipeline layout declares as "ReadOnly|Indirect".
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  // Private 1x?xi32 buffer with dynamic size %45; %51 = ceildiv(%45, 2) is the trip
  // bound of the thread loop below.
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  // Thread loop located BEFORE (i.e. outside) the workgroup-mapped scf.forall:
  // %arg7 runs from the thread id %9 up to %51 in steps of 64.
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    // %62 = 2 * %arg7; %63 = min(%45 - 2 * %arg7, 2) bounds the inner loop.
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      // Read one i64 from %47[0, %arg8 + 2 * %arg7], add 34, and truncate a copy to i32.
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      // %68 = 2 * %arg7 + %arg8, the same position as %64.
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      // >>> This is the op quoted verbatim by the 'vector.transfer_write' error in the
      // >>> surrounding log: it stores the i64 result into %49 (binding 2, a storage
      // >>> buffer) from this thread loop, which is not nested inside the scf.forall
      // >>> carrying #iree_codegen.workgroup_mapping further down.
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      // The truncated i32 copy goes to the private buffer %50 at the same position.
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  // Split the thread id over the basis (4, 16): %52#0 and %52#1 feed the offsets %54/%55.
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  // Workgroup-mapped loop nest. The first upper bound is the dynamic operand %43
  // (the static entry holds the dynamic placeholder); the remaining bounds are 32, 2, 2.
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    // %53 views element [0, 0] of the workgroup buffer %7; it is the index operand of
    // the scatter below.
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    // Loop from thread id %9 to 1 with step 64: the body only runs for thread id 0.
    // It reads %47[0, %arg0], adds 34, truncates to i32 and stores to %7[0, 0].
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    // Per-thread offsets: %54 = %arg2 * 4 + %52#0, %55 = %arg3 * 64 + %52#1 * 4.
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    // %56: ?x1x1x4 slice of the bf16 buffer %48 at offsets [0, %arg1, %54, %55], with the
    // leading size taken from %44.
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Load 4 f8E4M3FNUZ values from %46[0, %arg0, %arg1, %54, %55], extend them to bf16
    // and stage them in the private buffer %8.
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    // Scatter with inputs %8 and %53 and output %56 (dimension_map = [0],
    // unique_indices = true). The region yields its first block argument, %arg4.
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  // Release the workgroup scratch buffer allocated at the top of the function.
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:17779:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %6719 = torch.aten.index_put %6717, %6718, %6713, %false_6554 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:17779:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_352_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_352_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<34> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:18655:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %7056 = torch.aten.index_put %7054, %7055, %7050, %false_6926 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:18655:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:18655:13: error: 'func.func' op failed on workgroup distribution verification
    %7056 = torch.aten.index_put %7054, %7055, %7050, %false_6926 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:18655:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_372_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<36> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:18655:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %7056 = torch.aten.index_put %7054, %7055, %7050, %false_6926 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:18655:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_372_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_372_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<36> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:19531:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %7393 = torch.aten.index_put %7391, %7392, %7387, %false_7298 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:19531:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:19531:13: error: 'func.func' op failed on workgroup distribution verification
    %7393 = torch.aten.index_put %7391, %7392, %7387, %false_7298 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:19531:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_392_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<38> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:19531:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %7393 = torch.aten.index_put %7391, %7392, %7387, %false_7298 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:19531:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_392_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_392_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<38> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:20407:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %7730 = torch.aten.index_put %7728, %7729, %7724, %false_7670 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:20407:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:20407:13: error: 'func.func' op failed on workgroup distribution verification
    %7730 = torch.aten.index_put %7728, %7729, %7724, %false_7670 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:20407:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_412_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<40> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:20407:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %7730 = torch.aten.index_put %7728, %7729, %7724, %false_7670 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:20407:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_412_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_412_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<40> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:21283:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %8067 = torch.aten.index_put %8065, %8066, %8061, %false_8042 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:21283:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:21283:13: error: 'func.func' op failed on workgroup distribution verification
    %8067 = torch.aten.index_put %8065, %8066, %8061, %false_8042 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:21283:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_432_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<42> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:21283:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %8067 = torch.aten.index_put %8065, %8066, %8061, %false_8042 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:21283:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_432_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_432_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<42> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:22159:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %8404 = torch.aten.index_put %8402, %8403, %8398, %false_8414 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:22159:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:22159:13: error: 'func.func' op failed on workgroup distribution verification
    %8404 = torch.aten.index_put %8402, %8403, %8398, %false_8414 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:22159:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_452_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<44> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:22159:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %8404 = torch.aten.index_put %8402, %8403, %8398, %false_8414 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:22159:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_452_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_452_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<44> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:23035:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %8741 = torch.aten.index_put %8739, %8740, %8735, %false_8786 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:23035:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:23035:13: error: 'func.func' op failed on workgroup distribution verification
    %8741 = torch.aten.index_put %8739, %8740, %8735, %false_8786 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:23035:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_472_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<46> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:23035:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %8741 = torch.aten.index_put %8739, %8740, %8735, %false_8786 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:23035:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_472_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_472_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<46> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:23911:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %9078 = torch.aten.index_put %9076, %9077, %9072, %false_9158 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:23911:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:23911:13: error: 'func.func' op failed on workgroup distribution verification
    %9078 = torch.aten.index_put %9076, %9077, %9072, %false_9158 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:23911:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_492_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<48> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:23911:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %9078 = torch.aten.index_put %9076, %9077, %9072, %false_9158 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:23911:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_492_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_492_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<48> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:24787:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %9415 = torch.aten.index_put %9413, %9414, %9409, %false_9530 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:24787:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:24787:13: error: 'func.func' op failed on workgroup distribution verification
    %9415 = torch.aten.index_put %9413, %9414, %9409, %false_9530 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:24787:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_512_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<50> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:24787:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %9415 = torch.aten.index_put %9413, %9414, %9409, %false_9530 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:24787:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_512_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_512_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<50> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
      %40 = "arith.index_castui"(%39) : (i64) -> index
      %41 = "arith.index_castui"(%19) : (i32) -> index
      %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
      %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
      %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
      %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
      %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
      %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
      %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
      %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
      "gpu.barrier"() : () -> ()
      "scf.for"(%9, %51, %5) ({
      ^bb0(%arg7: index):
        %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
        %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
        "scf.for"(%4, %63, %2) ({
        ^bb0(%arg8: index):
          %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
          %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
          "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
          "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "gpu.barrier"() : () -> ()
      %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
      "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
        %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
        "gpu.barrier"() : () -> ()
        "scf.for"(%9, %2, %5) ({
        ^bb0(%arg6: index):
          %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
          %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
          %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
          "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "gpu.barrier"() : () -> ()
        %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
        %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
        %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
        %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
        "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
        "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
        ^bb0(%arg4: bf16, %arg5: bf16):
          "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
        }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
        "gpu.barrier"() : () -> ()
        "scf.forall.in_parallel"() ({
        ^bb0:
        }) : () -> ()
      }) : (index) -> ()
      "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "rocm_hsaco_fb", target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:25663:13: error: 'vector.transfer_write' op write affecting operations on global resources are restricted to workgroup distributed contexts.
    %9752 = torch.aten.index_put %9750, %9751, %9746, %false_9902 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:25663:13: note: see current operation: "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
 /sharedfile/32/fp8_32_kv16.mlir:25663:13: error: 'func.func' op failed on workgroup distribution verification
    %9752 = torch.aten.index_put %9750, %9751, %9746, %false_9902 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:25663:13: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_532_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
  %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
  %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
  %2 = "arith.constant"() <{value = 1 : index}> : () -> index
  %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
  %4 = "arith.constant"() <{value = 0 : index}> : () -> index
  %5 = "arith.constant"() <{value = 64 : index}> : () -> index
  %6 = "arith.constant"() <{value = dense<52> : vector<1xi64>}> : () -> vector<1xi64>
  %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
  %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
  %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
  %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
  %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
  %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
  %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
  %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
  %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
  %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
  %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
  %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
  %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
  %20 = "arith.extui"(%10) : (i32) -> i64
  %21 = "arith.extui"(%11) : (i32) -> i64
  %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
  %24 = "arith.index_castui"(%23) : (i64) -> index
  %25 = "arith.extui"(%12) : (i32) -> i64
  %26 = "arith.extui"(%13) : (i32) -> i64
  %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
  %29 = "arith.index_castui"(%28) : (i64) -> index
  %30 = "arith.extui"(%14) : (i32) -> i64
  %31 = "arith.extui"(%15) : (i32) -> i64
  %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
  %34 = "arith.index_castui"(%33) : (i64) -> index
  %35 = "arith.index_castui"(%16) : (i32) -> index
  %36 = "arith.extui"(%17) : (i32) -> i64
  %37 = "arith.extui"(%18) : (i32) -> i64
  %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  %39 = "arith.ori"(%36, %38) : (i64, i64) -> i64
  %40 = "arith.index_castui"(%39) : (i64) -> index
  %41 = "arith.index_castui"(%19) : (i32) -> index
  %42:6 = "util.assume.int"(%24, %29, %34, %35, %40, %41) <{assumptions = [[#util.int.assumption<umin = 68386816, umax = 5300322304>], [#util.int.assumption<umin = 68501504, umax = 5769969664>], [#util.int.assumption<umin = 68501568, umax = 5770002432>], [#util.int.assumption<umin = 1, umax = 4095>], [#util.int.assumption<umin = 64, umax = 576460752303423424, udiv = 64>], [#util.int.assumption<umin = 1, umax = 4095>]]}> : (index, index, index, index, index, index) -> (index, index, index, index, index, index)
  %43 = "flow.dispatch.workload.ordinal"(%42#3) <{ordinal = 1 : index}> : (index) -> index
  %44 = "flow.dispatch.workload.ordinal"(%42#4) <{ordinal = 2 : index}> : (index) -> index
  %45 = "flow.dispatch.workload.ordinal"(%42#5) <{ordinal = 3 : index}> : (index) -> index
  %46 = "hal.interface.binding.subspan"(%42#0, %43) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%46) <{alignment = 1 : i32}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %47 = "hal.interface.binding.subspan"(%42#1, %45) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%47) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %48 = "hal.interface.binding.subspan"(%4, %44) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>) -> ()
  %49 = "hal.interface.binding.subspan"(%42#2, %45) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  "memref.assume_alignment"(%49) <{alignment = 1 : i32}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
  %50 = "memref.alloca"(%45) <{operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<1x?xi32, #gpu.address_space<private>>
  %51 = "affine.apply"(%45) <{map = affine_map<()[s0] -> (s0 ceildiv 2)>}> : (index) -> index
  "gpu.barrier"() : () -> ()
  "scf.for"(%9, %51, %5) ({
  ^bb0(%arg7: index):
    %62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 * 2)>}> : (index) -> index
    %63 = "affine.min"(%arg7, %45) <{map = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>}> : (index, index) -> index
    "scf.for"(%4, %63, %2) ({
    ^bb0(%arg8: index):
      %64 = "arith.addi"(%arg8, %62) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
      %65 = "vector.transfer_read"(%47, %4, %64, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %66 = "arith.addi"(%65, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %67 = "arith.trunci"(%66) : (vector<1xi64>) -> vector<1xi32>
      %68 = "affine.apply"(%arg7, %arg8) <{map = affine_map<(d0)[s0] -> (d0 * 2 + s0)>}> : (index, index) -> index
      "vector.transfer_write"(%66, %49, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi64>, memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index) -> ()
      "vector.transfer_write"(%67, %50, %4, %68) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x?xi32, #gpu.address_space<private>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "gpu.barrier"() : () -> ()
  %52:2 = "affine.delinearize_index"(%9) <{static_basis = array<i64: 4, 16>}> : (index) -> (index, index)
  "scf.forall"(%43) <{mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 0>, staticLowerBound = array<i64: 0, 0, 0, 0>, staticStep = array<i64: 1, 1, 1, 1>, staticUpperBound = array<i64: -9223372036854775808, 32, 2, 2>}> ({
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
    %53 = "memref.subview"(%7) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0>, static_sizes = array<i64: 1, 1>, static_strides = array<i64: 1, 1>}> : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>
    "gpu.barrier"() : () -> ()
    "scf.for"(%9, %2, %5) ({
    ^bb0(%arg6: index):
      %59 = "vector.transfer_read"(%47, %4, %arg0, %1) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (memref<1x?xi64, strided<[?, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, i64) -> vector<1xi64>
      %60 = "arith.addi"(%59, %6) <{overflowFlags = #arith.overflow<none>}> : (vector<1xi64>, vector<1xi64>) -> vector<1xi64>
      %61 = "arith.trunci"(%60) : (vector<1xi64>) -> vector<1xi32>
      "vector.transfer_write"(%61, %7, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d1)>}> : (vector<1xi32>, memref<1x3xi32, #gpu.address_space<workgroup>>, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "gpu.barrier"() : () -> ()
    %54 = "affine.apply"(%arg2, %52#0) <{map = affine_map<(d0)[s0] -> (d0 * 4 + s0)>}> : (index, index) -> index
    %55 = "affine.apply"(%arg3, %52#1) <{map = affine_map<(d0)[s0] -> (d0 * 64 + s0 * 4)>}> : (index, index) -> index
    %56 = "memref.subview"(%48, %arg1, %54, %55, %44) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1, 1, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<?x32x8x128xbf16, #hal.descriptor_type<storage_buffer>>, index, index, index, index) -> memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %57 = "vector.transfer_read"(%46, %4, %arg0, %arg1, %54, %55, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (memref<1x?x32x8x128xf8E4M3FNUZ, strided<[?, 32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, index, index, index, index, index, f8E4M3FNUZ) -> vector<4xf8E4M3FNUZ>
    %58 = "arith.extf"(%57) : (vector<4xf8E4M3FNUZ>) -> vector<4xbf16>
    "vector.transfer_write"(%58, %8, %4, %4, %4, %4, %4) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 5, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d4)>}> : (vector<4xbf16>, memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, index, index, index, index, index) -> ()
    "iree_linalg_ext.scatter"(%8, %53, %56) <{dimension_map = array<i64: 0>, operandSegmentSizes = array<i32: 2, 1>, unique_indices = true}> ({
    ^bb0(%arg4: bf16, %arg5: bf16):
      "iree_linalg_ext.yield"(%arg4) : (bf16) -> ()
    }) : (memref<1x1x1x1x4xbf16, #gpu.address_space<private>>, memref<1x1xi32, strided<[3, 1]>, #gpu.address_space<workgroup>>, memref<?x1x1x4xbf16, strided<[32768, 1024, 128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) -> ()
    "gpu.barrier"() : () -> ()
    "scf.forall.in_parallel"() ({
    ^bb0:
    }) : () -> ()
  }) : (index) -> ()
  "memref.dealloc"(%7) : (memref<1x3xi32, #gpu.address_space<workgroup>>) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} : () -> ()
 /sharedfile/32/fp8_32_kv16.mlir:25663:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
    %9752 = torch.aten.index_put %9750, %9751, %9746, %false_9902 : !torch.vtensor<[?,32,8,128],bf16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],bf16>, !torch.bool -> !torch.vtensor<[?,32,8,128],bf16>
            ^
 /sharedfile/32/fp8_32_kv16.mlir:25663:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg9: !hal.device, %arg10: index, %arg11: index, %arg12: index, %arg13: index):
    %69:3 = "flow.dispatch.workgroup_count_from_slice"(%arg10, %arg11, %arg12, %arg13) : (index, index, index, index) -> (index, index, index)
    "hal.return"(%69#0, %69#1, %69#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs1$async_dispatch_532_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs1$async_dispatch_532_scatter_1xDx32x8x128xbf16_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 0.000000e+00 : f8E4M3FNUZ}> : () -> f8E4M3FNUZ
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 1 : index}> : () -> index
      %3 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 0 : index}> : () -> index
      %5 = "arith.constant"() <{value = 64 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<52> : vector<1xi64>}> : () -> vector<1xi64>
      %7 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x3xi32, #gpu.address_space<workgroup>>
      %8 = "memref.alloca"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x1x1x4xbf16, #gpu.address_space<private>>
      %9 = "gpu.thread_id"() <{dimension = #gpu<dim x>, upper_bound = 64 : index}> : () -> index
      %10 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %11 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %12 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %13 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %14 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %15 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 9 : index} : () -> i32
      %20 = "arith.extui"(%10) : (i32) -> i64
      %21 = "arith.extui"(%11) : (i32) -> i64
      %22 = "arith.shli"(%21, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %23 = "arith.ori"(%20, %22) : (i64, i64) -> i64
      %24 = "arith.index_castui"(%23) : (i64) -> index
      %25 = "arith.extui"(%12) : (i32) -> i64
      %26 = "arith.extui"(%13) : (i32) -> i64
      %27 = "arith.shli"(%26, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%14) : (i32) -> i64
      %31 = "arith.extui"(%15) : (i32) -> i64
      %32 = "arith.shli"(%31, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.index_castui"(%16) : (i32) -> index
      %36 = "arith.extui"(%17) : (i32) -> i64
      %37 = "arith.extui"(%18) : (i32) -> i64
      %38 = "arith.shli"(%37, %3) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -