Skip to content

Instantly share code, notes, and snippets.

@bjacob
Last active October 7, 2024 15:16
Show Gist options
  • Save bjacob/a493b64ba99f8820448cdfcd13f188f6 to your computer and use it in GitHub Desktop.
Bad codegen for `vector<8xi8>` operands to MFMA intrinsics
tools/iree-compile /tmp/x/module_foo_dispatch_6.mlir --iree-hal-target-backends=rocm --iree-hip-target=gfx942 --iree-opt-data-tiling --iree-global-opt-experimental-rocm-data-tiling --iree-global-opt-enable-early-materialization=true -o /tmp/a.vmfb --compile-from=executable-sources -mlir-disable-threading -mlir-print-ir-after-all 2>/tmp/log
This file has been truncated, but you can view the full file.
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {hal.device.targets = [#device_target_hip]} {
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- //
module {
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
module {
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
%70 = arith.extui %14 : i32 to i64
%71 = arith.extui %15 : i32 to i64
%72 = arith.shli %71, %c32_i64 : i64
%73 = arith.ori %70, %72 : i64
%74 = arith.index_castui %73 : i64 to index
%75 = arith.extui %16 : i32 to i64
%76 = arith.extui %17 : i32 to i64
%77 = arith.shli %76, %c32_i64 : i64
%78 = arith.ori %75, %77 : i64
%79 = arith.index_castui %78 : i64 to index
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%74, %79}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%c1 = arith.constant 1 : index
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8>
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8>
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8>
%c1_0 = arith.constant 1 : index
%dim_1 = tensor.dim %67, %c1_0 : tensor<1x?x4x2x4x16x2x8xi8>
%71 = tensor.empty(%dim_1) : tensor<1x?x4x2x4x16x2x8xi8>
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8>
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After PackToIntrinsicsPass (iree-gpu-pack-to-intrinsics) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8>
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8>
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8>
%dim_0 = tensor.dim %67, %c1 : tensor<1x?x4x2x4x16x2x8xi8>
%71 = tensor.empty(%dim_0) : tensor<1x?x4x2x4x16x2x8xi8>
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8>
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8>
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8>
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8>
%dim_0 = tensor.dim %67, %c1 : tensor<1x?x4x2x4x16x2x8xi8>
%71 = tensor.empty(%dim_0) : tensor<1x?x4x2x4x16x2x8xi8>
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8>
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8>
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8>
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8>
%dim_0 = tensor.dim %70, %c1 : tensor<1x?x8x4x16x2x8xi8>
%71 = scf.for %arg0 = %c0 to %dim_0 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%72 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%74 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%75 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%74 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%76 = iree_gpu.multi_mma %73, %75, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %76 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After DecomposePackUnPackOpsPass (iree-codegen-decompose-pack-unpack-ops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After PropagateReshapesByExpansionPass (iree-codegen-propagate-reshapes-by-expansion) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After DistributeMmaToLanesPass (iree-gpu-distribute-mma-to-lanes) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:5 = affine.delinearize_index %75 into (%c1, %c4, %c16, %c1, %c1) : index, index, index, index, index
%extracted_slice = tensor.extract_slice %71[0, 0, %76#0, %76#1, %76#2, %76#3, %76#4] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:6 = affine.delinearize_index %77 into (%c4, %c1, %c4, %c16, %c1, %c1) : index, index, index, index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, %78#1, %78#2, %78#3, %78#4, %78#5] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%80:6 = affine.delinearize_index %79 into (%c1, %c4, %c1, %c4, %c16, %c1) : index, index, index, index, index, index
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %81 into %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After NormalizeLoopBoundsPass (iree-codegen-normalize-loop-bounds) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<(d0) -> (d0)>(%arg6)
%78 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%79 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
%80 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
%81 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
%82 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%80, %arg0]
%extracted_slice = tensor.extract_slice %66[%81, %82, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %83 into %arg9[%81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<(d0) -> (d0)>(%arg7)
%78 = affine.apply affine_map<(d0) -> (d0)>(%arg6)
%79 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%80 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
%81 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
%82 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
%83 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%81, %arg0]
%extracted_slice = tensor.extract_slice %67[%82, %83, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%82, %81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%84 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %84 into %arg10[%82, %81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:5 = affine.delinearize_index %75 into (%c1, %c4, %c16, %c1, %c1) : index, index, index, index, index
%extracted_slice = tensor.extract_slice %71[0, 0, %76#0, %76#1, %76#2, %76#3, %76#4] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:6 = affine.delinearize_index %77 into (%c4, %c1, %c4, %c16, %c1, %c1) : index, index, index, index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, %78#1, %78#2, %78#3, %78#4, %78#5] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%80:6 = affine.delinearize_index %79 into (%c1, %c4, %c1, %c4, %c16, %c1) : index, index, index, index, index, index
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %81 into %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index
%extracted_slice = tensor.extract_slice %71[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%80:3 = affine.delinearize_index %79 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %80#0, 0, %80#1, %80#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %81 into %arg3[0, 0, 0, %80#0, 0, %80#1, %80#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index
%extracted_slice = tensor.extract_slice %71[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// Dispatch entry point: data-tiled i8 matmul-like kernel lowered through the
// LLVMGPUTileAndFuse pipeline for one workgroup of 256 threads (subgroup
// size 64). Each workgroup accumulates a 1x1x8x4x2x4x16x4 i32 tile while
// looping over the dynamic reduction extent %42.
// NOTE(review): this dump accompanies a report of bad codegen for
// vector<8xi8> MFMA operands — the innermost 2x8 i8 dims below feed the
// MFMA_I32_16x16x32_I8 intrinsic operands; confirm against the final ISA.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
// 18 i32 push constants: nine 64-bit quantities split into (lo, hi) pairs
// (binding byte offsets and dynamic tensor dimensions), reassembled below.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
// Recombine each (lo, hi) i32 pair as lo | (hi << 32), then cast to index.
// %22, %27, %32 become the three subspan byte offsets; (%37, %42),
// (%47, %52), (%57, %62) are the dynamic dims of the three dispatch tensors.
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
// Binding 0 (read-only) holds both i8 operands at different offsets (%22,
// %27); binding 1 (read-write, offset %32) holds the i32 accumulator.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Per-workgroup slices: workgroup_id_y selects the LHS outer row,
// workgroup_id_x the RHS outer row; both keep the full dynamic extent %42
// on dim 1, which the scf.for below reduces over one step at a time.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
// Staging destinations for the per-iteration LHS/RHS tile copies below
// (presumably promoted to shared memory by later bufferization — TODO
// confirm in subsequent pass dumps).
%69 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
// Reduction loop over %42: copy one K-slice of each operand into the
// staging tensors, then run the data-tiled multi_mma into the accumulator.
%71 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
// LHS copy: distributed over an 8x4x16 linear thread space; each thread
// moves a 1x1x1x1x1x2x8 i8 sub-tile (the unit dims use scaled offsets
// d0*2 and d0*8 for the trailing 2x8 element block).
%72 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %69) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
// RHS copy: same pattern over a 4x2x4x16 linear thread space.
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %70) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
// MMA: 256 threads; lane id (mod 64, delinearized to 4x16) picks the LHS
// slice, while the full thread id (mod 256 is the identity for ids < 256,
// delinearized to 4x4x16) picks the RHS and accumulator slices.
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index
%extracted_slice = tensor.extract_slice %72[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
// Data-tiled MFMA_I32_16x16x32_I8 with unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2 (per the kind attribute); the
// trailing 2x8 i8 dims are the per-intrinsic K operands.
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
// Write the final accumulator tile back to the read-write binding.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %69) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %70) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index
%extracted_slice = tensor.extract_slice %72[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After FuseAndHoistParallelLoopsPass (iree-gpu-fuse-and-hoist-parallel-loops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:7 = affine.delinearize_index %81 into (%c1, %c1, %c8, %c4, %c16, %c1, %c1) : index, index, index, index, index, index, index
%83 = affine.apply affine_map<(d0) -> (d0 * 8)>(%82#6)
%84 = affine.apply affine_map<(d0) -> (d0 * 2)>(%82#5)
%85 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%82#1, %arg2]
%extracted_slice_2 = tensor.extract_slice %66[%82#0, %85, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%86 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %86 into %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:8 = affine.delinearize_index %81 into (%c1, %c1, %c4, %c2, %c4, %c16, %c1, %c1) : index, index, index, index, index, index, index, index
%83 = affine.apply affine_map<(d0) -> (d0 * 8)>(%82#7)
%84 = affine.apply affine_map<(d0) -> (d0 * 2)>(%82#6)
%85 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%82#1, %arg2]
%extracted_slice_2 = tensor.extract_slice %67[%82#0, %85, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%86 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %86 into %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x4x2x4x16x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x4x2x4x16x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x4x2x4x16x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>
%78 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %78[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CombineBarrierRegionsPass (iree-gpu-combine-barrier-regions) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%79 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg7[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%80 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg7[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %79, %80 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %78 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After VectorizeIREEGPUOpsPass (iree-gpu-vectorize-ops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Dispatch entry point: data-tiled i8 x i8 -> i32 matmul for gfx942, lowered
// through the LLVMGPUTileAndFuse pipeline with a 256-thread workgroup and
// subgroup size 64. NOTE(review): comments added for review of this IR dump;
// all operations are byte-identical to the compiler output.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
// Scalar/index constants used throughout the body.
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
// Load all 18 i32 push constants declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
// Each consecutive (low, high) i32 pair is widened, the high word shifted
// left by 32 and OR'd in, then cast to index. The nine resulting indices:
// %22, %27, %32 are the byte offsets of the three buffer subspans below;
// %37/%42, %47/%52, %57/%62 are the dynamic outer dims of the three tensors.
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
// Buffer bindings: %63 = read-only data-tiled LHS (i8), %64 = read-only
// data-tiled RHS (i8), %65 = read-write data-tiled accumulator (i32).
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
// Workgroup y selects the LHS row tile, workgroup x the RHS column tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
// Workgroup shared-memory staging tensors for one K-tile of LHS and RHS.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
// Thread-level distribution: 256 threads mapped on linear_dim_0, each owning
// one [1,1,8,1,2,1,1,4] i32 slice of the accumulator tile.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
// Thread id split as (4 subgroups-in-n, 4, 16) to index the accumulator.
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
// Lane position within the 64-wide subgroup, split as (4, 16) for the LHS.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
// Reduction loop over the %42 K-tiles, accumulator carried as a vector.
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
// Cooperative global->shared copy of the current K-tile, synchronized by
// the barrier region; each thread performs 2 copy iterations (0 and 256
// of a 512-element linearized space) per input tensor.
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
// Per-thread operand reads from shared memory. Note the vector<...x2x8xi8>
// inner shape: these are the 8-byte-per-lane MFMA operands the gist's
// title refers to ("vector<8xi8> operands to MFMA intrinsics").
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
// Data-tiled MFMA contraction: MFMA_I32_16x16x32_I8 intrinsic with
// unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2.
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
// Write the final accumulator vector back into this thread's output slice.
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
// Store the workgroup's accumulator tile back to the read-write binding.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
// Same dispatch as the preceding dump, after CSE (textually unchanged by the
// pass here). See the inline comments: push-constant unpacking, three buffer
// bindings, shared-memory staging, and the data-tiled MFMA reduction loop.
// NOTE(review): comments added for review of this IR dump; all operations are
// byte-identical to the compiler output.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
// Scalar/index constants used throughout the body.
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
// Load all 18 i32 push constants declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
// Each consecutive (low, high) i32 pair is widened, the high word shifted
// left by 32 and OR'd in, then cast to index. The nine resulting indices:
// %22, %27, %32 are the byte offsets of the three buffer subspans below;
// %37/%42, %47/%52, %57/%62 are the dynamic outer dims of the three tensors.
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
// Buffer bindings: %63 = read-only data-tiled LHS (i8), %64 = read-only
// data-tiled RHS (i8), %65 = read-write data-tiled accumulator (i32).
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
// Workgroup y selects the LHS row tile, workgroup x the RHS column tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
// Workgroup shared-memory staging tensors for one K-tile of LHS and RHS.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
// Thread-level distribution: 256 threads mapped on linear_dim_0, each owning
// one [1,1,8,1,2,1,1,4] i32 slice of the accumulator tile.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
// Thread id split as (4 subgroups-in-n, 4, 16) to index the accumulator.
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
// Lane position within the 64-wide subgroup, split as (4, 16) for the LHS.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
// Reduction loop over the %42 K-tiles, accumulator carried as a vector.
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
// Cooperative global->shared copy of the current K-tile, synchronized by
// the barrier region; each thread performs 2 copy iterations (0 and 256
// of a 512-element linearized space) per input tensor.
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
// Per-thread operand reads from shared memory. Note the vector<...x2x8xi8>
// inner shape: these are the 8-byte-per-lane MFMA operands the gist's
// title refers to ("vector<8xi8> operands to MFMA intrinsics").
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
// Data-tiled MFMA contraction: MFMA_I32_16x16x32_I8 intrinsic with
// unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2.
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
// Write the final accumulator vector back into this thread's output slice.
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
// Store the workgroup's accumulator tile back to the read-write binding.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUInferMemorySpacePass (iree-codegen-gpu-infer-memory-space) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
%72 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc) -> (memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>) {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index
%subview_6 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %arg4 : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
} {unroll_loop}
%73 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc_2) -> (memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>) {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_6 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %arg4 : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%74 = vector.transfer_read %72[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %73[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
%72 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc) -> (memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>) {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index
%subview_6 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %arg4 : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
} {unroll_loop}
%73 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc_2) -> (memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>) {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_6 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %arg4 : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%74 = vector.transfer_read %72[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %73[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_6 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_6 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_3, %subview_3 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
memref.copy %subview_1, %subview_1 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After GPUVerifyDistributionPass (iree-codegen-gpu-verify-distribution) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After GPUDistributeForallPass (iree-codegen-gpu-distribute-forall) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 256 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%18 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%19 = arith.extui %1 : i32 to i64
%20 = arith.extui %2 : i32 to i64
%21 = arith.shli %20, %c32_i64 : i64
%22 = arith.ori %19, %21 : i64
%23 = arith.index_castui %22 : i64 to index
%24 = arith.extui %3 : i32 to i64
%25 = arith.extui %4 : i32 to i64
%26 = arith.shli %25, %c32_i64 : i64
%27 = arith.ori %24, %26 : i64
%28 = arith.index_castui %27 : i64 to index
%29 = arith.extui %5 : i32 to i64
%30 = arith.extui %6 : i32 to i64
%31 = arith.shli %30, %c32_i64 : i64
%32 = arith.ori %29, %31 : i64
%33 = arith.index_castui %32 : i64 to index
%34 = arith.extui %7 : i32 to i64
%35 = arith.extui %8 : i32 to i64
%36 = arith.shli %35, %c32_i64 : i64
%37 = arith.ori %34, %36 : i64
%38 = arith.index_castui %37 : i64 to index
%39 = arith.extui %9 : i32 to i64
%40 = arith.extui %10 : i32 to i64
%41 = arith.shli %40, %c32_i64 : i64
%42 = arith.ori %39, %41 : i64
%43 = arith.index_castui %42 : i64 to index
%44 = arith.extui %11 : i32 to i64
%45 = arith.extui %12 : i32 to i64
%46 = arith.shli %45, %c32_i64 : i64
%47 = arith.ori %44, %46 : i64
%48 = arith.index_castui %47 : i64 to index
%49 = arith.extui %13 : i32 to i64
%50 = arith.extui %14 : i32 to i64
%51 = arith.shli %50, %c32_i64 : i64
%52 = arith.ori %49, %51 : i64
%53 = arith.index_castui %52 : i64 to index
%54 = arith.extui %15 : i32 to i64
%55 = arith.extui %16 : i32 to i64
%56 = arith.shli %55, %c32_i64 : i64
%57 = arith.ori %54, %56 : i64
%58 = arith.index_castui %57 : i64 to index
%59 = arith.extui %17 : i32 to i64
%60 = arith.extui %18 : i32 to i64
%61 = arith.shli %60, %c32_i64 : i64
%62 = arith.ori %59, %61 : i64
%63 = arith.index_castui %62 : i64 to index
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%23) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%38, %43}
memref.assume_alignment %64, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%28) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%48, %53}
memref.assume_alignment %65, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%66 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%33) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%58, %63}
memref.assume_alignment %66, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %64[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %43, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %65[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %43, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %66[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
%c256_3 = arith.constant 256 : index
%c0_4 = arith.constant 0 : index
%c256_5 = arith.constant 256 : index
%c256_6 = arith.constant 256 : index
scf.for %arg0 = %c0_4 to %c256_5 step %c256_6 {
%67 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z]
%68 = affine.delinearize_index %67 into (%c256_3) : index
%69 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%68)
%70:3 = affine.delinearize_index %69 into (%c4, %c4, %c16) : index, index, index
%subview_7 = memref.subview %subview_1[0, 0, 0, %70#0, 0, %70#1, %70#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%71 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%68)
%72:2 = affine.delinearize_index %71 into (%c4, %c16) : index, index
%73 = vector.transfer_read %subview_1[%c0, %c0, %c0, %70#0, %c0, %70#1, %70#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%74 = scf.for %arg1 = %c0 to %43 step %c1 iter_args(%arg2 = %73) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%78 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %68)
%79:3 = affine.delinearize_index %78 into (%c8, %c4, %c16) : index, index, index
%subview_8 = memref.subview %subview[0, %arg1, %79#0, %79#1, %79#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %alloc[0, 0, %79#0, %79#1, %79#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%78 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %68)
%79:4 = affine.delinearize_index %78 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_8 = memref.subview %subview_0[0, %arg1, %79#0, %79#1, %79#2, %79#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %alloc_2[0, 0, %79#0, %79#1, %79#2, %79#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%75 = vector.transfer_read %alloc[%c0, %c0, %c0, %72#0, %72#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%76 = vector.transfer_read %alloc_2[%c0, %c0, %70#0, %c0, %70#1, %70#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%77 = iree_gpu.multi_mma %75, %76, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %77 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %74, %subview_7[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
return
}
// -----// IR Dump After VectorizeMemrefCopyPass (iree-codegen-vectorize-memref-copy) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c2 = arith.constant 2 : index
%c0_i8 = arith.constant 0 : i8
%c0_i32 = arith.constant 0 : i32
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.for %arg0 = %c0 to %c256 step %c256 {
%66 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z]
%67 = affine.delinearize_index %66 into (%c256) : index
%68 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%67)
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%70 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%67)
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%73 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67)
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%79 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8>
vector.transfer_write %79, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67)
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%79 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8>
vector.transfer_write %79, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
return
}
// -----// IR Dump After UnrollToIntrinsicsPass (iree-gpu-unroll-to-intrinsics) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c2 = arith.constant 2 : index
%c0_i8 = arith.constant 0 : i8
%c0_i32 = arith.constant 0 : i32
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.for %arg0 = %c0 to %c256 step %c256 {
%66 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z]
%67 = affine.delinearize_index %66 into (%c256) : index
%68 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%67)
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%70 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%67)
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%73 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8>
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8>
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%76 = vector.extract %74[0, 0] : vector<8x1x1x2x8xi8> from vector<1x1x8x1x1x2x8xi8>
%77 = vector.extract %75[0, 0] : vector<1x2x1x1x2x8xi8> from vector<1x1x1x2x1x1x2x8xi8>
%78 = vector.extract %arg2[0, 0] : vector<8x1x2x1x1x4xi32> from vector<1x1x8x1x2x1x1x4xi32>
%79 = iree_gpu.multi_mma %76, %77, %78 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<8x1x1x2x8xi8>, vector<1x2x1x1x2x8xi8> into vector<8x1x2x1x1x4xi32>
%80 = vector.broadcast %79 : vector<8x1x2x1x1x4xi32> to vector<1x1x8x1x2x1x1x4xi32>
scf.yield %80 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Dispatch for a data-tiled int8 matmul-like kernel targeting gfx942, lowered via the
// LLVMGPUTileAndFuse pipeline: 256 threads per workgroup, subgroup size 64, shared-memory
// prefetch disabled and bank-conflict reduction padding disabled (see pipeline_options).
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
// Index/scalar constants used throughout; %c0_i8 / %c0_i32 are the transfer_read padding values.
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c2 = arith.constant 2 : index
%c0_i8 = arith.constant 0 : i8
%c0_i32 = arith.constant 0 : i32
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
// Load the 18 push constants. They are consumed below in pairs (lo, hi) to rebuild
// 64-bit values: buffer byte offsets (%22, %27, %32) and dynamic memref sizes
// (%37/%42, %47/%52, %57/%62).
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
// Recombine each (lo32, hi32) push-constant pair into an i64 (lo | (hi << 32)), then
// cast to index. The same 5-op pattern repeats nine times, once per 64-bit value.
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
// Bindings: %63 = LHS (data-tiled i8), %64 = RHS (data-tiled i8), %65 = i32 accumulator
// output. All have two dynamic leading dims supplied by the decoded push constants.
// assume_alignment of 1 records that only byte alignment is guaranteed at the offset.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Workgroup tiling: workgroup_id_y selects the LHS/output row tile, workgroup_id_x the
// RHS/output column tile. The reduction dim (%42) stays whole and is looped over below.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Workgroup shared-memory staging buffers for one K-step of the LHS and RHS tiles.
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
// Flatten (tid.x, tid.y, tid.z) into a linear thread id %66/%67 in [0, 256), then split it
// as %69 = (4, 4, 16) coordinates for addressing the accumulator/RHS, and %71 = the
// lane-within-subgroup split (4, 16) of %67 mod 64 for addressing the LHS.
%66 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 256 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%67 = affine.delinearize_index %66 into (%c256) : index
%68 = affine.apply affine_map<()[s0] -> (s0 mod 256)>()[%67]
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%70 = affine.apply affine_map<()[s0] -> (s0 mod 64)>()[%67]
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index
// Seed the accumulator from the output buffer (read-modify-write accumulation).
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
// Main reduction loop over the dynamic K extent %42; carries the accumulator vector.
%73 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) {
// Barrier before refilling shared memory: previous iteration's reads must be done.
gpu.barrier
// Cooperative global->shared copy of the LHS K-slice: 512 elements / 256 threads,
// so each thread copies two 2x8 i8 chunks (the {unroll_loop} step-256 loop).
scf.for %arg2 = %c0 to %c512 step %c256 {
%81 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg2)[%67]
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8>
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
// Cooperative global->shared copy of the RHS K-slice, same 2-chunks-per-thread scheme.
scf.for %arg2 = %c0 to %c512 step %c256 {
%81 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg2)[%67]
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8>
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
// Barrier after the copies: shared tiles must be fully written before the MMA reads.
gpu.barrier
// Per-thread reads of the staged operands (vector<...2x8xi8> elements feed the MFMA).
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
// Drop the two unit leading dims before the MMA, re-broadcast them after.
%76 = vector.extract %74[0, 0] : vector<8x1x1x2x8xi8> from vector<1x1x8x1x1x2x8xi8>
%77 = vector.extract %75[0, 0] : vector<1x2x1x1x2x8xi8> from vector<1x1x1x2x1x1x2x8xi8>
%78 = vector.extract %arg1[0, 0] : vector<8x1x2x1x1x4xi32> from vector<1x1x8x1x2x1x1x4xi32>
// Data-tiled MFMA_I32_16x16x32_I8 multiply-accumulate (unroll_m=8, unroll_n=2,
// unroll_n_to_subgroups=4, unroll_k=2). This is the op that takes the vector<8xi8>
// operand granules the gist title refers to.
%79 = iree_gpu.multi_mma %76, %77, %78 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<8x1x1x2x8xi8>, vector<1x2x1x1x2x8xi8> into vector<8x1x2x1x1x4xi32>
%80 = vector.broadcast %79 : vector<8x1x2x1x1x4xi32> to vector<1x1x8x1x2x1x1x4xi32>
scf.yield %80 : vector<1x1x8x1x2x1x1x4xi32>
}
// Write the final accumulator back to this thread's slice of the output tile.
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c2 = arith.constant 2 : index
%c0_i8 = arith.constant 0 : i8
%c0_i32 = arith.constant 0 : i32
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x
View raw

(Sorry about that, but we can’t show files that are this big right now.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment