pashu123 · May 22, 2025 20:45
diff --git a/full.txt b/full.txt
 // -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
 // Lowering config referenced by the linalg.fill and linalg.generic ops in the module below: four tile-size lists of four entries each.
 #config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>
 // Lowering config referenced by the linalg.mmt4d op in the module below: three tile-size lists of six entries each.
 #config1 = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>
 // Executable target: llvm-cpu / embedded-elf-x86_64, cpu = "generic" with empty cpu_features, native_vector_size = 16, ukernels = "none".
 #executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
 // Identity indexing map over the four loop dimensions.
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 // Indexing map that keeps only (d0, d2); the linalg.generic below applies it to its second input.
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
 // Pipeline layout: 10 constants and 4 storage-buffer bindings (three flagged "ReadOnly|Indirect", one flagged Indirect).
 #pipeline_layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
 // Translation info selecting the Mmt4dTilingExpert pipeline.
 #translation = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>
 // Module holding the single dispatch function; the function carries the executable target and translation_info attributes.
 module {
  // Dispatch body: linalg.mmt4d of the tensors from bindings 0 and 1, followed by an elementwise
  // add of the tensor from binding 2 and a maximumf against 0.0; the result is stored to binding 3.
  func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64, translation_info = #translation} {
    %c0 = arith.constant 0 : index
    %c32_i64 = arith.constant 32 : i64
    %cst = arith.constant 0.000000e+00 : f32
    // Load the ten i32 constants declared by #pipeline_layout (ordinals 0..9).
    %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
    %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
    %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
    %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
    %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
    %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
    %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
    %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
    %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
    %9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
    // Combine consecutive constant pairs into five index values:
    // zero-extend both halves to i64, shift the second left by 32, OR them, then cast to index.
    %10 = arith.extui %0 : i32 to i64
    %11 = arith.extui %1 : i32 to i64
    %12 = arith.shli %11, %c32_i64 : i64
    %13 = arith.ori %10, %12 : i64
    %14 = arith.index_castui %13 : i64 to index
    %15 = arith.extui %2 : i32 to i64
    %16 = arith.extui %3 : i32 to i64
    %17 = arith.shli %16, %c32_i64 : i64
    %18 = arith.ori %15, %17 : i64
    %19 = arith.index_castui %18 : i64 to index
    %20 = arith.extui %4 : i32 to i64
    %21 = arith.extui %5 : i32 to i64
    %22 = arith.shli %21, %c32_i64 : i64
    %23 = arith.ori %20, %22 : i64
    %24 = arith.index_castui %23 : i64 to index
    %25 = arith.extui %6 : i32 to i64
    %26 = arith.extui %7 : i32 to i64
    %27 = arith.shli %26, %c32_i64 : i64
    %28 = arith.ori %25, %27 : i64
    %29 = arith.index_castui %28 : i64 to index
    %30 = arith.extui %8 : i32 to i64
    %31 = arith.extui %9 : i32 to i64
    %32 = arith.shli %31, %c32_i64 : i64
    %33 = arith.ori %30, %32 : i64
    %34 = arith.index_castui %33 : i64 to index
    // Annotate all five index values with the unsigned range [0, 9007199254740991].
    %35:5 = util.assume.int 
        %14<umin = 0, umax = 9007199254740991>, 
        %19<umin = 0, umax = 9007199254740991>, 
        %24<umin = 0, umax = 9007199254740991>, 
        %29<umin = 0, umax = 9007199254740991>, 
        %34<umin = 0, umax = 9007199254740991>
      : index, index, index, index, index
    // Tag each assumed value with its workload ordinal (0..4).
    %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
    %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
    %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
    %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
    %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
    // Bind the three read-only inputs (bindings 0..2) and the write-only output (binding 3);
    // the dynamic dimensions are supplied by the ordinal values above.
    %41 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
    %42 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
    %43 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
    %44 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
    // Load the full extent of each input (zero offsets, unit strides).
    %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
    %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
    %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
    // %48 is the destination for both the fill (the mmt4d accumulator, set to %cst) and the final linalg.generic.
    // NOTE(review): the mmt4d operands take their dynamic dims from different SSA values
    // ({%39, %36}, {%37, %40} and {%39, %40}); presumably these agree at runtime - confirm.
    %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
    %49 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%48 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    %50 = linalg.mmt4d {lowering_config = #config1} ins(%45, %46 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%49 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    // Elementwise epilogue over four parallel loops: the second input (%47) is indexed through #map1, i.e. by (d0, d2).
    %51 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %47 : tensor<?x?x16x16xf32>, tensor<?x16xf32>) outs(%48 : tensor<?x?x16x16xf32>) attrs =  {lowering_config = #config} {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      // Add the two inputs, then take the maximum with %cst (0.0); %out is not read.
      %52 = arith.addf %in, %in_0 : f32
      %53 = arith.maximumf %52, %cst : f32
      linalg.yield %53 : f32
    } -> tensor<?x?x16x16xf32>
    // Write the whole result tensor to binding 3.
    iree_tensor_ext.dispatch.tensor.store %51, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
    return
  }
 }


 // -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
 // Dispatch function with the compute ops (fill, mmt4d, generic) nested inside an scf.forall that is
 // mapped to workgroup dimensions y and x. Attributes are printed inline here rather than through aliases.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Load the ten i32 constants of the pipeline layout (ordinals 0..9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Combine consecutive constant pairs into five index values:
  // zero-extend both halves to i64, shift the second left by 32, OR them, then cast to index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Annotate all five index values with the unsigned range [0, 9007199254740991].
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag each assumed value with its workload ordinal (0..4).
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bind the three read-only inputs (bindings 0..2) and the write-only output (binding 3).
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load the full extent of each input (zero offsets, unit strides).
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Full-size result tensor, threaded through the forall as shared output %arg2.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // One iteration per (%arg0, %arg1) over the range (%39, %37); each iteration produces one 1x1x16x16 tile.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    // Operand slices: outer index %arg0 of %45 and outer index %arg1 of %46, both with %36 as the second-dimension size.
    // NOTE(review): %46 was loaded with %40 as its second dimension, yet its slice uses %36 - presumably equal at runtime; confirm.
    %extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    // Per-iteration 1x1x16x16 accumulator, filled with %cst and used as the mmt4d output.
    %50 = tensor.empty() : tensor<1x1x16x16xf32>
    %51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    %52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    // The %47 slice is selected by %arg0 only; the destination slice of %arg2 sits at [%arg0, %arg1].
    %extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Elementwise epilogue on the tile: the second input is indexed by (d0, d2).
    %53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      // Add the two inputs, then take the maximum with %cst (0.0); %out is not read.
      %54 = arith.addf %in, %in_3 : f32
      %55 = arith.maximumf %54, %cst : f32
      linalg.yield %55 : f32
    } -> tensor<1x1x16x16xf32>
    // Insert the finished tile into the shared output at [%arg0, %arg1].
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Write the forall result to binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
 // Tiled dispatch function: fill, mmt4d and the elementwise generic run per 1x1x16x16 tile inside an
 // scf.forall whose two induction variables are mapped to workgroup dimensions y and x.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Ten i32 constants read from the pipeline layout, ordinals 0 through 9.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each consecutive pair of constants becomes one index value: (first, zero-extended) | (second, zero-extended, << 32).
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range assumption [0, 9007199254740991] attached to each of the five index values.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Workload ordinals 0..4, one per assumed value.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0..2 are the read-only inputs; binding 3 is the write-only output.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Whole-tensor loads of the three inputs (offsets 0, strides 1).
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Destination tensor for the forall; it enters the loop as shared output %arg2.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Iteration space is (%39, %37); every iteration computes and inserts a single 1x1x16x16 tile.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    // Slice %45 at outer index %arg0 and %46 at outer index %arg1; both slices use %36 as the second-dimension size.
    // NOTE(review): the second dimension of %46 was loaded as %40, not %36 - presumably the same value at runtime; confirm.
    %extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    // Tile-local accumulator: created empty, filled with %cst, then passed as the mmt4d output.
    %50 = tensor.empty() : tensor<1x1x16x16xf32>
    %51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    %52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    // %47 is sliced by %arg0 alone; the output slice of %arg2 is taken at [%arg0, %arg1].
    %extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Four-loop parallel elementwise op; its second input uses the (d0, d2) indexing map.
    %53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      // Sum of the two inputs, then maximum with %cst (0.0); %out is unused.
      %54 = arith.addf %in, %in_3 : f32
      %55 = arith.maximumf %54, %cst : f32
      linalg.yield %55 : f32
    } -> tensor<1x1x16x16xf32>
    // Place the computed tile into %arg2 at [%arg0, %arg1].
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the assembled result tensor to binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %50 = tensor.empty() : tensor<1x1x16x16xf32>
    %51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    %52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    %extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    %53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      %54 = arith.addf %in, %in_3 : f32
      %55 = arith.maximumf %54, %cst : f32
      linalg.yield %55 : f32
    } -> tensor<1x1x16x16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
 // Dispatch function: for each workgroup tile it zero-fills an accumulator, runs linalg.mmt4d on one
 // row-slice of each operand, then adds a bias and clamps at zero (linalg.generic with addf + maximumf).
 // It takes no SSA arguments; every size comes from the 10 i32 constants of the pipeline layout and
 // data is reached through the four buffer bindings.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Load the ten 32-bit constants (ordinals 0-9) declared by the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Reassemble five 64-bit values from consecutive (low, high) i32 pairs: zero-extend both halves,
  // shift the high half left by 32, OR the halves together, then cast the result to index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Annotate each reassembled value with assumed unsigned bounds [0, 9007199254740991] (= 2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five bounded values as workload ordinals 0-4; %36-%40 are the dynamic sizes used below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs (two ?x?x16x1 matmul operands and a ?x16 bias);
  // binding 3 is the write-only ?x?x16x16 result. All use offset %c0 and 64-byte alignment.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load each input binding in full (zero offsets, unit strides).
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Destination tensor for the whole result; its tiles are written by the loop below.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Parallel loop over (%39, %37), mapped to workgroups y and x by the trailing mapping attribute.
  // Each iteration produces one 1x1x16x16 tile of the output.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    // Slice one outer row of each matmul operand; both slices take %36 entries along dim 1.
    // NOTE(review): %46 is declared with dim-1 extent %40 but is sliced with %36, and %arg1 ranges
    // over %37 while the result's dim 1 is %40 - presumably these sizes coincide at runtime; confirm.
    %extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    // Zero-filled 1x1x16x16 accumulator for this tile.
    %50 = tensor.empty() : tensor<1x1x16x16xf32>
    %51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    // Accumulate the tile product of the two slices into the zeroed accumulator.
    %52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    // Bias row for this tile (selected by %arg0) and the destination tile inside the shared output.
    %extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Elementwise epilogue: the bias is broadcast through the (d0, d2) map, added to the mmt4d
    // result, and the sum is clamped below at %cst (0.0) with maximumf.
    %53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      %54 = arith.addf %in, %in_3 : f32
      %55 = arith.maximumf %54, %cst : f32
      linalg.yield %55 : f32
    } -> tensor<1x1x16x16xf32>
    // Write the finished tile back into the shared output at [%arg0, %arg1, 0, 0].
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the assembled result to the write-only binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
 // Dispatch function: for each workgroup tile it zero-fills an accumulator, runs linalg.mmt4d on one
 // row-slice of each operand, then adds a bias and clamps at zero (linalg.generic with addf + maximumf).
 // It takes no SSA arguments; every size comes from the 10 i32 constants of the pipeline layout and
 // data is reached through the four buffer bindings. This snapshot contains no tensor.pad ops.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Load the ten 32-bit constants (ordinals 0-9) declared by the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Reassemble five 64-bit values from consecutive (low, high) i32 pairs: zero-extend both halves,
  // shift the high half left by 32, OR the halves together, then cast the result to index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Annotate each reassembled value with assumed unsigned bounds [0, 9007199254740991] (= 2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five bounded values as workload ordinals 0-4; %36-%40 are the dynamic sizes used below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs (two ?x?x16x1 matmul operands and a ?x16 bias);
  // binding 3 is the write-only ?x?x16x16 result. All use offset %c0 and 64-byte alignment.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load each input binding in full (zero offsets, unit strides).
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Destination tensor for the whole result; its tiles are written by the loop below.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Parallel loop over (%39, %37), mapped to workgroups y and x by the trailing mapping attribute.
  // Each iteration produces one 1x1x16x16 tile of the output.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    // Slice one outer row of each matmul operand; both slices take %36 entries along dim 1.
    // NOTE(review): %46 is declared with dim-1 extent %40 but is sliced with %36, and %arg1 ranges
    // over %37 while the result's dim 1 is %40 - presumably these sizes coincide at runtime; confirm.
    %extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    // Zero-filled 1x1x16x16 accumulator for this tile.
    %50 = tensor.empty() : tensor<1x1x16x16xf32>
    %51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    // Accumulate the tile product of the two slices into the zeroed accumulator.
    %52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    // Bias row for this tile (selected by %arg0) and the destination tile inside the shared output.
    %extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Elementwise epilogue: the bias is broadcast through the (d0, d2) map, added to the mmt4d
    // result, and the sum is clamped below at %cst (0.0) with maximumf.
    %53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      %54 = arith.addf %in, %in_3 : f32
      %55 = arith.maximumf %54, %cst : f32
      linalg.yield %55 : f32
    } -> tensor<1x1x16x16xf32>
    // Write the finished tile back into the shared output at [%arg0, %arg1, 0, 0].
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the assembled result to the write-only binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After PropagateDispatchSizeBoundsPass (iree-codegen-propagate-dispatch-size-bounds) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %50 = tensor.empty() : tensor<1x1x16x16xf32>
    %51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    %52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
    %extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    %53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      %54 = arith.addf %in, %in_3 : f32
      %55 = arith.maximumf %54, %cst : f32
      linalg.yield %55 : f32
    } -> tensor<1x1x16x16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- //
 // Dispatch function: loads two 4-D f32 operands and a 2-D f32 bias from
 // bindings 0-2, computes linalg.mmt4d followed by an elementwise
 // max(acc + bias, 0.0), and stores the 4-D result to binding 3.
 // In this snapshot the computation sits inside two nested scf.forall loops:
 // an outer loop mapped to workgroups and an inner loop over 2-row slabs of
 // each 16x16 output tile.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Load the ten i32 interface constants (ordinals 0 through 9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Combine each consecutive pair of i32 constants into one index value:
  // zero-extend both to i64, compute lo | (hi << 32), then cast to index.
  // Pair (0,1) -> %14, (2,3) -> %19, (4,5) -> %24, (6,7) -> %29, (8,9) -> %34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Annotate all five index values with the unsigned range
  // [0, 9007199254740991], i.e. [0, 2^53 - 1].
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0 through 4; %36..%40 are the
  // dynamic dimensions used by every tensor type below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs (two ?x?x16x1 operands and a ?x16 bias);
  // binding 3 is the write-only ?x?x16x16 output. All use offset %c0.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load the three inputs in full (zero offsets, unit strides).
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Destination tensor for the whole result.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Outer parallel loop over (%39, %37); %arg0 is mapped to workgroup y and
  // %arg1 to workgroup x. Each iteration yields one 1x1x16x16 output tile.
  // NOTE(review): the loop bound for %arg1 is %37 while dim 1 of the output
  // is %40 -- confirm these are expected to be equal at runtime.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    // Row %arg0 of the bias (1x16) and the output tile at (%arg0, %arg1) are
    // sliced once per outer iteration, outside the inner loop.
    %extracted_slice = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
    %extracted_slice_0 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Inner parallel loop: %arg3 = 0, 2, ..., 14 walks dim 2 of the tile in
    // slabs of 2 rows (eight 1x1x2x16 slabs per tile).
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice_0) -> (tensor<1x1x16x16xf32>) {
      // LHS slab: 2 rows of dim 2 starting at %arg3; RHS: full 16 rows.
      // NOTE(review): the RHS slice uses %36 as its dim-1 size although %46
      // is declared with dims {%37, %40} -- presumably the reduction extent
      // is taken from the LHS; confirm.
      %extracted_slice_1 = tensor.extract_slice %45[%arg0, 0, %arg3, 0] [1, %36, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x2x1xf32>
      %extracted_slice_2 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
      // Zero-filled 1x1x2x16 accumulator, then the mmt4d on the slab.
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x?x2x1xf32>, tensor<1x?x16x1xf32>) outs(%52 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      // Second-level bias slice: 2 elements of the 1x16 row starting at %arg3,
      // plus the matching 2-row slab of the output tile.
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [1, 2] [1, 1] : tensor<1x16xf32> to tensor<1x2xf32>
      %extracted_slice_4 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      // Elementwise epilogue: the bias is indexed by (d0, d2), so it is
      // broadcast along d1 and d3; result is max(acc + bias, 0.0).
      %54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_3 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_4 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_5: f32, %out: f32):
        %55 = arith.addf %in, %in_5 : f32
        %56 = arith.maximumf %55, %cst : f32
        linalg.yield %56 : f32
      } -> tensor<1x1x2x16xf32>
      // Write the finished 2x16 slab back into the tile at row offset %arg3.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Write the finished 16x16 tile back into the full result.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the complete result tensor to binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After CPUPrepareUkernelsPass (iree-codegen-cpu-prepare-ukernels) //----- //
 // Dispatch function: loads two 4-D f32 operands and a 2-D f32 bias from
 // bindings 0-2, computes linalg.mmt4d followed by an elementwise
 // max(acc + bias, 0.0), and stores the 4-D result to binding 3.
 // In this snapshot the bias is sliced directly from the loaded ?x16 tensor
 // inside the inner loop; only the output tile is sliced in the outer loop.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Load the ten i32 interface constants (ordinals 0 through 9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Combine each consecutive pair of i32 constants into one index value:
  // zero-extend both to i64, compute lo | (hi << 32), then cast to index.
  // Pair (0,1) -> %14, (2,3) -> %19, (4,5) -> %24, (6,7) -> %29, (8,9) -> %34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Annotate all five index values with the unsigned range
  // [0, 9007199254740991], i.e. [0, 2^53 - 1].
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0 through 4; %36..%40 are the
  // dynamic dimensions used by every tensor type below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs (two ?x?x16x1 operands and a ?x16 bias);
  // binding 3 is the write-only ?x?x16x16 output. All use offset %c0.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load the three inputs in full (zero offsets, unit strides).
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Destination tensor for the whole result.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Outer parallel loop over (%39, %37); %arg0 is mapped to workgroup y and
  // %arg1 to workgroup x. Each iteration yields one 1x1x16x16 output tile.
  // NOTE(review): the loop bound for %arg1 is %37 while dim 1 of the output
  // is %40 -- confirm these are expected to be equal at runtime.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    // Output tile at (%arg0, %arg1), shared by the inner loop below.
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Inner parallel loop: %arg3 = 0, 2, ..., 14 walks dim 2 of the tile in
    // slabs of 2 rows (eight 1x1x2x16 slabs per tile).
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      // LHS slab: 2 rows of dim 2 starting at %arg3; RHS: full 16 rows.
      // NOTE(review): the RHS slice uses %36 as its dim-1 size although %46
      // is declared with dims {%37, %40} -- presumably the reduction extent
      // is taken from the LHS; confirm.
      %extracted_slice_0 = tensor.extract_slice %45[%arg0, 0, %arg3, 0] [1, %36, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x2x1xf32>
      %extracted_slice_1 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
      // Zero-filled 1x1x2x16 accumulator, then the mmt4d on the slab.
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x?x2x1xf32>, tensor<1x?x16x1xf32>) outs(%52 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      // Bias slice taken in a single step from the full ?x16 tensor: row
      // %arg0, 2 elements starting at column %arg3. Also slice the matching
      // 2-row slab of the output tile.
      %extracted_slice_2 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      %extracted_slice_3 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      // Elementwise epilogue: the bias is indexed by (d0, d2), so it is
      // broadcast along d1 and d3; result is max(acc + bias, 0.0).
      %54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_2 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_3 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_4: f32, %out: f32):
        %55 = arith.addf %in, %in_4 : f32
        %56 = arith.maximumf %55, %cst : f32
        linalg.yield %56 : f32
      } -> tensor<1x1x2x16xf32>
      // Write the finished 2x16 slab back into the tile at row offset %arg3.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Write the finished 16x16 tile back into the full result.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the complete result tensor to binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After CPULowerToUKernelsPass (iree-codegen-cpu-lower-to-ukernels) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      %extracted_slice_0 = tensor.extract_slice %45[%arg0, 0, %arg3, 0] [1, %36, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x2x1xf32>
      %extracted_slice_1 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x?x2x1xf32>, tensor<1x?x16x1xf32>) outs(%52 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %extracted_slice_2 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      %extracted_slice_3 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      %54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_2 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_3 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_4: f32, %out: f32):
        %55 = arith.addf %in, %in_4 : f32
        %56 = arith.maximumf %55, %cst : f32
        linalg.yield %56 : f32
      } -> tensor<1x1x2x16xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- //
 // Dispatch function as printed after the pass named in the banner above
 // (iree-llvmcpu-tile-root-and-fuse-producer-consumer).
 // Overall structure: five index-sized values are decoded from ten i32 constants, three
 // read-only inputs and one write-only output are bound and loaded, and the result is
 // computed as linalg.mmt4d followed by an elementwise add and max-with-0.0 inside two
 // nested scf.forall loops. In this dump the mmt4d sits inside an scf.for loop that
 // walks dimension 1 of both operands one element at a time.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Ten 32-bit constants read from ordinals 0 through 9 of the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each consecutive pair of constants is merged into one 64-bit value: both halves are
  // zero-extended, the odd-ordinal half is shifted left by 32 and OR'ed onto the
  // even-ordinal half, and the result is cast (unsigned) to index. Five values result:
  // %14, %19, %24, %29, %34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach the same unsigned range to all five values: umin = 0, umax = 9007199254740991
  // (which equals 2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0 through 4; %36..%40 are the dynamic sizes
  // used by every shape below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs (two ?x?x16x1 tensors and one ?x16 tensor);
  // binding 3 is the write-only ?x?x16x16 result. All use offset %c0 and alignment 64.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load the full extent of each input binding as a tensor value.
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Destination tensor for the result, with the same shape as the output binding.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Outer parallel loop over (%39, %37), mapped to workgroup y / x (see the mapping
  // attribute on the closing brace). Each iteration works on one 1x1x16x16 tile of the
  // shared output at [%arg0, %arg1].
  // NOTE(review): the loop's second bound is %37 while dimension 1 of %48 is %40 -
  // presumably these are equal at runtime; confirm against the dispatch's caller.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Inner parallel loop: %arg3 steps 0, 2, ..., 14, so each iteration handles a
    // 2-wide slab along dimension 2 of the 16x16 tile.
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      // Zero-filled 1x1x2x16 accumulator that seeds the loop below.
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      // Sequential loop over [0, %36) with step 1. %arg5 indexes dimension 1 of both %45
      // and %46; the running result is threaded through iter_arg %arg6.
      // NOTE(review): the bound %36 is dimension 1 of %45, but dimension 1 of %46 is
      // declared as %40 - presumably equal at runtime; confirm.
      %53 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %52) -> (tensor<1x1x2x16xf32>) {
        // Size-1 slices along the looped dimension: 1x1x2x1 from %45, 1x1x16x1 from %46.
        %extracted_slice_2 = tensor.extract_slice %45[%arg0, %arg5, %arg3, 0] [1, 1, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x2x1xf32>
        %extracted_slice_3 = tensor.extract_slice %46[%arg1, %arg5, 0, 0] [1, 1, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x16x1xf32>
        // mmt4d on the two slices, using the carried value %arg6 as its output operand.
        %55 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_2, %extracted_slice_3 : tensor<1x1x2x1xf32>, tensor<1x1x16x1xf32>) outs(%arg6 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
        scf.yield %55 : tensor<1x1x2x16xf32>
      }
      // 1x2 slice of %47 at [%arg0, %arg3] and the matching 1x1x2x16 slab of the tile.
      %extracted_slice_0 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      %extracted_slice_1 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      // Elementwise epilogue: the second input is read through map (d0, d2), so each
      // of its values is reused across d1 and d3. The body adds it to the mmt4d result
      // and takes the maximum with %cst (0.0).
      %54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_0 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_1 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %55 = arith.addf %in, %in_2 : f32
        %56 = arith.maximumf %55, %cst : f32
        linalg.yield %56 : f32
      } -> tensor<1x1x2x16xf32>
      // Write the 1x1x2x16 result back into the tile at offset %arg3 along dimension 2.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Write the finished 1x1x16x16 tile back into the shared output at [%arg0, %arg1].
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the complete result tensor to binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
 // Dispatch function as printed after the pass named in the banner above
 // (iree-codegen-generic-vectorization).
 // The prologue (constant decoding, bindings, loads) and the two nested scf.forall loops
 // are as described inline below. Inside the scf.for loop the computation is expressed
 // with vector.transfer_read / vector.contract / vector.transfer_write; the linalg.fill
 // and the linalg.generic epilogue are still linalg ops in this dump.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Ten 32-bit constants read from ordinals 0 through 9 of the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each consecutive pair of constants is merged into one 64-bit value: both halves are
  // zero-extended, the odd-ordinal half is shifted left by 32 and OR'ed onto the
  // even-ordinal half, and the result is cast (unsigned) to index. Five values result:
  // %14, %19, %24, %29, %34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach the same unsigned range to all five values: umin = 0, umax = 9007199254740991
  // (which equals 2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0 through 4; %36..%40 are the dynamic sizes
  // used by every shape below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs (two ?x?x16x1 tensors and one ?x16 tensor);
  // binding 3 is the write-only ?x?x16x16 result. All use offset %c0 and alignment 64.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load the full extent of each input binding as a tensor value.
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Destination tensor for the result, with the same shape as the output binding.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Outer parallel loop over (%39, %37), mapped to workgroup y / x (see the mapping
  // attribute on the closing brace). Each iteration works on one 1x1x16x16 tile of the
  // shared output at [%arg0, %arg1].
  // NOTE(review): the loop's second bound is %37 while dimension 1 of %48 is %40 -
  // presumably these are equal at runtime; confirm against the dispatch's caller.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Inner parallel loop: %arg3 steps 0, 2, ..., 14, so each iteration handles a
    // 2-wide slab along dimension 2 of the 16x16 tile.
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      // Zero-filled 1x1x2x16 accumulator that seeds the loop below (still a linalg.fill).
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      // Sequential loop over [0, %36) with step 1. %arg5 indexes dimension 1 of both %45
      // and %46; the running result is threaded through iter_arg %arg6.
      // NOTE(review): the bound %36 is dimension 1 of %45, but dimension 1 of %46 is
      // declared as %40 - presumably equal at runtime; confirm.
      %53 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %52) -> (tensor<1x1x2x16xf32>) {
        // Size-1 slices along the looped dimension: 1x1x2x1 from %45, 1x1x16x1 from %46.
        %extracted_slice_2 = tensor.extract_slice %45[%arg0, %arg5, %arg3, 0] [1, 1, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x2x1xf32>
        %extracted_slice_3 = tensor.extract_slice %46[%arg1, %arg5, 0, 0] [1, 1, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x16x1xf32>
        // Read both slices and the carried accumulator as whole vectors. All reads are
        // marked in_bounds on every dimension; %cst is the padding operand.
        %55 = vector.transfer_read %extracted_slice_2[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x1xf32>, vector<1x1x2x1xf32>
        %56 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x16x1xf32>, vector<1x1x16x1xf32>
        %57 = vector.transfer_read %arg6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
        // Six-dimensional contraction with additive combining kind. Per the indexing
        // maps, %55 is read as (d0, d2, d3, d5), %56 as (d1, d2, d4, d5) and the
        // accumulator %57 as (d0, d1, d3, d4); d2 and d5 are the reduction iterators.
        %58 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %56, %57 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        // Write the updated accumulator back over %arg6 and carry it to the next iteration.
        %59 = vector.transfer_write %58, %arg6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
        scf.yield %59 : tensor<1x1x2x16xf32>
      }
      // 1x2 slice of %47 at [%arg0, %arg3] and the matching 1x1x2x16 slab of the tile.
      %extracted_slice_0 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      %extracted_slice_1 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      // Elementwise epilogue, still a linalg.generic in this dump: the second input is
      // read through map (d0, d2), so each of its values is reused across d1 and d3.
      // The body adds it to the loop result and takes the maximum with %cst (0.0).
      %54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_0 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_1 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %55 = arith.addf %in, %in_2 : f32
        %56 = arith.maximumf %55, %cst : f32
        linalg.yield %56 : f32
      } -> tensor<1x1x2x16xf32>
      // Write the 1x1x2x16 result back into the tile at offset %arg3 along dimension 2.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Write the finished 1x1x16x16 tile back into the shared output at [%arg0, %arg1].
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the complete result tensor to binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
 // Dispatch function as printed after iree-codegen-optimize-tensor-insert-extract-slices.
 // For each output tile it accumulates a vector.contract inside an scf.for loop, then a
 // linalg.generic adds a second operand and applies arith.maximumf against the constant 0.0.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Ten i32 constants (ordinals 0-9) are loaded from the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each consecutive pair of constants is fused into one 64-bit value: both are
  // zero-extended, the odd ordinal is shifted left by 32 and OR-ed with the even
  // ordinal, and the result is cast (unsigned) to index. This yields five indices.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Each fused index is annotated with the unsigned range [0, 9007199254740991],
  // i.e. an upper bound of 2^53 - 1.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // The five annotated values are tagged as workload ordinals 0 through 4; they
  // serve as the dynamic dimensions of the tensors below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs (two ?x?x16x1 tensors and one ?x16 tensor);
  // binding 3 is the write-only ?x?x16x16 result.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load each input in full (zero offsets, sizes equal to the bound dimensions).
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Outer parallel loop over (%39, %37), mapped to workgroups y and x; each
  // iteration owns one 1x1x16x16 tile of the result.
  // NOTE(review): %arg1 ranges over %37 but indexes dim 1 of the result, whose
  // size is %40; likewise the scf.for below runs to %36 while dim 1 of %46 is
  // %40. Nothing in this IR states these ordinals are equal -- presumably they
  // match at runtime; confirm against the dispatch's workload.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Inner parallel loop splits the tile's third dimension into eight steps of
    // two rows each (0 to 16, step 2).
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      %extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      // Zero-filled 1x1x2x16 tile, read once into a vector that seeds the
      // loop-carried accumulator.
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Reduction loop from 0 to %36, step 1. Each iteration reads a 1x1x2x1
      // vector from %45 and a 1x1x16x1 vector from %46, then contracts them into
      // the accumulator (d2 and d5 are the reduction iterators, kind = add).
      %54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
        %57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
        %58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
        %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %59 : vector<1x1x2x16xf32>
      }
      // The final accumulator is written back into the filled tile so the
      // tensor-level linalg.generic below can consume it.
      %55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
      // Elementwise epilogue: the 1x2 slice of %47 is indexed by (d0, d2), so it
      // is broadcast along d1 and d3; it is added to the accumulated tile and the
      // sum is passed through maximumf with 0.0.
      %56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %57 = arith.addf %in, %in_2 : f32
        %58 = arith.maximumf %57, %cst : f32
        linalg.yield %58 : f32
      } -> tensor<1x1x2x16xf32>
      // Place the 1x1x2x16 result at row offset %arg3 of the 1x1x16x16 tile.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %56 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Place the finished 1x1x16x16 tile at [%arg0, %arg1] of the full result.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the assembled result tensor to binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 // Dispatch function as printed after the canonicalize pass. Compared with the
 // dump immediately preceding it in this file, the only difference visible is in
 // the inner tensor.parallel_insert_slice, whose offsets are now the static
 // values [0, 0, %arg3, 0] rather than [%c0, %c0, %arg3, %c0].
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Load the ten i32 constants (ordinals 0-9) declared by the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Rebuild five 64-bit values from the constant pairs: (odd ordinal << 32) OR
  // (even ordinal), each zero-extended first, then cast unsigned to index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range annotation: every rebuilt index lies in [0, 9007199254740991]
  // (upper bound 2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Workload ordinals 0-4; used below as the dynamic tensor dimensions.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Buffer bindings: 0 and 1 are read-only ?x?x16x1 tensors, 2 is a read-only
  // ?x16 tensor, 3 is the write-only ?x?x16x16 output.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Whole-tensor loads of the three inputs (offsets all zero).
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Workgroup-level parallel loop over (%39, %37) with y/x mapping; one
  // 1x1x16x16 output tile per iteration.
  // NOTE(review): the loop bound for %arg1 is %37, yet %arg1 indexes dim 1 of
  // the output, sized %40; the scf.for bound %36 likewise differs from dim 1 of
  // %46 (%40). This IR does not show that these ordinals are equal -- presumably
  // they coincide at runtime; confirm with the dispatch's workload.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Second-level parallel loop: %arg3 steps through 0, 2, ..., 14, covering two
    // rows of the tile's third dimension per iteration.
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      %extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      // Accumulator setup: fill a 1x1x2x16 tile with 0.0 and read it as a vector
      // to use as the initial iter_arg.
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Sequential reduction over %arg5 in [0, %36): read 1x1x2x1 from %45 and
      // 1x1x16x1 from %46, then vector.contract them into the carried 1x1x2x16
      // accumulator (reduction iterators d2 and d5, combining kind add).
      %54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
        %57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
        %58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
        %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %59 : vector<1x1x2x16xf32>
      }
      // Materialize the accumulated vector as a tensor for the epilogue op.
      %55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
      // Epilogue: add the 1x2 slice of %47 (indexed by (d0, d2), hence broadcast
      // over d1 and d3) and take maximumf of the sum with 0.0.
      %56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %57 = arith.addf %in, %in_2 : f32
        %58 = arith.maximumf %57, %cst : f32
        linalg.yield %58 : f32
      } -> tensor<1x1x2x16xf32>
      // Insert the 1x1x2x16 result at row offset %arg3; the remaining offsets are
      // static zeros in this dump.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Insert the completed 1x1x16x16 tile at [%arg0, %arg1] of the output.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Write the full result tensor out through binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      %extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
      %54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
        %57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
        %58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
        %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %59 : vector<1x1x2x16xf32>
      }
      %55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
      %56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %57 = arith.addf %in, %in_2 : f32
        %58 = arith.maximumf %57, %cst : f32
        linalg.yield %58 : f32
      } -> tensor<1x1x2x16xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass (iree-llvmcpu-verify-vector-size-legality) //----- //
 // Dispatch entry point: a tiled mmt4d-style contraction followed by an
 // elementwise bias add and max-with-zero. The function takes no SSA arguments;
 // all dynamic sizes arrive as ten i32 constants and all data through four
 // bindings (0-2 read-only inputs, 3 write-only output). It targets llvm-cpu /
 // embedded-elf-x86_64 and is tagged with the Mmt4dTilingExpert pipeline.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // Shared constants: loop step / zero index, the 32-bit shift amount used to
  // rebuild 64-bit values, and the f32 zero used as fill value, read padding
  // and max operand.
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Load the ten i32 constants at ordinals 0..9. They are consumed below as
  // five (low, high) pairs.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Rebuild five index values, each as zext(low) | (zext(high) << 32):
  // (%0,%1) -> %14, (%2,%3) -> %19, (%4,%5) -> %24, (%6,%7) -> %29,
  // (%8,%9) -> %34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach an unsigned range assumption [0, 9007199254740991] (= 2^53 - 1) to
  // each of the five rebuilt values.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the assumed values as workload ordinals 0..4; %36..%40 are the dynamic
  // sizes used by every shape and loop bound below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings, all at offset 0 with 64-byte alignment:
  //   %41 binding(0) readonly  ?x?x16x1  f32, dynamic dims {%39, %36}
  //   %42 binding(1) readonly  ?x?x16x1  f32, dynamic dims {%37, %40}
  //   %43 binding(2) readonly  ?x16      f32, dynamic dim  {%38}
  //   %44 binding(3) writeonly ?x?x16x16 f32, dynamic dims {%39, %40}
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Load each input binding in full (zero offsets, unit strides): %45 and %46
  // are the two contraction operands, %47 is the bias.
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Uninitialized result tensor with the same dynamic dims as binding 3.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Outer distribution loop: one iteration per (%arg0, %arg1) in [0,%39) x
  // [0,%37), mapped to workgroup y and x (see the mapping attribute at the
  // closing brace). Each iteration produces one 1x1x16x16 tile of the result.
  // NOTE(review): the second bound is %37 while the result's second dim is %40,
  // and the inner scf.for runs to %36 while indexing dim 1 of %46 (size %40).
  // Presumably these sizes coincide at runtime -- confirm against the caller.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Inner loop: splits the 16-row tile into eight 2-row strips
    // (%arg3 = 0, 2, ..., 14), each a 1x1x2x16 sub-tile.
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      // Destination strip of the output tile, and the matching 1x2 bias slice
      // taken at row %arg0, columns %arg3..%arg3+1.
      %extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      %extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      // Zero-fill the accumulator tile, then read it into a vector so that the
      // reduction loop carries its running sum as an iter_arg.
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Reduction loop over %arg5 in [0, %36) with step 1; %arg6 is the
      // running 1x1x2x16 accumulator.
      %54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
        // Operand slices: 1x1x2x1 from %45 at [%arg0, %arg5, %arg3, 0] and
        // 1x1x16x1 from %46 at [%arg1, %arg5, 0, 0]; both reads are marked
        // in_bounds on every dim.
        %57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
        %58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
        // Six-dim contraction with additive combining: d2 and d5 are
        // reductions; the accumulator/result is indexed by (d0, d1, d3, d4).
        %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %59 : vector<1x1x2x16xf32>
      }
      // Write the finished accumulator back into tensor form.
      %55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
      // Elementwise epilogue over the 1x1x2x16 strip: the bias operand is
      // indexed by (d0, d2) only, so one bias value is reused across d1 and d3.
      %56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        // result = max(accumulator + bias, 0.0); %out is not read.
        %57 = arith.addf %in, %in_2 : f32
        %58 = arith.maximumf %57, %cst : f32
        linalg.yield %58 : f32
      } -> tensor<1x1x2x16xf32>
      // Publish the strip into rows %arg3..%arg3+1 of the 16x16 tile.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Publish the completed 16x16 tile at [%arg0, %arg1] of the full result.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the whole result tensor to the write-only output binding.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 // Snapshot of the dispatch function as printed after the canonicalize pass.
 // The function has no SSA arguments: it decodes five dynamic sizes from ten
 // i32 constants, loads three read-only inputs, computes a tiled contraction
 // with a bias-add + max(., 0) epilogue, and stores the result to binding 3.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // Constants: index 1 (reduction loop step), index 0 (offsets / lower bound),
  // i64 32 (shift for the high halves), f32 0.0 (fill, padding, max operand).
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Ten i32 constant loads, ordinals 0 through 9.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Five repetitions of the same pattern: zero-extend an even-ordinal constant
  // (low half) and the following odd-ordinal constant (high half) to i64,
  // shift the high half left by 32, OR them, and cast the i64 to index.
  // Results: %14, %19, %24, %29, %34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range annotation: every decoded size is assumed to lie in
  // [0, 9007199254740991], i.e. [0, 2^53 - 1], as an unsigned value.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Workload ordinals 0..4. Where each is used below:
  //   %36: dim 1 of %45 and the reduction trip count
  //   %37: dim 0 of %46 and the second outer-forall bound
  //   %38: dim 0 of the bias %47
  //   %39: dim 0 of %45, dim 0 of the result, first outer-forall bound
  //   %40: dim 1 of %46 and dim 1 of the result
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Buffer bindings 0..3 (64-byte aligned, offset 0). Bindings 0-2 are
  // read-only inputs; binding 3 is the write-only ?x?x16x16 output.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Materialize the three inputs as whole tensors.
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Destination tensor for the forall results; contents are undefined until
  // tiles are inserted.
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Workgroup-level tiling: (%arg0, %arg1) ranges over [0,%39) x [0,%37) and
  // is mapped to workgroup ids y and x respectively.
  // NOTE(review): %arg1 indexes dim 1 of the result (extent %40) but is bounded
  // by %37, and %arg5 below indexes dim 1 of %46 (extent %40) but is bounded by
  // %36. The reads are flagged in_bounds, so this relies on those sizes
  // agreeing at runtime -- TODO confirm with the producer of this dispatch.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    // The 1x1x16x16 output tile owned by this iteration.
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Second tiling level: step through dim 2 of the tile two rows at a time.
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      // %extracted_slice_0: the 2x16 output strip used as the generic's init.
      // %extracted_slice_1: two bias values at [%arg0, %arg3 .. %arg3+1].
      %extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      %extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      // Accumulator initialization: fill with 0.0 and read it as a vector.
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Serial reduction: %36 iterations, each adding one contraction result
      // into the loop-carried vector %arg6.
      %54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
        // Two rows of %45 (starting at row %arg3) and all 16 rows of %46 for
        // reduction index %arg5.
        %57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
        %58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
        // Multiply-accumulate: lhs indexed (d0,d2,d3,d5), rhs (d1,d2,d4,d5),
        // acc (d0,d1,d3,d4); d2 and d5 are summed away with kind = add.
        %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %59 : vector<1x1x2x16xf32>
      }
      // Convert the accumulated vector back into a tensor value.
      %55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
      // Fully parallel elementwise op: adds the bias (indexed by d0 and d2, so
      // one bias value is reused across d1 and d3) and takes the maximum with
      // 0.0.
      %56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        // %in = accumulated value, %in_2 = bias; the incoming %out is ignored.
        %57 = arith.addf %in, %in_2 : f32
        %58 = arith.maximumf %57, %cst : f32
        linalg.yield %58 : f32
      } -> tensor<1x1x2x16xf32>
      // Insert the 2-row strip back into the 16x16 tile at row offset %arg3.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Insert the finished tile into the full result at [%arg0, %arg1, 0, 0].
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Write the assembled result out through binding 3.
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      %51 = tensor.empty() : tensor<1x1x2x16xf32>
      %extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      %extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
      %54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
        %57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
        %58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
        %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %59 : vector<1x1x2x16xf32>
      }
      %55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
      %56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %57 = arith.addf %in, %in_2 : f32
        %58 = arith.maximumf %57, %cst : f32
        linalg.yield %58 : f32
      } -> tensor<1x1x2x16xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
 // Dispatch body: a vector.contract accumulated over a dynamic reduction loop,
 // followed by an elementwise add of a third input and a max against 0.0
 // (bias + ReLU, going by the function name). In this dump the outer
 // scf.forall's shared_outs is seeded from a load of output binding(3) (%48);
 // the tensor.empty %49 further down has no remaining uses in this function.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Ten i32 constants (ordinals 0-9); consecutive pairs are combined below.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // For each pair: zero-extend both halves to i64, shift the odd-ordinal value
  // left by 32, OR it with the even-ordinal value, then cast the result to index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range assumption on all five index values: [0, 9007199254740991] (2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five assumed values as workload ordinals 0 through 4.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs, binding 3 is the write-only output.
  // The values in braces are the dynamic dimension sizes of each tensor.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Full-extent loads of the three inputs.
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Full-extent load of the write-only output binding; this value is the init
  // of the outer scf.forall's shared_outs below.
  %48 = iree_tensor_ext.dispatch.tensor.load %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40} -> tensor<?x?x16x16xf32>
  // No op in this function uses %49.
  %49 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  // Outer parallel loop over (%39, %37), mapped to workgroups y/x; each
  // iteration produces one 1x1x16x16 tile of the output.
  // NOTE(review): %arg1 is bounded by %37 but indexes an output dim of size %40,
  // the reduction loop below is bounded by %36 but indexes a dim of %46 of size
  // %40, and %arg0 (bounded by %39) indexes a dim of %47 of size %38. Presumably
  // these values agree at runtime -- confirm against the caller.
  %50 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Inner parallel loop over the third tile dimension in steps of 2 (0, 2, ..., 14).
    %51 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      // 1x1x2x16 temporary that holds the accumulator for this sub-tile.
      %52 = tensor.empty() : tensor<1x1x2x16xf32>
      %extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      // 1x2 slice of the third input (%47) at [%arg0, %arg3]; added after the contraction.
      %extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      // Zero-fill the temporary and read it back as the initial accumulator vector.
      %53 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%52 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %54 = vector.transfer_read %53[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Reduction loop from 0 to %36 in steps of 1; the accumulator is carried in %arg6.
      %55 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %54) -> (vector<1x1x2x16xf32>) {
        // 2x1 slice of %45 and 16x1 slice of %46 for this reduction step.
        %58 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
        %59 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
        // Additive contraction over d2 and d5 into the 1x1x2x16 accumulator.
        %60 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %60 : vector<1x1x2x16xf32>
      }
      // Write the final accumulator back into the filled temporary.
      %56 = vector.transfer_write %55, %53[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
      // Elementwise epilogue: add the %47 slice (broadcast via map (d0, d2)), then
      // take the maximum with 0.0. The result goes into the output sub-tile slice.
      %57 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%56, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %58 = arith.addf %in, %in_2 : f32
        %59 = arith.maximumf %58, %cst : f32
        linalg.yield %59 : f32
      } -> tensor<1x1x2x16xf32>
      // Place the 1x1x2x16 result at offset %arg3 of the 16x16 tile.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %57 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Place the finished 16x16 tile at [%arg0, %arg1] of the full output tensor.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %51 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the full result tensor to output binding(3).
  iree_tensor_ext.dispatch.tensor.store %50, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After EmptyTensorToAllocTensorPass (empty-tensor-to-alloc-tensor) //----- //
 // Dispatch body: a vector.contract accumulated over a dynamic reduction loop,
 // followed by an elementwise add of a third input and a max against 0.0
 // (bias + ReLU, going by the function name). In this dump the per-sub-tile
 // temporary is a bufferization.alloc_tensor, and the outer scf.forall's
 // shared_outs is seeded from a load of output binding(3) (%48).
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  // Ten i32 constants (ordinals 0-9); consecutive pairs are combined below.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // For each pair: zero-extend both halves to i64, shift the odd-ordinal value
  // left by 32, OR it with the even-ordinal value, then cast the result to index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range assumption on all five index values: [0, 9007199254740991] (2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five assumed values as workload ordinals 0 through 4.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Bindings 0-2 are read-only inputs, binding 3 is the write-only output.
  // The values in braces are the dynamic dimension sizes of each tensor.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  // Full-extent loads of the three inputs.
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  // Full-extent load of the write-only output binding; this value is the init
  // of the outer scf.forall's shared_outs below.
  %48 = iree_tensor_ext.dispatch.tensor.load %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40} -> tensor<?x?x16x16xf32>
  // Outer parallel loop over (%39, %37), mapped to workgroups y/x; each
  // iteration produces one 1x1x16x16 tile of the output.
  // NOTE(review): %arg1 is bounded by %37 but indexes an output dim of size %40,
  // the reduction loop below is bounded by %36 but indexes a dim of %46 of size
  // %40, and %arg0 (bounded by %39) indexes a dim of %47 of size %38. Presumably
  // these values agree at runtime -- confirm against the caller.
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    // Inner parallel loop over the third tile dimension in steps of 2 (0, 2, ..., 14).
    %50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
      // 1x1x2x16 temporary for the accumulator, expressed as an explicit
      // bufferization.alloc_tensor in this dump.
      %51 = bufferization.alloc_tensor() : tensor<1x1x2x16xf32>
      %extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      // 1x2 slice of the third input (%47) at [%arg0, %arg3]; added after the contraction.
      %extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
      // Zero-fill the temporary and read it back as the initial accumulator vector.
      %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Reduction loop from 0 to %36 in steps of 1; the accumulator is carried in %arg6.
      %54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
        // 2x1 slice of %45 and 16x1 slice of %46 for this reduction step.
        %57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
        %58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
        // Additive contraction over d2 and d5 into the 1x1x2x16 accumulator.
        %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %59 : vector<1x1x2x16xf32>
      }
      // Write the final accumulator back into the filled temporary.
      %55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
      // Elementwise epilogue: add the %47 slice (broadcast via map (d0, d2)), then
      // take the maximum with 0.0. The result goes into the output sub-tile slice.
      %56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %57 = arith.addf %in, %in_2 : f32
        %58 = arith.maximumf %57, %cst : f32
        linalg.yield %58 : f32
      } -> tensor<1x1x2x16xf32>
      // Place the 1x1x2x16 result at offset %arg3 of the 16x16 tile.
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    // Place the finished 16x16 tile at [%arg0, %arg1] of the full output tensor.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the full result tensor to output binding(3).
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
 }

 // -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
 // Dispatch entry point: an f32 contraction over 16x1 / 16x1 inner tiles
 // accumulated into 16x16 output tiles, followed by a bias add and a clamp at
 // zero (arith.maximumf against 0.0). All operands are already memrefs here;
 // the only scratch storage is a 1x1x2x16 alloca used as the accumulator.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // %cst doubles as the fill value, the transfer_read padding value and the
  // lower clamp of the final maximumf.
  %cst = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Accumulator scratch for one 2x16 slice of an output tile (64-byte aligned).
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Ten i32 push constants; consecutive pairs are recombined below into five
  // 64-bit values.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each group of five ops computes index(zext(lo) | (zext(hi) << 32)), with
  // the even ordinal as the low word and the odd ordinal as the high word.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range annotation: every reassembled value is assumed to lie in
  // [0, 9007199254740991], i.e. [0, 2^53 - 1].
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0..4; they provide the dynamic
  // extents of the buffers bound below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // binding(0): read-only contraction operand, ?x?x16x1 with extents {%39, %36}.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // binding(1): read-only contraction operand, ?x?x16x1 with extents {%37, %40}.
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // binding(2): read-only bias operand, ?x16 with extent {%38}.
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  // binding(3): the result buffer (no ReadOnly flag), ?x?x16x16 with extents {%39, %40}.
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer parallel loop: one iteration per 16x16 output tile; %arg0 maps to
  // workgroup y and %arg1 to workgroup x (see the mapping attribute at the end).
  // NOTE(review): the second bound is %37 while the result's second dynamic
  // extent is %40 -- presumably equal at runtime; confirm.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // The 16x16 output tile owned by this iteration.
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Inner parallel loop: walk the tile two rows at a time (%arg2 = 0, 2, ..., 14).
    scf.forall (%arg2) = (0) to (16) step (2) {
      // 2x16 destination slice of the output tile and the matching 1x2 bias slice.
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Zero the scratch accumulator, then load it as the initial loop-carried vector.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Reduction loop over %arg3 in [0, %36): each step reads a 2x1 slice from
      // binding(0) and a 16x1 slice from binding(1) and accumulates their
      // contraction (kind = add) into the 2x16 vector carried in %arg4.
      // NOTE(review): %42 is indexed by %arg3 along a dimension whose declared
      // extent is %40, not %36, and the reads are marked in_bounds -- confirm
      // the two extents agree.
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
        %47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
        %48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
        %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %49 : vector<1x1x2x16xf32>
      }
      // Spill the accumulated vector back to the scratch buffer for the epilogue.
      vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
      // Epilogue: out = max(acc + bias, 0). The bias is indexed by (d0, d2)
      // only, so it is broadcast along d1 and d3.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_4: f32, %out: f32):
        %47 = arith.addf %in, %in_4 : f32
        %48 = arith.maximumf %47, %cst : f32
        linalg.yield %48 : f32
      }
      // %subview_3 has the same source, offsets, sizes and strides as
      // %subview_1, so the copy below reads and writes the same elements.
      %subview_3 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
    // %subview_0 is defined identically to %subview; the copy below is again
    // a copy of the tile onto itself.
    %subview_0 = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Whole-buffer copy of %44 onto itself (input and output are the same value).
  // NOTE(review): the three self-copies look like leftovers of the
  // tensor-to-memref conversion that a later cleanup would be expected to
  // remove -- confirm against the subsequent dumps.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>) outs(%44 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  }
  return
 }

 // -----// IR Dump After ResolveShapedTypeResultDimsPass (resolve-shaped-type-result-dims) //----- //
 // State of the dispatch function after resolve-shaped-type-result-dims.
 // It computes an f32 contraction over 16x1 / 16x1 inner tiles into 16x16
 // output tiles, then adds a bias and clamps at zero. Storage is memref-based;
 // a 1x1x2x16 alloca serves as the accumulator scratch.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // 0.0 is used as fill value, as transfer_read padding and as the clamp floor.
  %cst = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 64-byte aligned scratch holding one 2x16 accumulator slice.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Load the ten i32 push constants (ordinals 0..9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Rebuild five 64-bit indices: low word from the even ordinal, high word
  // (shifted left by 32) from the following odd ordinal, OR-ed together.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach an unsigned range of [0, 2^53 - 1] to each of the five indices.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Workload ordinals 0..4; used below as dynamic buffer extents and loop bounds.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // binding(0): read-only ?x?x16x1 operand, dynamic extents {%39, %36}.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // binding(1): read-only ?x?x16x1 operand, dynamic extents {%37, %40}.
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // binding(2): read-only ?x16 bias, dynamic extent {%38}.
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  // binding(3): ?x?x16x16 result buffer (not flagged ReadOnly), extents {%39, %40}.
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Workgroup-level loop over output tiles: %arg0 -> workgroup y, %arg1 ->
  // workgroup x, per the trailing mapping attribute.
  // NOTE(review): iterates %arg1 up to %37 although the result's second
  // dynamic extent is %40 -- presumably the same value at runtime; confirm.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // View of the single 16x16 output tile handled by this iteration.
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Process the tile in eight 2-row strips (%arg2 steps by 2 over [0, 16)).
    scf.forall (%arg2) = (0) to (16) step (2) {
      // Destination strip (2x16) and the two bias values that go with it.
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Initialise the accumulator: fill scratch with 0.0 and read it back as a vector.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Sequential reduction over %arg3 in [0, %36) with the 2x16 accumulator
      // carried as an iter_arg; each step contracts a 2x1 slice of binding(0)
      // with a 16x1 slice of binding(1).
      // NOTE(review): binding(1)'s second extent is declared as %40 while the
      // trip count is %36, and both reads assert in_bounds -- confirm they match.
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
        %47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
        %48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
        %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %49 : vector<1x1x2x16xf32>
      }
      // Store the final accumulator to scratch so the elementwise epilogue can read it.
      vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
      // Elementwise epilogue into the destination strip: max(acc + bias, 0.0),
      // with the bias read through (d0, d2) and thus broadcast over d1 and d3.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_4: f32, %out: f32):
        %47 = arith.addf %in, %in_4 : f32
        %48 = arith.maximumf %47, %cst : f32
        linalg.yield %48 : f32
      }
      // Same view parameters as %subview_1, so the following copy has an
      // identical source and destination region.
      %subview_3 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
    // Same view parameters as %subview; the tile is copied onto itself.
    %subview_0 = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Copy of the entire result buffer onto itself (ins and outs are both %44).
  // NOTE(review): this and the two self-copies above are still present after
  // this pass; presumably a later cleanup folds them away -- confirm in the
  // following dumps.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>) outs(%44 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  }
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) in (%39, %37) {
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2) = (0) to (16) step (2) {
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
        %47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
        %48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
        %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %49 : vector<1x1x2x16xf32>
      }
      vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_4: f32, %out: f32):
        %47 = arith.addf %in, %in_4 : f32
        %48 = arith.maximumf %47, %cst : f32
        linalg.yield %48 : f32
      }
      %subview_3 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
    %subview_0 = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 // Snapshot of the dispatch function as printed after the CSE pass.
 // The function takes no arguments: it loads ten i32 interface constants,
 // combines them pairwise into five index values, binds four storage buffers,
 // and then runs a two-level scf.forall nest that accumulates a vector.contract
 // over an scf.for loop, adds a second operand, clamps at %cst (0.0) with
 // arith.maximumf, and writes into a subview of binding 3.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 1x1x2x16 f32 scratch buffer used as the accumulator tile inside the inner forall.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Ten i32 interface constants, ordinals 0 through 9.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each consecutive pair of constants is zero-extended to i64 and merged as
  // (low | (high << 32)), then cast to index: (%0,%1)->%14, (%2,%3)->%19,
  // (%4,%5)->%24, (%6,%7)->%29, (%8,%9)->%34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range annotations on the five index values: each is tagged with
  // umin = 0 and umax = 9007199254740991 (2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // The annotated values are tagged as workload ordinals 0..4 (%36..%40).
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Buffer bindings. Bindings 0, 1 and 2 carry the "ReadOnly|Indirect" flags;
  // binding 3 carries only Indirect and is the one written below. Each binding
  // is followed by a 64-byte alignment assumption.
  // NOTE(review): the loop bounds used below (%39, %37, %36) are not the same
  // SSA values as some dynamic sizes of the buffers they index (%44 and %42
  // are sized with %40, %43 with %38); presumably these are equal at runtime
  // -- confirm against the dispatch's workload.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer loop over (%arg0, %arg1) in (%39, %37); its mapping attribute assigns
  // %arg0 to workgroup y and %arg1 to workgroup x.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // 1x1x16x16 tile of the binding-3 buffer at [%arg0, %arg1, 0, 0].
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Inner loop walks the third dimension of the tile from 0 to 16 in steps of 2.
    scf.forall (%arg2) = (0) to (16) step (2) {
      // 1x1x2x16 slice of the tile, and the matching 1x2 slice of binding 2.
      %subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Fill the scratch buffer with %cst (0.0) and read it back as the
      // initial accumulator vector.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Loop over %arg3 from 0 to %36 in steps of 1, carrying the accumulator
      // in %arg4.
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
        // Read a 1x1x2x1 vector from binding 0 and a 1x1x16x1 vector from binding 1.
        %47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
        %48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
        // Six-dimensional contraction (d2 and d5 are the reduction iterators)
        // with additive combining, accumulated into %arg4.
        %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %49 : vector<1x1x2x16xf32>
      }
      // Store the accumulated result back into the scratch buffer.
      vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
      // Elementwise epilogue: add the %subview_1 value (indexed by (d0, d2), so
      // it is reused across d1 and d3) to the accumulator, take the maximum
      // with %cst (0.0), and write the result into %subview_0.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %47 = arith.addf %in, %in_2 : f32
        %48 = arith.maximumf %47, %cst : f32
        linalg.yield %48 : f32
      }
      // Identity copy whose input and output are the same SSA value (%subview_0).
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      ^bb0(%in: f32, %out: f32):
        linalg.yield %in : f32
      }
    }
    // Identity copy whose input and output are the same SSA value (%subview).
    linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 // Snapshot of the dispatch function as printed after the Canonicalizer pass.
 // The function takes no arguments: it loads ten i32 interface constants,
 // combines them pairwise into five index values, binds four storage buffers,
 // and then runs a two-level scf.forall nest that accumulates a vector.contract
 // over an scf.for loop, adds a second operand, clamps at %cst (0.0) with
 // arith.maximumf, and writes into a subview of binding 3. In this snapshot
 // the elementwise epilogue is the last op in the inner loop body.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 1x1x2x16 f32 scratch buffer used as the accumulator tile inside the inner forall.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Ten i32 interface constants, ordinals 0 through 9.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each consecutive pair of constants is zero-extended to i64 and merged as
  // (low | (high << 32)), then cast to index: (%0,%1)->%14, (%2,%3)->%19,
  // (%4,%5)->%24, (%6,%7)->%29, (%8,%9)->%34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range annotations on the five index values: each is tagged with
  // umin = 0 and umax = 9007199254740991 (2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // The annotated values are tagged as workload ordinals 0..4 (%36..%40).
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Buffer bindings. Bindings 0, 1 and 2 carry the "ReadOnly|Indirect" flags;
  // binding 3 carries only Indirect and is the one written below. Each binding
  // is followed by a 64-byte alignment assumption.
  // NOTE(review): the loop bounds used below (%39, %37, %36) are not the same
  // SSA values as some dynamic sizes of the buffers they index (%44 and %42
  // are sized with %40, %43 with %38); presumably these are equal at runtime
  // -- confirm against the dispatch's workload.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer loop over (%arg0, %arg1) in (%39, %37); its mapping attribute assigns
  // %arg0 to workgroup y and %arg1 to workgroup x.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // 1x1x16x16 tile of the binding-3 buffer at [%arg0, %arg1, 0, 0].
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Inner loop walks the third dimension of the tile from 0 to 16 in steps of 2.
    scf.forall (%arg2) = (0) to (16) step (2) {
      // 1x1x2x16 slice of the tile, and the matching 1x2 slice of binding 2.
      %subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Fill the scratch buffer with %cst (0.0) and read it back as the
      // initial accumulator vector.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
      // Loop over %arg3 from 0 to %36 in steps of 1, carrying the accumulator
      // in %arg4.
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
        // Read a 1x1x2x1 vector from binding 0 and a 1x1x16x1 vector from binding 1.
        %47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
        %48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
        // Six-dimensional contraction (d2 and d5 are the reduction iterators)
        // with additive combining, accumulated into %arg4.
        %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %49 : vector<1x1x2x16xf32>
      }
      // Store the accumulated result back into the scratch buffer.
      vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
      // Elementwise epilogue: add the %subview_1 value (indexed by (d0, d2), so
      // it is reused across d1 and d3) to the accumulator, take the maximum
      // with %cst (0.0), and write the result into %subview_0.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %47 = arith.addf %in, %in_2 : f32
        %48 = arith.maximumf %47, %cst : f32
        linalg.yield %48 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) in (%39, %37) {
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2) = (0) to (16) step (2) {
      %subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
        %47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
        %48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
        %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
        scf.yield %49 : vector<1x1x2x16xf32>
      }
      vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %47 = arith.addf %in, %in_2 : f32
        %48 = arith.maximumf %47, %cst : f32
        linalg.yield %48 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After LLVMCPUMmt4dVectorLoweringPass (iree-llvmcpu-mmt4d-vector-lowering) //----- //
 // Dispatch function as printed after the mmt4d vector-lowering pass.
 // Overall shape of the body: ten i32 push constants are folded into five
 // index values, four buffer bindings are materialized as memrefs, and a
 // two-level scf.forall nest computes a 2x16 tile per iteration by
 // accumulating vector.contract results in an scf.for loop, then applying
 // an add + max-with-zero epilogue through linalg.generic.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 1x1x2x16 f32 scratch buffer; it is zero-filled, read, and rewritten inside the inner forall below.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Load the ten i32 constants (ordinals 0..9) declared by the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each consecutive pair of constants forms one 64-bit value: the even ordinal
  // is the low word, the odd ordinal is shifted left by 32 and OR-ed in, and the
  // result is cast (unsigned) to index. This yields %14, %19, %24, %29, %34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range annotations: every reconstructed index is asserted to lie in
  // [0, 9007199254740991] (the upper bound equals 2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0..4; %36..%40 are used below as
  // dynamic memref sizes and loop bounds.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Binding 0 (read-only): ?x?x16x1 f32 with dynamic sizes {%39, %36}; first vector.contract operand.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 1 (read-only): ?x?x16x1 f32 with dynamic sizes {%37, %40}; second vector.contract operand.
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 2 (read-only): ?x16 f32 with dynamic size {%38}; the addend consumed by the linalg.generic epilogue.
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 3 (no ReadOnly flag): ?x?x16x16 f32 with dynamic sizes {%39, %40}; destination of the epilogue.
  // NOTE(review): dim 1 of %42 and %44 is declared as %40, but it is indexed by
  // %arg3 (bounded by %36) and %arg1 (bounded by %37) respectively. Presumably
  // these workload values agree at runtime - confirm against the dispatch site.
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer loop over (%39, %37); the trailing mapping attribute assigns %arg0 to workgroup y and %arg1 to workgroup x.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // 1x1x16x16 destination tile selected by (%arg0, %arg1).
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Inner loop walks the 16 rows of the tile two at a time.
    scf.forall (%arg2) = (0) to (16) step (2) {
      // 2-row slice of the destination tile and the matching 1x2 slice of binding 2.
      %subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Zero the scratch buffer, then load it as the initial 2x16 accumulator.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x2x16xf32>, vector<2x16xf32>
      // The loop-carried value is still 4-D at this stage, so the 2-D vector is broadcast up to 1x1x2x16.
      %46 = vector.broadcast %45 : vector<2x16xf32> to vector<1x1x2x16xf32>
      // Reduction loop: %arg3 runs from 0 to %36 in steps of 1.
      %47 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %46) -> (vector<1x1x2x16xf32>) {
        // 2x1 slice of binding 0 at [%arg0, %arg3, %arg2, 0] and 16x1 slice of binding 1 at [%arg1, %arg3, 0, 0].
        %49 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<2x1xf32>
        %50 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<16x1xf32>
        // Peel the two leading unit dims off the carried accumulator before contracting.
        %51 = vector.extract %arg4[0, 0] : vector<2x16xf32> from vector<1x1x2x16xf32>
        // acc[d0, d1] += lhs[d0, d2] * rhs[d1, d2], with d2 as the reduction dimension.
        %52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %50, %51 : vector<2x1xf32>, vector<16x1xf32> into vector<2x16xf32>
        // Re-add the unit dims (in two steps) so the yielded type matches the iter_arg type.
        %53 = vector.broadcast %52 : vector<2x16xf32> to vector<1x2x16xf32>
        %54 = vector.broadcast %53 : vector<1x2x16xf32> to vector<1x1x2x16xf32>
        scf.yield %54 : vector<1x1x2x16xf32>
      }
      // Store the final accumulator back into the scratch buffer.
      %48 = vector.extract %47[0, 0] : vector<2x16xf32> from vector<1x1x2x16xf32>
      vector.transfer_write %48, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<2x16xf32>, memref<1x1x2x16xf32>
      // Epilogue: out = max(scratch + addend, 0). The second input uses the map (d0, d2), so one addend value is shared across d1 and d3.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %49 = arith.addf %in, %in_2 : f32
        %50 = arith.maximumf %49, %cst : f32
        linalg.yield %50 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After DropVectorUnitDimsPass (iree-codegen-drop-vector-unit-dims) //----- //
 // Dispatch function as printed after the drop-vector-unit-dims pass.
 // In this snapshot the reduction loop carries a 2-D vector<2x16xf32>, and the
 // vector reads go through rank-reducing memref.subview ops instead of reading
 // 4-D memrefs directly. The prologue (constant loads, index reconstruction,
 // buffer bindings) and the linalg.generic epilogue operate on the same types
 // as the memrefs declared here.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 1x1x2x16 f32 scratch buffer; accessed below both directly and through rank-reduced 2x16 views.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Load the ten i32 constants (ordinals 0..9) declared by the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Each consecutive pair of constants forms one 64-bit value: the even ordinal
  // is the low word, the odd ordinal is shifted left by 32 and OR-ed in, and the
  // result is cast (unsigned) to index. This yields %14, %19, %24, %29, %34.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Range annotations: every reconstructed index is asserted to lie in
  // [0, 9007199254740991] (the upper bound equals 2^53 - 1).
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0..4; %36..%40 are used below as
  // dynamic memref sizes and loop bounds.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Binding 0 (read-only): ?x?x16x1 f32 with dynamic sizes {%39, %36}; first vector.contract operand.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 1 (read-only): ?x?x16x1 f32 with dynamic sizes {%37, %40}; second vector.contract operand.
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 2 (read-only): ?x16 f32 with dynamic size {%38}; the addend consumed by the linalg.generic epilogue.
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 3 (no ReadOnly flag): ?x?x16x16 f32 with dynamic sizes {%39, %40}; destination of the epilogue.
  // NOTE(review): dim 1 of %42 and %44 is declared as %40, but it is indexed by
  // %arg3 (bounded by %36) and %arg1 (bounded by %37) respectively. Presumably
  // these workload values agree at runtime - confirm against the dispatch site.
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer loop over (%39, %37); the trailing mapping attribute assigns %arg0 to workgroup y and %arg1 to workgroup x.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // 1x1x16x16 destination tile selected by (%arg0, %arg1).
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Inner loop walks the 16 rows of the tile two at a time.
    scf.forall (%arg2) = (0) to (16) step (2) {
      // 2-row slice of the destination tile and the matching 1x2 slice of binding 2.
      %subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Zero the scratch buffer, then load it as the initial accumulator through a rank-reduced 2x16 view.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %45 = vector.transfer_read %subview_2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16xf32>, vector<2x16xf32>
      // Reduction loop: %arg3 runs from 0 to %36 in steps of 1, carrying the 2x16 accumulator directly.
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<2x16xf32>) {
        // Full-extent view of binding 0 with the trailing unit dim dropped (?x?x16x1 -> ?x?x16).
        // The memref.dim / subview ops do not depend on %arg3 but are printed inside the loop body here.
        %dim = memref.dim %41, %c0 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
        %dim_4 = memref.dim %41, %c1 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
        %subview_5 = memref.subview %41[0, 0, 0, 0] [%dim, %dim_4, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        // Read 2 contiguous elements starting at [%arg0, %arg3, %arg2], then reshape to 2x1 for the contraction.
        %47 = vector.transfer_read %subview_5[%arg0, %arg3, %arg2], %cst {in_bounds = [true]} : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
        %48 = vector.shape_cast %47 : vector<2xf32> to vector<2x1xf32>
        // Same treatment for binding 1: drop the unit dim, read 16 elements at [%arg1, %arg3, 0], reshape to 16x1.
        %dim_6 = memref.dim %42, %c0 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
        %dim_7 = memref.dim %42, %c1 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
        %subview_8 = memref.subview %42[0, 0, 0, 0] [%dim_6, %dim_7, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %49 = vector.transfer_read %subview_8[%arg1, %arg3, %c0], %cst {in_bounds = [true]} : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
        %50 = vector.shape_cast %49 : vector<16xf32> to vector<16x1xf32>
        // acc[d0, d1] += lhs[d0, d2] * rhs[d1, d2], with d2 as the reduction dimension.
        %51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %50, %arg4 : vector<2x1xf32>, vector<16x1xf32> into vector<2x16xf32>
        scf.yield %51 : vector<2x16xf32>
      }
      // Store the final accumulator back into the scratch buffer through a second rank-reduced view.
      %subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      vector.transfer_write %46, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<2x16xf32>, memref<2x16xf32>
      // Epilogue: out = max(scratch + addend, 0). The second input uses the map (d0, d2), so one addend value is shared across d1 and d3.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_4: f32, %out: f32):
        %47 = arith.addf %in, %in_4 : f32
        %48 = arith.maximumf %47, %cst : f32
        linalg.yield %48 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After LLVMCPUVirtualVectorLoweringPass (iree-llvmcpu-virtual-vector-lowering) //----- //
 // Dispatch entry point: for each 2x16 output tile it accumulates products of
 // values read from bindings 0 and 1, then adds a per-row value from binding 2
 // and clamps the result at zero before storing into binding 3.
 // NOTE(review): the "mmt4d + bias + relu" reading comes from the function name
 // and the fma / addf / maximumf ops below; the M/N/K roles of the dynamic
 // sizes are not spelled out in the IR itself.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // Shared constants: an all-zero 2x16 vector, scalar zero, the shift amount
  // used to combine i32 halves, and index 0 / 1.
  %cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 1x1x2x16 f32 scratch buffer (64-byte aligned) that holds the accumulator tile.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Load the ten i32 interface constants (ordinals 0..9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Combine each (even, odd) pair of i32 constants into one index value:
  // zero-extend both, shift the odd one left by 32, OR them, cast to index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach range assumptions to the five index values; the stated upper bound
  // 9007199254740991 equals 2^53 - 1.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0..4; they are the dynamic sizes
  // used by the buffer bindings and loops below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Binding 0 (read-only): ?x?x16x1 f32 with dynamic sizes {%39, %36}.
  // NOTE(review): presumably the packed LHS operand - confirm against the producer.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 1 (read-only): ?x?x16x1 f32 with dynamic sizes {%37, %40}.
  // NOTE(review): presumably the packed RHS operand - confirm against the producer.
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 2 (read-only): ?x16 f32 with dynamic size {%38}; it is the second
  // input of the linalg.generic at the end of the inner loop body.
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 3 (not flagged ReadOnly): ?x?x16x16 f32 with dynamic sizes
  // {%39, %40}; this is the buffer the result tiles are written to.
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer parallel loop over (%39, %37), mapped to workgroup y / x (see the
  // mapping attribute on the closing brace).
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // One 16x16 inner tile of the output buffer for this (%arg0, %arg1).
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Walk the 16 rows of that tile two at a time.
    scf.forall (%arg2) = (0) to (16) step (2) {
      // 2x16 slice of the output tile and the matching 1x2 slice of binding 2.
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Zero the scratch buffer, then read it back as the initial 2x16 accumulator.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %45 = vector.transfer_read %subview_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<2x16xf32>, vector<2x16xf32>
      // Accumulation loop over %arg3 in [0, %36), carrying the 2x16 accumulator.
      // NOTE(review): %36 is presumably the reduction (K) extent - confirm.
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<2x16xf32>) {
        // Rank-reducing views that drop the trailing unit dimension of bindings
        // 0 and 1 (?x?x16x1 -> ?x?x16); the sizes are re-queried with memref.dim.
        %dim = memref.dim %41, %c0 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
        %dim_5 = memref.dim %41, %c1 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
        %subview_6 = memref.subview %41[0, 0, 0, 0] [%dim, %dim_5, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %dim_7 = memref.dim %42, %c0 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
        %dim_8 = memref.dim %42, %c1 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
        %subview_9 = memref.subview %42[0, 0, 0, 0] [%dim_7, %dim_8, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        // Read 16 contiguous values from binding 1 at [%arg1, %arg3, 0].
        %47 = vector.transfer_read %subview_9[%arg1, %arg3, %c0], %cst_0 {in_bounds = [true]} : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
        // Shape round trip 16 -> 16x1 -> 1x16 -> 16; the result is a
        // vector<16xf32> again.
        %48 = vector.shape_cast %47 : vector<16xf32> to vector<16x1xf32>
        %49 = vector.transpose %48, [1, 0] : vector<16x1xf32> to vector<1x16xf32>
        %50 = vector.extract %49[0] : vector<16xf32> from vector<1x16xf32>
        // Row 0: broadcast the binding-0 scalar at [%arg0, %arg3, %arg2] and
        // fuse-multiply-add it with the 16-wide vector into accumulator row 0.
        %51 = memref.load %subview_6[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %52 = vector.broadcast %51 : f32 to vector<16xf32>
        %53 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
        %54 = vector.fma %52, %50, %53 : vector<16xf32>
        // The new 2x16 value is rebuilt from the zero constant: row 0 here,
        // row 1 below, so both rows are overwritten before the yield.
        %55 = vector.insert %54, %cst [0] : vector<16xf32> into vector<2x16xf32>
        // Row 1: same computation using the scalar at column %arg2 + 1.
        %56 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
        %57 = memref.load %subview_6[%arg0, %arg3, %56] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %58 = vector.broadcast %57 : f32 to vector<16xf32>
        %59 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
        %60 = vector.fma %58, %50, %59 : vector<16xf32>
        %61 = vector.insert %60, %55 [1] : vector<16xf32> into vector<2x16xf32>
        scf.yield %61 : vector<2x16xf32>
      }
      // Store the final accumulator back into the scratch buffer.
      %subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      vector.transfer_write %46, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<2x16xf32>, memref<2x16xf32>
      // Elementwise epilogue, not yet vectorized in this dump: the second input
      // is indexed by (d0, d2), i.e. one value per row of the tile, and the
      // result is written to the output slice.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_5: f32, %out: f32):
        // accumulator + per-row value, then max with 0.0.
        %47 = arith.addf %in, %in_5 : f32
        %48 = arith.maximumf %47, %cst_0 : f32
        linalg.yield %48 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 // Dispatch entry point: for each 2x16 output tile it accumulates products of
 // values read from bindings 0 and 1, then adds a per-row value from binding 2
 // and clamps the result at zero before storing into binding 3.
 // NOTE(review): the "mmt4d + bias + relu" reading comes from the function name
 // and the fma / addf / maximumf ops below; the M/N/K roles of the dynamic
 // sizes are not spelled out in the IR itself.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // Shared constants: an all-zero 2x16 vector, scalar zero, the shift amount
  // used to combine i32 halves, and index 0 / 1.
  %cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 1x1x2x16 f32 scratch buffer (64-byte aligned) that holds the accumulator tile.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Load the ten i32 interface constants (ordinals 0..9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Combine each (even, odd) pair of i32 constants into one index value:
  // zero-extend both, shift the odd one left by 32, OR them, cast to index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach range assumptions to the five index values; the stated upper bound
  // 9007199254740991 equals 2^53 - 1.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five values as workload ordinals 0..4; they are the dynamic sizes
  // used by the buffer bindings and loops below.
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Binding 0 (read-only): ?x?x16x1 f32 with dynamic sizes {%39, %36}.
  // NOTE(review): presumably the packed LHS operand - confirm against the producer.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 1 (read-only): ?x?x16x1 f32 with dynamic sizes {%37, %40}.
  // NOTE(review): presumably the packed RHS operand - confirm against the producer.
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 2 (read-only): ?x16 f32 with dynamic size {%38}; it is the second
  // input of the linalg.generic at the end of the inner loop body.
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  // Binding 3 (not flagged ReadOnly): ?x?x16x16 f32 with dynamic sizes
  // {%39, %40}; this is the buffer the result tiles are written to.
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer parallel loop over (%39, %37), mapped to workgroup y / x (see the
  // mapping attribute on the closing brace).
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // One 16x16 inner tile of the output buffer for this (%arg0, %arg1).
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Walk the 16 rows of that tile two at a time.
    scf.forall (%arg2) = (0) to (16) step (2) {
      // 2x16 slice of the output tile and the matching 1x2 slice of binding 2.
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Zero the scratch buffer, then read it back as the initial 2x16 accumulator.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %45 = vector.transfer_read %subview_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<2x16xf32>, vector<2x16xf32>
      // Accumulation loop over %arg3 in [0, %36), carrying the 2x16 accumulator.
      // NOTE(review): %36 is presumably the reduction (K) extent - confirm.
      %46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<2x16xf32>) {
        // Rank-reducing views that drop the trailing unit dimension of bindings
        // 0 and 1 (?x?x16x1 -> ?x?x16); the dynamic sizes are the workload
        // ordinal values the bindings were declared with.
        %subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        // Read 16 contiguous values from binding 1 at [%arg1, %arg3, 0]; this
        // vector feeds both fma ops below.
        %47 = vector.transfer_read %subview_6[%arg1, %arg3, %c0], %cst_0 {in_bounds = [true]} : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
        // Row 0: broadcast the binding-0 scalar at [%arg0, %arg3, %arg2] and
        // fuse-multiply-add it with the 16-wide vector into accumulator row 0.
        %48 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %49 = vector.broadcast %48 : f32 to vector<16xf32>
        %50 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
        %51 = vector.fma %49, %47, %50 : vector<16xf32>
        // The new 2x16 value is rebuilt from the zero constant: row 0 here,
        // row 1 below, so both rows are overwritten before the yield.
        %52 = vector.insert %51, %cst [0] : vector<16xf32> into vector<2x16xf32>
        // Row 1: same computation using the scalar at column %arg2 + 1.
        %53 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
        %54 = memref.load %subview_5[%arg0, %arg3, %53] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %55 = vector.broadcast %54 : f32 to vector<16xf32>
        %56 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
        %57 = vector.fma %55, %47, %56 : vector<16xf32>
        %58 = vector.insert %57, %52 [1] : vector<16xf32> into vector<2x16xf32>
        scf.yield %58 : vector<2x16xf32>
      }
      // Store the final accumulator back into the scratch buffer.
      %subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      vector.transfer_write %46, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<2x16xf32>, memref<2x16xf32>
      // Elementwise epilogue, not yet vectorized in this dump: the second input
      // is indexed by (d0, d2), i.e. one value per row of the tile, and the
      // result is written to the output slice.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_5: f32, %out: f32):
        // accumulator + per-row value, then max with 0.0.
        %47 = arith.addf %in, %in_5 : f32
        %48 = arith.maximumf %47, %cst_0 : f32
        linalg.yield %48 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After VectorTransferLoweringPass (iree-codegen-vector-transfer-lowering) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) in (%39, %37) {
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2) = (0) to (16) step (2) {
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
      %47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      %48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
      %49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
        %subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
        %53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %54 = vector.broadcast %53 : f32 to vector<16xf32>
        %55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
        %56 = vector.fma %54, %52, %55 : vector<16xf32>
        %57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
        %58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
        %59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %60 = vector.broadcast %59 : f32 to vector<16xf32>
        %61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
        %62 = vector.fma %60, %52, %61 : vector<16xf32>
        %63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
        scf.yield %63 : vector<2x16xf32>
      }
      %subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
      vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
      vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_5: f32, %out: f32):
        %52 = arith.addf %in, %in_5 : f32
        %53 = arith.maximumf %52, %cst_0 : f32
        linalg.yield %53 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After LLVMCPUVectorTransposeLoweringPass (iree-llvmcpu-vector-transpose-lowering) //----- //
 // Dispatch function for a fused mmt4d + bias-add + max-with-zero kernel (see
 // the FMA reduction loop and the trailing linalg.generic). It takes no SSA
 // arguments: sizes are read from ten i32 constants and data from the four
 // storage-buffer bindings of the pipeline layout.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // All-zero 2x16 vector used as the base value of the vector.insert chains.
  %cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
  // Scalar zero: fill value for the scratch tile and the clamp floor below.
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 1x1x2x16 f32 scratch tile (memref.alloca, 64-byte aligned) that holds the
  // accumulator before and after the reduction loop.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Load the ten 32-bit constants (ordinals 0-9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Rebuild five index values from adjacent constant pairs (lo, hi):
  //   value = zext(lo) | (zext(hi) << 32), then cast i64 -> index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach an unsigned range assumption [0, 9007199254740991] (2^53 - 1) to
  // each of the five reconstructed values.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag each assumed value with its workload ordinal (0-4).
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Buffer bindings, each at offset 0 and asserted to be 64-byte aligned:
  //   binding(0): ?x?x16x1  f32, read-only, dynamic dims {%39, %36}
  //   binding(1): ?x?x16x1  f32, read-only, dynamic dims {%37, %40}
  //   binding(2): ?x16      f32, read-only, dynamic dim  {%38}
  //   binding(3): ?x?x16x16 f32, writable (written via %subview_1), dims {%39, %40}
  // NOTE(review): binding(1) declares dim 1 as %40 but is indexed there by
  // %arg3 in [0, %36); binding(3) declares dim 1 as %40 but is indexed there by
  // %arg1 in [0, %37). Presumably these sizes agree at runtime - confirm
  // against the producer of this dispatch.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer parallel loop over (%arg0, %arg1) in (%39, %37); the mapping
  // attribute on the closing brace assigns these to workgroup y and x.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // 1x1x16x16 tile of binding(3) at [%arg0, %arg1].
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Inner parallel loop: walks dim 2 of the tile two rows at a time
    // (%arg2 = 0, 2, ..., 14).
    scf.forall (%arg2) = (0) to (16) step (2) {
      // Destination rows [%arg2, %arg2 + 2) of the tile.
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Matching 1x2 slice of binding(2) at [%arg0, %arg2] (the addend below).
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Zero the scratch tile, then read it back row by row into a 2x16
      // vector that seeds the loop-carried accumulator.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
      %47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      %48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
      // Reduction loop: %arg3 runs over [0, %36) with step 1; %arg4 carries
      // the 2x16 accumulator.
      %49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
        // Rank-reducing views of bindings 0 and 1 that drop the trailing unit
        // dim. Both use only values defined outside this loop.
        %subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        // 16-wide row of binding(1) at [%arg1, %arg3, 0]; shared by both FMAs.
        %52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
        // Row 0: acc[0] += broadcast(binding0[%arg0, %arg3, %arg2]) * row.
        %53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %54 = vector.broadcast %53 : f32 to vector<16xf32>
        %55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
        %56 = vector.fma %54, %52, %55 : vector<16xf32>
        // Rebuild the 2x16 value from the zero vector; row 1 is filled in by %63.
        %57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
        // Row 1: same update using the scalar at index %arg2 + 1.
        %58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
        %59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %60 = vector.broadcast %59 : f32 to vector<16xf32>
        %61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
        %62 = vector.fma %60, %52, %61 : vector<16xf32>
        %63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
        scf.yield %63 : vector<2x16xf32>
      }
      // Write both accumulated rows back into the scratch tile.
      %subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
      vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
      vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      // Elementwise epilogue: out = max(scratch + addend, 0). The addend is
      // indexed by (d0, d2) only, so it is broadcast along d1 and d3.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_5: f32, %out: f32):
        %52 = arith.addf %in, %in_5 : f32
        %53 = arith.maximumf %52, %cst_0 : f32
        linalg.yield %53 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 // Dispatch function for a fused mmt4d + bias-add + max-with-zero kernel (see
 // the FMA reduction loop and the trailing linalg.generic). It takes no SSA
 // arguments: sizes are read from ten i32 constants and data from the four
 // storage-buffer bindings of the pipeline layout.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // All-zero 2x16 vector used as the base value of the vector.insert chains.
  %cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
  // Scalar zero: fill value for the scratch tile and the clamp floor below.
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 1x1x2x16 f32 scratch tile (memref.alloca, 64-byte aligned) that holds the
  // accumulator before and after the reduction loop.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Load the ten 32-bit constants (ordinals 0-9).
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Rebuild five index values from adjacent constant pairs (lo, hi):
  //   value = zext(lo) | (zext(hi) << 32), then cast i64 -> index.
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach an unsigned range assumption [0, 9007199254740991] (2^53 - 1) to
  // each of the five reconstructed values.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag each assumed value with its workload ordinal (0-4).
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Buffer bindings, each at offset 0 and asserted to be 64-byte aligned:
  //   binding(0): ?x?x16x1  f32, read-only, dynamic dims {%39, %36}
  //   binding(1): ?x?x16x1  f32, read-only, dynamic dims {%37, %40}
  //   binding(2): ?x16      f32, read-only, dynamic dim  {%38}
  //   binding(3): ?x?x16x16 f32, writable (written via %subview_1), dims {%39, %40}
  // NOTE(review): binding(1) declares dim 1 as %40 but is indexed there by
  // %arg3 in [0, %36); binding(3) declares dim 1 as %40 but is indexed there by
  // %arg1 in [0, %37). Presumably these sizes agree at runtime - confirm
  // against the producer of this dispatch.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer parallel loop over (%arg0, %arg1) in (%39, %37); the mapping
  // attribute on the closing brace assigns these to workgroup y and x.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // 1x1x16x16 tile of binding(3) at [%arg0, %arg1].
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Inner parallel loop: walks dim 2 of the tile two rows at a time
    // (%arg2 = 0, 2, ..., 14).
    scf.forall (%arg2) = (0) to (16) step (2) {
      // Destination rows [%arg2, %arg2 + 2) of the tile.
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Matching 1x2 slice of binding(2) at [%arg0, %arg2] (the addend below).
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Zero the scratch tile, then read it back row by row into a 2x16
      // vector that seeds the loop-carried accumulator.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
      %47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      %48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
      // Reduction loop: %arg3 runs over [0, %36) with step 1; %arg4 carries
      // the 2x16 accumulator.
      %49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
        // Rank-reducing views of bindings 0 and 1 that drop the trailing unit
        // dim. Both use only values defined outside this loop.
        %subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        // 16-wide row of binding(1) at [%arg1, %arg3, 0]; shared by both FMAs.
        %52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
        // Row 0: acc[0] += broadcast(binding0[%arg0, %arg3, %arg2]) * row.
        %53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %54 = vector.broadcast %53 : f32 to vector<16xf32>
        %55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
        %56 = vector.fma %54, %52, %55 : vector<16xf32>
        // Rebuild the 2x16 value from the zero vector; row 1 is filled in by %63.
        %57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
        // Row 1: same update using the scalar at index %arg2 + 1.
        %58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
        %59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %60 = vector.broadcast %59 : f32 to vector<16xf32>
        %61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
        %62 = vector.fma %60, %52, %61 : vector<16xf32>
        %63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
        scf.yield %63 : vector<2x16xf32>
      }
      // Write both accumulated rows back into the scratch tile.
      %subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
      vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
      vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      // Elementwise epilogue: out = max(scratch + addend, 0). The addend is
      // indexed by (d0, d2) only, so it is broadcast along d1 and d3.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_5: f32, %out: f32):
        %52 = arith.addf %in, %in_5 : f32
        %53 = arith.maximumf %52, %cst_0 : f32
        linalg.yield %53 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After LLVMCPUVectorShapeCastLoweringPass (iree-llvmcpu-vector-shape-cast-lowering) //----- //
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) in (%39, %37) {
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2) = (0) to (16) step (2) {
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
      %47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      %48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
      %49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
        %subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
        %53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %54 = vector.broadcast %53 : f32 to vector<16xf32>
        %55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
        %56 = vector.fma %54, %52, %55 : vector<16xf32>
        %57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
        %58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
        %59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %60 = vector.broadcast %59 : f32 to vector<16xf32>
        %61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
        %62 = vector.fma %60, %52, %61 : vector<16xf32>
        %63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
        scf.yield %63 : vector<2x16xf32>
      }
      %subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
      vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
      vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_5: f32, %out: f32):
        %52 = arith.addf %in, %in_5 : f32
        %53 = arith.maximumf %52, %cst_0 : f32
        linalg.yield %53 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- //
 // Dispatch function: accumulates a 2x16 tile with vector FMAs over a dynamic
 // reduction loop, then adds a bias value and clamps at zero (max(x, 0.0))
 // before writing into the output buffer. Overall shape of the function:
 //   1. load ten i32 constants and combine them pairwise into five index sizes,
 //   2. bind four storage buffers (three read-only, one without ReadOnly),
 //   3. iterate output tiles with two nested scf.forall loops,
 //   4. per 2x16 slice: zero a scratch tile, run the FMA loop, apply bias + clamp.
 func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  // %cst: all-zero 2x16 vector used only as the base operand of vector.insert.
  // %cst_0: scalar 0.0 used both as the fill value and as the clamp bound.
  %cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c32_i64 = arith.constant 32 : i64
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // 64-byte-aligned scratch buffer (memref.alloca) holding one 1x1x2x16 f32 tile.
  // It is defined once here and reused by every loop iteration below.
  %alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
  // Load the ten i32 constants (ordinals 0..9) declared by the pipeline layout.
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  // Rebuild five 64-bit values from consecutive constant pairs: the even ordinal
  // is the low word, the odd ordinal is shifted left by 32 and OR-ed in, and the
  // result is cast (unsigned) to index. Pairs: (0,1) (2,3) (4,5) (6,7) (8,9).
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  // Attach an unsigned range assumption of [0, 9007199254740991] (2^53 - 1) to
  // each of the five decoded sizes.
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  // Tag the five sizes as workload ordinals 0..4. Uses below:
  //   %36 - reduction loop bound and dim 1 of binding 0
  //   %37 - second scf.forall bound and dim 0 of binding 1
  //   %38 - dim 0 of binding 2
  //   %39 - first scf.forall bound and dim 0 of bindings 0 and 3
  //   %40 - dim 1 of bindings 1 and 3
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  // Buffer bindings, all at offset %c0 with 64-byte alignment asserted right after:
  //   %41 binding(0): ?x?x16x1 f32 {%39, %36}, ReadOnly - source of the scalar FMA operand
  //   %42 binding(1): ?x?x16x1 f32 {%37, %40}, ReadOnly - source of the 16-wide FMA operand
  //   %43 binding(2): ?x16 f32 {%38}, ReadOnly          - bias values
  //   %44 binding(3): ?x?x16x16 f32 {%39, %40}          - destination of the final result
  // NOTE(review): %42 is declared with dim 1 = %40 but is indexed on that dim by
  // %arg3 (bounded by %36), and %44 is declared with dim 1 = %40 but indexed by
  // %arg1 (bounded by %37). Presumably these sizes agree at runtime - confirm
  // against the dispatch's caller.
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
  memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
  memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
  memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
  memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
  // Outer loop over (%arg0 in [0, %39), %arg1 in [0, %37)); the trailing mapping
  // attribute assigns %arg0 to workgroup y and %arg1 to workgroup x.
  scf.forall (%arg0, %arg1) in (%39, %37) {
    // One 1x1x16x16 destination tile at [%arg0, %arg1].
    %subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Inner loop walks the 16 rows of the tile two at a time (%arg2 = 0, 2, ..., 14).
    scf.forall (%arg2) = (0) to (16) step (2) {
      // %subview_1: the 2x16 destination rows; %subview_2: the two matching bias
      // values taken from %43 at [%arg0, %arg2 .. %arg2 + 1].
      %subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      // Zero the scratch tile, then read its two rows back to form the initial
      // 2x16 accumulator value %48 passed into the reduction loop.
      linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
      %subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
      %47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      %48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
      // Reduction loop: %arg3 runs over [0, %36) in steps of 1 and carries the
      // 2x16 accumulator in %arg4.
      %49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
        // Rank-reducing views of bindings 0 and 1 that drop the trailing unit
        // dimension. Their operands are all defined outside this loop.
        %subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        // %52: 16 contiguous values from binding 1 at [%arg1, %arg3, 0], shared by both rows.
        %52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
        // Row 0: scalar from binding 0 at [%arg0, %arg3, %arg2], broadcast to 16
        // lanes, then acc[0] = scalar * %52 + acc[0].
        %53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %54 = vector.broadcast %53 : f32 to vector<16xf32>
        %55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
        %56 = vector.fma %54, %52, %55 : vector<16xf32>
        %57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
        // Row 1: same computation using the scalar at index %arg2 + 1.
        %58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
        %59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
        %60 = vector.broadcast %59 : f32 to vector<16xf32>
        %61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
        %62 = vector.fma %60, %52, %61 : vector<16xf32>
        %63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
        scf.yield %63 : vector<2x16xf32>
      }
      // Write the two accumulated rows back into the scratch tile so the
      // linalg.generic below can consume them through %alloca.
      %subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
      %50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
      vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
      %51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
      vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
      // Elementwise epilogue over the 1x1x2x16 tile: out = maximumf(acc + bias, 0.0).
      // The bias operand uses the map (d0, d1, d2, d3) -> (d0, d2), so a single
      // bias value is reused across d1 and across all 16 elements of d3.
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
      ^bb0(%in: f32, %in_5: f32, %out: f32):
        %52 = arith.addf %in, %in_5 : f32
        %53 = arith.maximumf %52, %cst_0 : f32
        linalg.yield %53 : f32
      }
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }
No results found