Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created May 22, 2025 20:45
Show Gist options
  • Save pashu123/45e933052f0d767d822790a5c86c508a to your computer and use it in GitHub Desktop.
Save pashu123/45e933052f0d767d822790a5c86c508a to your computer and use it in GitHub Desktop.
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>
#config1 = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>
#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#pipeline_layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#translation = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>
module {
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64, translation_info = #translation} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%48 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%50 = linalg.mmt4d {lowering_config = #config1} ins(%45, %46 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%49 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%51 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %47 : tensor<?x?x16x16xf32>, tensor<?x16xf32>) outs(%48 : tensor<?x?x16x16xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %in_0: f32, %out: f32):
%52 = arith.addf %in, %in_0 : f32
%53 = arith.maximumf %52, %cst : f32
linalg.yield %53 : f32
} -> tensor<?x?x16x16xf32>
iree_tensor_ext.dispatch.tensor.store %51, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%50 = tensor.empty() : tensor<1x1x16x16xf32>
%51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%54 = arith.addf %in, %in_3 : f32
%55 = arith.maximumf %54, %cst : f32
linalg.yield %55 : f32
} -> tensor<1x1x16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%50 = tensor.empty() : tensor<1x1x16x16xf32>
%51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%54 = arith.addf %in, %in_3 : f32
%55 = arith.maximumf %54, %cst : f32
linalg.yield %55 : f32
} -> tensor<1x1x16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%50 = tensor.empty() : tensor<1x1x16x16xf32>
%51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%54 = arith.addf %in, %in_3 : f32
%55 = arith.maximumf %54, %cst : f32
linalg.yield %55 : f32
} -> tensor<1x1x16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%50 = tensor.empty() : tensor<1x1x16x16xf32>
%51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%54 = arith.addf %in, %in_3 : f32
%55 = arith.maximumf %54, %cst : f32
linalg.yield %55 : f32
} -> tensor<1x1x16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%50 = tensor.empty() : tensor<1x1x16x16xf32>
%51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%54 = arith.addf %in, %in_3 : f32
%55 = arith.maximumf %54, %cst : f32
linalg.yield %55 : f32
} -> tensor<1x1x16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After PropagateDispatchSizeBoundsPass (iree-codegen-propagate-dispatch-size-bounds) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%50 = tensor.empty() : tensor<1x1x16x16xf32>
%51 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%50 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%52 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%51 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%52, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%54 = arith.addf %in, %in_3 : f32
%55 = arith.maximumf %54, %cst : f32
linalg.yield %55 : f32
} -> tensor<1x1x16x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %53 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice_0) -> (tensor<1x1x16x16xf32>) {
%extracted_slice_1 = tensor.extract_slice %45[%arg0, 0, %arg3, 0] [1, %36, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x2x1xf32>
%extracted_slice_2 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_1, %extracted_slice_2 : tensor<1x?x2x1xf32>, tensor<1x?x16x1xf32>) outs(%52 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [1, 2] [1, 1] : tensor<1x16xf32> to tensor<1x2xf32>
%extracted_slice_4 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_3 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_4 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_5: f32, %out: f32):
%55 = arith.addf %in, %in_5 : f32
%56 = arith.maximumf %55, %cst : f32
linalg.yield %56 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After CPUPrepareUkernelsPass (iree-codegen-cpu-prepare-ukernels) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%extracted_slice_0 = tensor.extract_slice %45[%arg0, 0, %arg3, 0] [1, %36, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x2x1xf32>
%extracted_slice_1 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x?x2x1xf32>, tensor<1x?x16x1xf32>) outs(%52 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%extracted_slice_2 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_2 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_3 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_4: f32, %out: f32):
%55 = arith.addf %in, %in_4 : f32
%56 = arith.maximumf %55, %cst : f32
linalg.yield %56 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After CPULowerToUKernelsPass (iree-codegen-cpu-lower-to-ukernels) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%extracted_slice_0 = tensor.extract_slice %45[%arg0, 0, %arg3, 0] [1, %36, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x2x1xf32>
%extracted_slice_1 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x?x2x1xf32>, tensor<1x?x16x1xf32>) outs(%52 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%extracted_slice_2 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%extracted_slice_3 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_2 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_3 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_4: f32, %out: f32):
%55 = arith.addf %in, %in_4 : f32
%56 = arith.maximumf %55, %cst : f32
linalg.yield %56 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %52) -> (tensor<1x1x2x16xf32>) {
%extracted_slice_2 = tensor.extract_slice %45[%arg0, %arg5, %arg3, 0] [1, 1, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x2x1xf32>
%extracted_slice_3 = tensor.extract_slice %46[%arg1, %arg5, 0, 0] [1, 1, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x16x1xf32>
%55 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_2, %extracted_slice_3 : tensor<1x1x2x1xf32>, tensor<1x1x16x1xf32>) outs(%arg6 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
scf.yield %55 : tensor<1x1x2x16xf32>
}
%extracted_slice_0 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%extracted_slice_1 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_0 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_1 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%55 = arith.addf %in, %in_2 : f32
%56 = arith.maximumf %55, %cst : f32
linalg.yield %56 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %52) -> (tensor<1x1x2x16xf32>) {
%extracted_slice_2 = tensor.extract_slice %45[%arg0, %arg5, %arg3, 0] [1, 1, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x2x1xf32>
%extracted_slice_3 = tensor.extract_slice %46[%arg1, %arg5, 0, 0] [1, 1, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x16x1xf32>
%55 = vector.transfer_read %extracted_slice_2[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x1xf32>, vector<1x1x2x1xf32>
%56 = vector.transfer_read %extracted_slice_3[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x16x1xf32>, vector<1x1x16x1xf32>
%57 = vector.transfer_read %arg6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%58 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %56, %57 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
%59 = vector.transfer_write %58, %arg6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
scf.yield %59 : tensor<1x1x2x16xf32>
}
%extracted_slice_0 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%extracted_slice_1 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_0 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_1 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%55 = arith.addf %in, %in_2 : f32
%56 = arith.maximumf %55, %cst : f32
linalg.yield %56 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
%57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
%58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %59 : vector<1x1x2x16xf32>
}
%55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%57 = arith.addf %in, %in_2 : f32
%58 = arith.maximumf %57, %cst : f32
linalg.yield %58 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %56 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
%57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
%58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %59 : vector<1x1x2x16xf32>
}
%55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%57 = arith.addf %in, %in_2 : f32
%58 = arith.maximumf %57, %cst : f32
linalg.yield %58 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
%57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
%58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %59 : vector<1x1x2x16xf32>
}
%55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%57 = arith.addf %in, %in_2 : f32
%58 = arith.maximumf %57, %cst : f32
linalg.yield %58 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass (iree-llvmcpu-verify-vector-size-legality) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
%57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
%58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %59 : vector<1x1x2x16xf32>
}
%55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%57 = arith.addf %in, %in_2 : f32
%58 = arith.maximumf %57, %cst : f32
linalg.yield %58 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
%57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
%58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %59 : vector<1x1x2x16xf32>
}
%55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%57 = arith.addf %in, %in_2 : f32
%58 = arith.maximumf %57, %cst : f32
linalg.yield %58 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = tensor.empty() : tensor<1x1x2x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
%57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
%58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %59 : vector<1x1x2x16xf32>
}
%55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%57 = arith.addf %in, %in_2 : f32
%58 = arith.maximumf %57, %cst : f32
linalg.yield %58 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = iree_tensor_ext.dispatch.tensor.load %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40} -> tensor<?x?x16x16xf32>
%49 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
%50 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%51 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%52 = tensor.empty() : tensor<1x1x2x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%53 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%52 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%54 = vector.transfer_read %53[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%55 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %54) -> (vector<1x1x2x16xf32>) {
%58 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
%59 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
%60 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %60 : vector<1x1x2x16xf32>
}
%56 = vector.transfer_write %55, %53[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
%57 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%56, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%58 = arith.addf %in, %in_2 : f32
%59 = arith.maximumf %58, %cst : f32
linalg.yield %59 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %57 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %51 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %50, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After EmptyTensorToAllocTensorPass (empty-tensor-to-alloc-tensor) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
%45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
%46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
%47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
%48 = iree_tensor_ext.dispatch.tensor.load %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40} -> tensor<?x?x16x16xf32>
%49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
%extracted_slice = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
%51 = bufferization.alloc_tensor() : tensor<1x1x2x16xf32>
%extracted_slice_0 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
%extracted_slice_1 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
%52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
%53 = vector.transfer_read %52[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x2x16xf32>, vector<1x1x2x16xf32>
%54 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %53) -> (vector<1x1x2x16xf32>) {
%57 = vector.transfer_read %45[%arg0, %arg5, %arg3, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x2x1xf32>
%58 = vector.transfer_read %46[%arg1, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<?x?x16x1xf32>, vector<1x1x16x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %58, %arg6 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %59 : vector<1x1x2x16xf32>
}
%55 = vector.transfer_write %54, %52[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, tensor<1x1x2x16xf32>
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %extracted_slice_1 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_0 : tensor<1x1x2x16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%57 = arith.addf %in, %in_2 : f32
%58 = arith.maximumf %57, %cst : f32
linalg.yield %58 : f32
} -> tensor<1x1x2x16xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %56 into %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
}
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %50 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
%45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
%47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
%48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %49 : vector<1x1x2x16xf32>
}
vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_4: f32, %out: f32):
%47 = arith.addf %in, %in_4 : f32
%48 = arith.maximumf %47, %cst : f32
linalg.yield %48 : f32
}
%subview_3 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
}
%subview_0 = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>) outs(%44 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
return
}
// -----// IR Dump After ResolveShapedTypeResultDimsPass (resolve-shaped-type-result-dims) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
%45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
%47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
%48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %49 : vector<1x1x2x16xf32>
}
vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_4: f32, %out: f32):
%47 = arith.addf %in, %in_4 : f32
%48 = arith.maximumf %47, %cst : f32
linalg.yield %48 : f32
}
%subview_3 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
}
%subview_0 = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>) outs(%44 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
%45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
%47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
%48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %49 : vector<1x1x2x16xf32>
}
vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_4: f32, %out: f32):
%47 = arith.addf %in, %in_4 : f32
%48 = arith.maximumf %47, %cst : f32
linalg.yield %48 : f32
}
%subview_3 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
}
%subview_0 = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
%45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
%47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
%48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %49 : vector<1x1x2x16xf32>
}
vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%47 = arith.addf %in, %in_2 : f32
%48 = arith.maximumf %47, %cst : f32
linalg.yield %48 : f32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
%45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
%47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
%48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %49 : vector<1x1x2x16xf32>
}
vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%47 = arith.addf %in, %in_2 : f32
%48 = arith.maximumf %47, %cst : f32
linalg.yield %48 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
%45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<1x1x2x16xf32>, vector<1x1x2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<1x1x2x16xf32>) {
%47 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x2x1xf32>
%48 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<1x1x16x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %arg4 : vector<1x1x2x1xf32>, vector<1x1x16x1xf32> into vector<1x1x2x16xf32>
scf.yield %49 : vector<1x1x2x16xf32>
}
vector.transfer_write %46, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x16xf32>, memref<1x1x2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%47 = arith.addf %in, %in_2 : f32
%48 = arith.maximumf %47, %cst : f32
linalg.yield %48 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LLVMCPUMmt4dVectorLoweringPass (iree-llvmcpu-mmt4d-vector-lowering) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
%45 = vector.transfer_read %alloca[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x2x16xf32>, vector<2x16xf32>
%46 = vector.broadcast %45 : vector<2x16xf32> to vector<1x1x2x16xf32>
%47 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %46) -> (vector<1x1x2x16xf32>) {
%49 = vector.transfer_read %41[%arg0, %arg3, %arg2, %c0], %cst {in_bounds = [true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<2x1xf32>
%50 = vector.transfer_read %42[%arg1, %arg3, %c0, %c0], %cst {in_bounds = [true, true]} : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>, vector<16x1xf32>
%51 = vector.extract %arg4[0, 0] : vector<2x16xf32> from vector<1x1x2x16xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %50, %51 : vector<2x1xf32>, vector<16x1xf32> into vector<2x16xf32>
%53 = vector.broadcast %52 : vector<2x16xf32> to vector<1x2x16xf32>
%54 = vector.broadcast %53 : vector<1x2x16xf32> to vector<1x1x2x16xf32>
scf.yield %54 : vector<1x1x2x16xf32>
}
%48 = vector.extract %47[0, 0] : vector<2x16xf32> from vector<1x1x2x16xf32>
vector.transfer_write %48, %alloca[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<2x16xf32>, memref<1x1x2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%49 = arith.addf %in, %in_2 : f32
%50 = arith.maximumf %49, %cst : f32
linalg.yield %50 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After DropVectorUnitDimsPass (iree-codegen-drop-vector-unit-dims) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_0 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%alloca : memref<1x1x2x16xf32>)
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%45 = vector.transfer_read %subview_2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16xf32>, vector<2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<2x16xf32>) {
%dim = memref.dim %41, %c0 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%dim_4 = memref.dim %41, %c1 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %41[0, 0, 0, 0] [%dim, %dim_4, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%47 = vector.transfer_read %subview_5[%arg0, %arg3, %arg2], %cst {in_bounds = [true]} : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<2xf32>
%48 = vector.shape_cast %47 : vector<2xf32> to vector<2x1xf32>
%dim_6 = memref.dim %42, %c0 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%dim_7 = memref.dim %42, %c1 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %42[0, 0, 0, 0] [%dim_6, %dim_7, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%49 = vector.transfer_read %subview_8[%arg1, %arg3, %c0], %cst {in_bounds = [true]} : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
%50 = vector.shape_cast %49 : vector<16xf32> to vector<16x1xf32>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %50, %arg4 : vector<2x1xf32>, vector<16x1xf32> into vector<2x16xf32>
scf.yield %51 : vector<2x16xf32>
}
%subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
vector.transfer_write %46, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<2x16xf32>, memref<2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_1 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_0 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_4: f32, %out: f32):
%47 = arith.addf %in, %in_4 : f32
%48 = arith.maximumf %47, %cst : f32
linalg.yield %48 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LLVMCPUVirtualVectorLoweringPass (iree-llvmcpu-virtual-vector-lowering) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
%subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%45 = vector.transfer_read %subview_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<2x16xf32>, vector<2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<2x16xf32>) {
%dim = memref.dim %41, %c0 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%dim_5 = memref.dim %41, %c1 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %41[0, 0, 0, 0] [%dim, %dim_5, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%dim_7 = memref.dim %42, %c0 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%dim_8 = memref.dim %42, %c1 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %42[0, 0, 0, 0] [%dim_7, %dim_8, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%47 = vector.transfer_read %subview_9[%arg1, %arg3, %c0], %cst_0 {in_bounds = [true]} : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
%48 = vector.shape_cast %47 : vector<16xf32> to vector<16x1xf32>
%49 = vector.transpose %48, [1, 0] : vector<16x1xf32> to vector<1x16xf32>
%50 = vector.extract %49[0] : vector<16xf32> from vector<1x16xf32>
%51 = memref.load %subview_6[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%52 = vector.broadcast %51 : f32 to vector<16xf32>
%53 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
%54 = vector.fma %52, %50, %53 : vector<16xf32>
%55 = vector.insert %54, %cst [0] : vector<16xf32> into vector<2x16xf32>
%56 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
%57 = memref.load %subview_6[%arg0, %arg3, %56] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%58 = vector.broadcast %57 : f32 to vector<16xf32>
%59 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
%60 = vector.fma %58, %50, %59 : vector<16xf32>
%61 = vector.insert %60, %55 [1] : vector<16xf32> into vector<2x16xf32>
scf.yield %61 : vector<2x16xf32>
}
%subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
vector.transfer_write %46, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<2x16xf32>, memref<2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_5: f32, %out: f32):
%47 = arith.addf %in, %in_5 : f32
%48 = arith.maximumf %47, %cst_0 : f32
linalg.yield %48 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
%subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%45 = vector.transfer_read %subview_3[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<2x16xf32>, vector<2x16xf32>
%46 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %45) -> (vector<2x16xf32>) {
%subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%47 = vector.transfer_read %subview_6[%arg1, %arg3, %c0], %cst_0 {in_bounds = [true]} : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
%48 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%49 = vector.broadcast %48 : f32 to vector<16xf32>
%50 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
%51 = vector.fma %49, %47, %50 : vector<16xf32>
%52 = vector.insert %51, %cst [0] : vector<16xf32> into vector<2x16xf32>
%53 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
%54 = memref.load %subview_5[%arg0, %arg3, %53] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%55 = vector.broadcast %54 : f32 to vector<16xf32>
%56 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
%57 = vector.fma %55, %47, %56 : vector<16xf32>
%58 = vector.insert %57, %52 [1] : vector<16xf32> into vector<2x16xf32>
scf.yield %58 : vector<2x16xf32>
}
%subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
vector.transfer_write %46, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<2x16xf32>, memref<2x16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_5: f32, %out: f32):
%47 = arith.addf %in, %in_5 : f32
%48 = arith.maximumf %47, %cst_0 : f32
linalg.yield %48 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After VectorTransferLoweringPass (iree-codegen-vector-transfer-lowering) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
%subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
%47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
%48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
%49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
%subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
%53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%54 = vector.broadcast %53 : f32 to vector<16xf32>
%55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
%56 = vector.fma %54, %52, %55 : vector<16xf32>
%57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
%58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
%59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%60 = vector.broadcast %59 : f32 to vector<16xf32>
%61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
%62 = vector.fma %60, %52, %61 : vector<16xf32>
%63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
scf.yield %63 : vector<2x16xf32>
}
%subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_5: f32, %out: f32):
%52 = arith.addf %in, %in_5 : f32
%53 = arith.maximumf %52, %cst_0 : f32
linalg.yield %53 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LLVMCPUVectorTransposeLoweringPass (iree-llvmcpu-vector-transpose-lowering) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
%subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
%47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
%48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
%49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
%subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
%53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%54 = vector.broadcast %53 : f32 to vector<16xf32>
%55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
%56 = vector.fma %54, %52, %55 : vector<16xf32>
%57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
%58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
%59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%60 = vector.broadcast %59 : f32 to vector<16xf32>
%61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
%62 = vector.fma %60, %52, %61 : vector<16xf32>
%63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
scf.yield %63 : vector<2x16xf32>
}
%subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_5: f32, %out: f32):
%52 = arith.addf %in, %in_5 : f32
%53 = arith.maximumf %52, %cst_0 : f32
linalg.yield %53 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
%subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
%47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
%48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
%49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
%subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
%53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%54 = vector.broadcast %53 : f32 to vector<16xf32>
%55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
%56 = vector.fma %54, %52, %55 : vector<16xf32>
%57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
%58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
%59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%60 = vector.broadcast %59 : f32 to vector<16xf32>
%61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
%62 = vector.fma %60, %52, %61 : vector<16xf32>
%63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
scf.yield %63 : vector<2x16xf32>
}
%subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_5: f32, %out: f32):
%52 = arith.addf %in, %in_5 : f32
%53 = arith.maximumf %52, %cst_0 : f32
linalg.yield %53 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LLVMCPUVectorShapeCastLoweringPass (iree-llvmcpu-vector-shape-cast-lowering) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
%subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
%47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
%48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
%49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
%subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
%53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%54 = vector.broadcast %53 : f32 to vector<16xf32>
%55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
%56 = vector.fma %54, %52, %55 : vector<16xf32>
%57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
%58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
%59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%60 = vector.broadcast %59 : f32 to vector<16xf32>
%61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
%62 = vector.fma %60, %52, %61 : vector<16xf32>
%63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
scf.yield %63 : vector<2x16xf32>
}
%subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_5: f32, %out: f32):
%52 = arith.addf %in, %in_5 : f32
%53 = arith.maximumf %52, %cst_0 : f32
linalg.yield %53 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LLVMCPULowerExecutableTargetPass (iree-llvmcpu-lower-executable-target) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<2x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x2x16xf32>
%0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = arith.extui %0 : i32 to i64
%11 = arith.extui %1 : i32 to i64
%12 = arith.shli %11, %c32_i64 : i64
%13 = arith.ori %10, %12 : i64
%14 = arith.index_castui %13 : i64 to index
%15 = arith.extui %2 : i32 to i64
%16 = arith.extui %3 : i32 to i64
%17 = arith.shli %16, %c32_i64 : i64
%18 = arith.ori %15, %17 : i64
%19 = arith.index_castui %18 : i64 to index
%20 = arith.extui %4 : i32 to i64
%21 = arith.extui %5 : i32 to i64
%22 = arith.shli %21, %c32_i64 : i64
%23 = arith.ori %20, %22 : i64
%24 = arith.index_castui %23 : i64 to index
%25 = arith.extui %6 : i32 to i64
%26 = arith.extui %7 : i32 to i64
%27 = arith.shli %26, %c32_i64 : i64
%28 = arith.ori %25, %27 : i64
%29 = arith.index_castui %28 : i64 to index
%30 = arith.extui %8 : i32 to i64
%31 = arith.extui %9 : i32 to i64
%32 = arith.shli %31, %c32_i64 : i64
%33 = arith.ori %30, %32 : i64
%34 = arith.index_castui %33 : i64 to index
%35:5 = util.assume.int
%14<umin = 0, umax = 9007199254740991>,
%19<umin = 0, umax = 9007199254740991>,
%24<umin = 0, umax = 9007199254740991>,
%29<umin = 0, umax = 9007199254740991>,
%34<umin = 0, umax = 9007199254740991>
: index, index, index, index, index
%36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
%37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
%38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
%39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
%40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
%41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%39, %36}
memref.assume_alignment %41, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>{%37, %40}
memref.assume_alignment %42, 64 : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>>
%43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>{%38}
memref.assume_alignment %43, 64 : memref<?x16xf32, #hal.descriptor_type<storage_buffer>>
%44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>{%39, %40}
memref.assume_alignment %44, 64 : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) in (%39, %37) {
%subview = memref.subview %44[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : memref<?x?x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg2) = (0) to (16) step (2) {
%subview_1 = memref.subview %subview[0, 0, %arg2, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x16x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %43[%arg0, %arg2] [1, 2] [1, 1] : memref<?x16xf32, #hal.descriptor_type<storage_buffer>> to memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst_0 : f32) outs(%alloca : memref<1x1x2x16xf32>)
%subview_3 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%45 = vector.load %subview_3[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%46 = vector.insert %45, %cst [0] : vector<16xf32> into vector<2x16xf32>
%47 = vector.load %subview_3[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
%48 = vector.insert %47, %46 [1] : vector<16xf32> into vector<2x16xf32>
%49 = scf.for %arg3 = %c0 to %36 step %c1 iter_args(%arg4 = %48) -> (vector<2x16xf32>) {
%subview_5 = memref.subview %41[0, 0, 0, 0] [%39, %36, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %42[0, 0, 0, 0] [%37, %40, 16, 1] [1, 1, 1, 1] : memref<?x?x16x1xf32, #hal.descriptor_type<storage_buffer>> to memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%52 = vector.load %subview_6[%arg1, %arg3, %c0] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>, vector<16xf32>
%53 = memref.load %subview_5[%arg0, %arg3, %arg2] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%54 = vector.broadcast %53 : f32 to vector<16xf32>
%55 = vector.extract %arg4[0] : vector<16xf32> from vector<2x16xf32>
%56 = vector.fma %54, %52, %55 : vector<16xf32>
%57 = vector.insert %56, %cst [0] : vector<16xf32> into vector<2x16xf32>
%58 = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%arg2]
%59 = memref.load %subview_5[%arg0, %arg3, %58] : memref<?x?x16xf32, strided<[?, 16, 1]>, #hal.descriptor_type<storage_buffer>>
%60 = vector.broadcast %59 : f32 to vector<16xf32>
%61 = vector.extract %arg4[1] : vector<16xf32> from vector<2x16xf32>
%62 = vector.fma %60, %52, %61 : vector<16xf32>
%63 = vector.insert %62, %57 [1] : vector<16xf32> into vector<2x16xf32>
scf.yield %63 : vector<2x16xf32>
}
%subview_4 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 2, 16] [1, 1, 1, 1] : memref<1x1x2x16xf32> to memref<2x16xf32>
%50 = vector.extract %49[0] : vector<16xf32> from vector<2x16xf32>
vector.store %50, %subview_4[%c0, %c0] : memref<2x16xf32>, vector<16xf32>
%51 = vector.extract %49[1] : vector<16xf32> from vector<2x16xf32>
vector.store %51, %subview_4[%c1, %c0] : memref<2x16xf32>, vector<16xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloca, %subview_2 : memref<1x1x2x16xf32>, memref<1x2xf32, strided<[16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<1x1x2x16xf32, strided<[?, 256, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
^bb0(%in: f32, %in_5: f32, %out: f32):
%52 = arith.addf %in, %in_5 : f32
%53 = arith.maximumf %52, %cst_0 : f32
linalg.yield %53 : f32
}
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment